diff --git a/Docker_README.txt b/Docker_README.txt new file mode 100644 index 0000000..80009a7 --- /dev/null +++ b/Docker_README.txt @@ -0,0 +1,11 @@ +#build: +docker build -t gemli-r:v0 -f Dockerfile_v0 . + +#push to registry +docker login #user and pass from dockerhub here +docker tag gemli-r:v0 fenix07/gemli-r:v0 +docker push fenix07/gemli-r:v0 + +#pull with docker +docker pull fenix07/gemli-r:v0 +docker run -it --rm --privileged fenix07/gemli-r:v0 diff --git a/Dockerfile_v0 b/Dockerfile_v0 new file mode 100644 index 0000000..6c50010 --- /dev/null +++ b/Dockerfile_v0 @@ -0,0 +1,27 @@ +# Use the official R base image +FROM rocker/r-ver:4.3.3 + +# Install necessary system dependencies for R packages +RUN apt-get update && apt-get install -y \ + libcurl4-openssl-dev \ + libssl-dev \ + libxml2-dev \ + git \ + libnetcdf-dev \ + netcdf-bin \ + libpng-dev \ + libfontconfig1-dev \ + libfreetype6-dev \ + && rm -rf /var/lib/apt/lists/* + +# Install remotes package to enable installation from GitHub +RUN R -e "install.packages('remotes', repos='https://cloud.r-project.org/')" + +# Install the GEMLI package from the specified subdirectory on GitHub +RUN R -e "remotes::install_github('UPSUTER/GEMLI', subdir = 'GEMLI_package_v0')" + +# Verify the installation +RUN R -e "library(GEMLI)" + +# Set the command to R console +CMD ["R"] diff --git a/Dockerfile_v1 b/Dockerfile_v1 new file mode 100644 index 0000000..75f93b1 --- /dev/null +++ b/Dockerfile_v1 @@ -0,0 +1,34 @@ +# Use the official R base image +FROM rocker/r-ver:4.4.1 + +# Install necessary system dependencies for R packages and LaTeX in smaller batches +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + libcurl4-openssl-dev \ + libssl-dev \ + libxml2-dev \ + libnetcdf-dev \ + netcdf-bin \ + libpng-dev \ + libfontconfig1-dev \ + libfreetype6-dev \ + texlive-latex-base \ + texlive-fonts-recommended && \ + apt-get clean && rm -rf /var/lib/apt/lists/*# Install necessary system dependencies for R packages + +# Install remotes package to enable installation from GitHub +RUN R -e "install.packages('remotes', repos='https://cloud.r-project.org/')" +RUN R -e "install.packages('Seurat')" + +# Copy the local GEMLI package directory to the Docker image and install the GEMLI package from the local directory +COPY GEMLI_package_v1 /opt/GEMLI_package_v1 +RUN R -e "remotes::install_local('/opt/GEMLI_package_v1')" + +# Install the GEMLI package from the specified subdirectory on GitHub +#RUN R -e "remotes::install_github('UPSUTER/GEMLI', subdir = 'GEMLI_package_v1')" + +# Verify the installation +RUN R -e "library(GEMLI)" + +# Set the command to R console +CMD ["R"] diff --git a/GEMLI_package_v1/DESCRIPTION b/GEMLI_package_v1/DESCRIPTION new file mode 100644 index 0000000..20db58e --- /dev/null +++ b/GEMLI_package_v1/DESCRIPTION @@ -0,0 +1,27 @@ +Package: GEMLI +Type: Package +Title: Gene expression memory based lineage inference +Version: 0.2.0 +Author: Marcel Tarbier and Almut Eisele +Maintainer: The package maintainer +Description: Uses general characteristics of genes with lineage-specific expression + to predict cell lineages in scRNA-seq datasets. +URL: https://github.com/UPSUTER/GEMLI +Depends: R (>= 4.3.3) +Imports: + Matrix, + matrixStats, + fastcluster, + ggrepel, + methods, + igraph (>= 2.0.3), + reshape (>= 0.8.9), + dplyr (>= 1.1.2), + ggplot2 (>= 3.4.2), + UpSetR (>= 1.4.0), + tidyr (>= 1.3.0) +Suggests: + Seurat (>= 4.3.0) +License: GPL-3 +Encoding: UTF-8 +LazyData: true diff --git a/GEMLI_package_v1/GEMLI.Rproj b/GEMLI_package_v1/GEMLI.Rproj new file mode 100644 index 0000000..497f8bf --- /dev/null +++ b/GEMLI_package_v1/GEMLI.Rproj @@ -0,0 +1,20 @@ +Version: 1.0 + +RestoreWorkspace: Default +SaveWorkspace: Default +AlwaysSaveHistory: Default + +EnableCodeIndexing: Yes +UseSpacesForTab: Yes +NumSpacesForTab: 2 +Encoding: UTF-8 + +RnwWeave: Sweave +LaTeX: pdfLaTeX + +AutoAppendNewline: Yes +StripTrailingWhitespace: Yes + +BuildType: Package +PackageUseDevtools: Yes +PackageInstallArgs: --no-multiarch --with-keep.source diff --git a/GEMLI_package_v1/NAMESPACE b/GEMLI_package_v1/NAMESPACE new file mode 100644 index 0000000..f303707 --- /dev/null +++ b/GEMLI_package_v1/NAMESPACE @@ -0,0 +1,32 @@ +importFrom(igraph, "clusters", "plot.igraph", "edge_attr", "layout.fruchterman.reingold", "layout.kamada.kawai", "layout.grid") +importFrom(graphics, "axis", "layout", "legend", "mtext", "par", "title") +importFrom(methods, "hasArg") +importFrom(stats, "as.dist", "cutree", "lm", "loess", "loess.control", "quantile", "residuals", "sd") +importFrom(UpSetR, "upset", "fromList") +importFrom(ggplot2, "ggplot", "aes", "geom_line", "geom_point", "theme_classic", "theme", "theme_bw", "scale_colour_manual", "geom_vline", "geom_hline", "labs", "xlim", "element_blank", "element_text") +importFrom(ggrepel, "geom_text_repel") +importFrom(tidyr, "spread", "gather") +importFrom(reshape, "cast") +importFrom(Matrix, "Matrix", "t", "crossprod", "colMeans", "colSums", "Diagonal") +importFrom(fastcluster, hclust) +importFrom(matrixStats, rowMeans2, rowSums2) +importFrom("stats", "na.omit") +import(dplyr) + +export(cell_fate_DEG_calling) +export(cell_type_composition_plot) +export(cluster_stability_plot) +export(DEG_volcano_plot) +export(extract_cell_fate_lineages) +export(memory_gene_calling) +export(predict_lineages) +export(predict_lineages_multiple_sizes) +export(predict_lineages_with_known_markers) +export(prediction_to_lineage_information) +export(quantify_clusters_iterative) +export(suggest_network_trimming_to_size) +export(test_lineages) +export(trim_network_to_size) +export(visualize_as_network) +exportMethods(addBarcodes, addPrediction, addTestingResults) +export(GEMLI) diff --git a/GEMLI_package_v1/R/DEG_volcano_plot.R b/GEMLI_package_v1/R/DEG_volcano_plot.R new file mode 100644 index 0000000..301ea09 --- /dev/null +++ b/GEMLI_package_v1/R/DEG_volcano_plot.R @@ -0,0 +1,18 @@ +DEG_volcano_plot<-function(GEMLI_items, name1, name2){ +DEG<-GEMLI_items[['DEG']] +DEG$change = ifelse(DEG$p_val_adj <= 0.05 & abs(DEG$avg_log2FC) >= 0.5, ifelse(DEG$avg_log2FC> 0.5 ,name1,name2),'Stable') +DEG$label=rownames(DEG) +plt<-ggplot(data = DEG, aes(x = avg_log2FC , y = -log10(p_val_adj), colour=change, label=label)) + + ggplot2::geom_point(alpha=0.4, size=3.5)+ + ggplot2::xlim(c(-4.5, 4.5)) + + ggplot2::scale_color_manual(values=c("#5386BD", "darkred","grey"))+ + ggplot2::geom_vline(xintercept=c(-0.5,0.5),lty=4,col="black",lwd=0.8) + + ggplot2::geom_hline(yintercept = 1.301,lty=4,col="black",lwd=0.8)+ + ggplot2::labs(x="log2(fold change)",y="-log10 (p-value)", title=paste0("DEG"," ",name1," vs ",name2)) + + ggplot2::theme_bw()+ + ggplot2::theme(plot.title = element_text(hjust = 0.5), + legend.position="right", + legend.title = element_blank()) + + ggrepel::geom_text_repel(data = subset(DEG, avg_log2FC >= 0.5 | avg_log2FC < -0.5), aes(label = label), max.overlaps = 15) +suppressWarnings(print(plt)) +} diff --git a/GEMLI_package_v1/R/calculate_correlations.R b/GEMLI_package_v1/R/calculate_correlations.R new file mode 100644 index 0000000..7b29f84 --- /dev/null +++ b/GEMLI_package_v1/R/calculate_correlations.R @@ -0,0 +1,11 @@ +calculate_correlations <- function(data) +{ + data <- t(as.matrix(data)) + tmp = t(apply(data, 1, rank)) + tmp = as.matrix(tmp) + tmp = tmp - matrixStats::rowMeans2(tmp) + tmp = tmp / sqrt(matrixStats::rowSums2(tmp^2)) + r = tcrossprod(tmp) + diag(r) <- 0 + return(r) +} diff --git a/GEMLI_package_v1/R/cell_fate_DEG_calling.R b/GEMLI_package_v1/R/cell_fate_DEG_calling.R new file mode 100644 index 0000000..795b3d6 --- /dev/null +++ b/GEMLI_package_v1/R/cell_fate_DEG_calling.R @@ -0,0 +1,28 @@ +cell_fate_DEG_calling<-function(GEMLI_items, ident1, ident2, min.pct=0.05, logfc.threshold=0.1) +{ + if (!requireNamespace("Seurat", quietly = TRUE)) { + stop("Seurat is required for this function. Please install and load it before using cell_fate_DEG_calling") + } + if (class(GEMLI_items)=='list') { + gene_expression = Matrix::Matrix(GEMLI_items[['gene_expression']]) + } else if (class(GEMLI_items)=='GEMLI') { + gene_expression = Matrix::Matrix(GEMLI_items@gene_expression) + } else { + stop('Object GEMLI_items should be either of class list or GEMLI') + } + GEMLI_Seurat<-Seurat::CreateSeuratObject(gene_expression, project = "SeuratProject", assay = "RNA") + Metadata<-GEMLI_items[['cell_fate_analysis']] + Metadata$ident<-NA + Metadata$ident[Metadata$cell.fate %in% ident1]<-"ident1" + Metadata$ident[Metadata$cell.fate%in%ident2]<-"ident2" + Meta<-as.data.frame(Metadata[,c(5)]) + rownames(Meta)<-Metadata$cell.ID + colnames(Meta)<-c("cell.fate") + GEMLI_Seurat<-Seurat::AddMetaData(GEMLI_Seurat, Meta, col.name = NULL) + Seurat::DefaultAssay(object = GEMLI_Seurat) <- "RNA" + Seurat::Idents(GEMLI_Seurat) <- GEMLI_Seurat$cell.fate + GEMLI_Seurat <- Seurat::NormalizeData(GEMLI_Seurat, normalization.method = "LogNormalize", scale.factor = 10000) + DEG <- Seurat::FindMarkers(object = GEMLI_Seurat, ident.1 = "ident1", ident.2 = "ident2", min.pct = min.pct, logfc.threshold = logfc.threshold, assay='RNA') + GEMLI_items[['DEG']]<-DEG + return(GEMLI_items) +} diff --git a/GEMLI_package_v1/R/cell_type_composition_plot.R b/GEMLI_package_v1/R/cell_type_composition_plot.R new file mode 100644 index 0000000..ab8ccab --- /dev/null +++ b/GEMLI_package_v1/R/cell_type_composition_plot.R @@ -0,0 +1,49 @@ +cell_type_composition_plot <- function(GEMLI_items, ground_truth=F, cell_type_colors=F, type, intersections=NULL) +{ + + if (class(GEMLI_items)=='list') { + cell_type = GEMLI_items[['cell_type']] + barcodes = GEMLI_items[['barcodes']] + predicted_lineage_table = GEMLI_items[['predicted_lineage_table']] + } else if (class(GEMLI_items)=='GEMLI') { + cell_type = GEMLI_items@cell_type + barcodes = GEMLI_items@barcodes + predicted_lineage_table = GEMLI_items[['predicted_lineage_table']] + } else { + stop('Object GEMLI_items should be either of class list or GEMLI') + } + base_colors = rep(c('#a50026','#d73027','#f46d43','#fdae61','#fee090','#e0f3f8','#abd9e9','#74add1','#4575b4','#313695','#40004b','#762a83','#9970ab','#c2a5cf','#e7d4e8','#d9f0d3','#a6dba0','#5aae61','#1b7837','#00441b','#543005','#8c510a','#bf812d','#dfc27d','#f6e8c3','#e0e0e0','#bababa','#878787','#4d4d4d','#1a1a1a'), 100) + if (cell_type_colors==F){ + cell.type<-unique(cell_type$cell.type) + color<-base_colors[rank(unique(cell_type$cell.type))] + GEMLI_items[['cell_type_color']] = data.frame(cell.type, color) + } + if (class(GEMLI_items)=='list') { + cell_type = GEMLI_items[['cell_type']] + } else if (class(GEMLI_items)=='GEMLI') { + cell_type = GEMLI_items@cell_type + } + + if (ground_truth){cell.ID<-names(barcodes); clone.ID<-unname(barcodes); GT<-as.data.frame(cbind(clone.ID,cell.ID)); + Lookup<-merge(GT, cell_type, by="cell.ID", all=TRUE)} else { + Lookup<-merge(as.data.frame(predicted_lineage_table), cell_type, by="cell.ID", all=TRUE) + } + + if (type == "bubble"){ + Lookup <- Lookup %>% dplyr::group_by(clone.ID, cell.type) %>% dplyr::summarise(cnt = dplyr::n()) %>% dplyr::mutate(freq = round(cnt / sum(cnt), 3)); Lookup <- reshape::cast(Lookup, clone.ID~cell.type, value="freq"); + base_colors = GEMLI_items[['cell_type_color']]$color[match(colnames(Lookup[,2:length(Lookup)]),GEMLI_items[['cell_type_color']]$cell.type)] + p<-Lookup %>% gather(cell.type, percentage, -clone.ID)%>% ggplot(group=cell.type) + ggplot2::geom_point(aes(x = cell.type, y = clone.ID, size = percentage, col= cell.type))+ ggplot2::theme_classic()+ scale_colour_manual(values = base_colors)} + + if (type == "upsetR"){ + Lookup_list <- split(Lookup$clone.ID, Lookup$cell.type) + p<-UpSetR::upset(fromList(Lookup_list), order.by = "freq", nsets = length(Lookup_list), + sets.x.label = "Lineages in cell type", mainbar.y.label = "Number of lineages", + nintersects = NA, intersections = NULL, point.size=5, mb.ratio = c(0.5, 0.5), text.scale = 2, + set_size.show = TRUE, set_size.numbers_size = 7, set_size.scale_max = length(unique(Lookup$clone.ID)))} + + if (type == "plain"){ + Lookup<-unique(Lookup[,-c(1)]) + p<-Lookup %>% dplyr::group_by(clone.ID) %>% dplyr::arrange(clone.ID, cell.type) %>% dplyr::summarize(combi = paste0(cell.type, collapse = "__"), .groups = "drop") %>% dplyr::count(combi)} + + return(p) +} diff --git a/GEMLI_package_v1/R/classes.R b/GEMLI_package_v1/R/classes.R new file mode 100644 index 0000000..4195fd7 --- /dev/null +++ b/GEMLI_package_v1/R/classes.R @@ -0,0 +1,100 @@ +setClass( + "GEMLI", + slots = list( + gene_expression = "dgCMatrix", # Assuming gene_expression is a matrix + barcodes = "character", # Optional, vector of barcodes + prediction = "matrix", # Optional, data frame for prediction results + testing_results = "matrix" # Optional, data frame for testing results + ), + prototype = list( # Default values for optional slots + barcodes = character(0), + prediction = matrix(), + testing_results = matrix() + ) +) + +GEMLI <- function(gene_expression, barcodes=character(0), prediction=NULL, testing_results=NULL) { + if (!is.matrix(gene_expression) & class(gene_expression)[1]!='Seurat' & class(gene_expression)[1]!='dgCMatrix' & class(gene_expression)[1]!='dgeMatrix') { + stop("gene_expression must be a matrix, a Seurat object or a dgCMatrix") + } + if (!is.null(barcodes) && !is.character(barcodes)) { + stop("barcodes must be a character vector") + } + if (!is.null(prediction) && !is.matrix(prediction)) { + stop("prediction must be a data frame") + } + if (!is.null(testing_results) && !is.matrix(testing_results)) { + stop("testing_results must be a data frame") + } + + # Provide default empty data frames if NULL + prediction <- if (is.null(prediction)) matrix() else prediction + testing_results <- if (is.null(testing_results)) matrix() else testing_results + + if (class(gene_expression)[1]=='Seurat') { + if (!requireNamespace("Seurat", quietly = TRUE)) { + stop("Seurat is required for this function. Please install and load it before using cell_fate_DEG_calling") + } else { + gene_expression <- Matrix::Matrix(GetAssayData(object=gene_expression, layer="counts")) + } + } + if (class(gene_expression)[1]=='dgeMatrix' | is.matrix(gene_expression)) { + gene_expression <- Matrix::Matrix(gene_expression, sparse=T) + } + + new("GEMLI", gene_expression = gene_expression, barcodes = barcodes, prediction = prediction, + testing_results = testing_results) +} + +# Define the custom show method for the GEMLI class +setMethod("show", "GEMLI", function(object) { + cat(paste0("GEMLI object with ", nrow(object@gene_expression), " genes and ", ncol(object@gene_expression), " cells\n\n")) + + if(length(object@barcodes) > 1) { + cat("Barcodes: Provided\n") + } else { + cat("Barcodes: Not provided\n") + } + + if(nrow(object@prediction) > 1) { + cat("Prediction: Available\n") + } else { + cat("Prediction: Not available\n") + } + + if(nrow(object@testing_results) > 1) { + cat("Testing Results: Available\n") + } else { + cat("Testing Results: Not available\n") + } +}) + +# Method to add or update barcodes +setGeneric("addBarcodes", function(object, barcodes) standardGeneric("addBarcodes")) +setMethod("addBarcodes", "GEMLI", function(object, barcodes) { + if (!is.character(barcodes)) { + stop("barcodes must be a character vector") + } + validObject(object@barcodes <- barcodes) + return(object) +}) + +# Method to add or update prediction +setGeneric("addPrediction", function(object, prediction) standardGeneric("addPrediction")) +setMethod("addPrediction", "GEMLI", function(object, prediction) { + if (!is.matrix(prediction)) { + stop("prediction must be a matrix") + } + validObject(object@prediction <- prediction) + return(object) +}) + +# Method to add or update testing results +setGeneric("addTestingResults", function(object, testing_results) standardGeneric("addTestingResults")) +setMethod("addTestingResults", "GEMLI", function(object, testing_results) { + if (!is.matrix(testing_results)) { + stop("testing results must be a matrix") + } + validObject(object@testing_results <- testing_results) + return(object) +}) diff --git a/GEMLI_package_v1/R/cluster_stability_plot.R b/GEMLI_package_v1/R/cluster_stability_plot.R new file mode 100644 index 0000000..227747b --- /dev/null +++ b/GEMLI_package_v1/R/cluster_stability_plot.R @@ -0,0 +1,7 @@ +cluster_stability_plot <- function(GEMLI_items) # check +{ + data_matrix<-GEMLI_items[['prediction_multiple_sizes']] + clustree<-clustree(data_matrix, prefix = "K") + clustree<-clustree[["data"]][which(clustree[["data"]]$size != 1),] + plot(clustree$K, clustree$sc3_stability, xlab="lineage size", ylab="clustree_stability_index") +} diff --git a/GEMLI_package_v1/R/extract_cell_fate_lineages.R b/GEMLI_package_v1/R/extract_cell_fate_lineages.R new file mode 100644 index 0000000..5f2738e --- /dev/null +++ b/GEMLI_package_v1/R/extract_cell_fate_lineages.R @@ -0,0 +1,19 @@ +extract_cell_fate_lineages<- function(GEMLI, selection, unique=FALSE, threshold) +{ + Lookup<-merge(as.data.frame(GEMLI[['predicted_lineage_table']]), GEMLI[['cell_type']], by="cell.ID", all=TRUE) + if (unique){ + Lookup$cell.fate<-NA + Lookup<-Lookup %>% dplyr::group_by(clone.ID) %>% dplyr::mutate(cell.fate=dplyr::case_when((n_distinct(cell.type)==length(selection)& all(cell.type %in% selection)& is.na(clone.ID)==FALSE)~ "asym", (n_distinct(cell.type)==1& all(cell.type %in% selection) & n_distinct(cell.ID)>1& is.na(clone.ID)==FALSE)~"sym")) + } else { + Lookup2<-Lookup[Lookup$cell.type %in% selection, ] + Lookup2<-Lookup2 %>% dplyr::group_by(clone.ID) %>% dplyr::mutate(cell.fate=dplyr::case_when((n_distinct(cell.type)==length(selection)& all(cell.type %in% selection)& is.na(clone.ID)==FALSE)~ "asym", (n_distinct(cell.type)==1& all(cell.type %in% selection) & n_distinct(cell.ID)>1& is.na(clone.ID)==FALSE)~"sym")) + Lookup2<-Lookup2[,c(1,4)]; Lookup<-merge(Lookup, Lookup2, by=c("cell.ID" ), all=TRUE) + } + # filter by threshold + Lookup <- Lookup %>% dplyr::group_by(cell.fate, clone.ID) %>% dplyr::mutate(cnt = dplyr::n()); Lookup<-Lookup%>% dplyr::group_by(cell.fate, clone.ID, cell.type) %>% dplyr::mutate(per= (n()/cnt)*100) + for(i in 1:length(selection)){Lookup <- Lookup %>% dplyr::group_by(cell.fate, clone.ID) %>% dplyr::mutate(cell.fate=dplyr::case_when(((cell.fate=="asym") & (cell.type==selection[i] ) & (per < threshold[i]))~"filtered", TRUE~cell.fate))} + Lookup<-Lookup %>% dplyr::group_by(clone.ID) %>% dplyr::mutate(cell.fate=dplyr::case_when(any(cell.fate=="filtered")~NA, TRUE~cell.fate)) + Lookup$cell.fate <- paste(Lookup$cell.fate,Lookup$cell.type,sep = "_") + GEMLI[['cell_fate_analysis']]<-Lookup[,1:4] + return(GEMLI) +} diff --git a/GEMLI_package_v1/R/memory_gene_calling.R b/GEMLI_package_v1/R/memory_gene_calling.R new file mode 100644 index 0000000..71666c1 --- /dev/null +++ b/GEMLI_package_v1/R/memory_gene_calling.R @@ -0,0 +1,65 @@ +memory_gene_calling <- function(GEMLI_items, valid_lineage_sizes=2:5, use_median=T, ground_truth=F, cell_fate) +{ + if (class(GEMLI_items)=='list') { + data_matrix = Matrix::Matrix(GEMLI_items[['gene_expression']]) + barcodes = GEMLI_items[['barcodes']] + predicted_lineages = GEMLI_items[['predicted_lineages']] + predicted_lineage_table = GEMLI_items[['predicted_lineage_table']] + cell_fate_analysis = GEMLI_items[['cell_fate_analysis']] + } else if (class(GEMLI_items)=='GEMLI') { + data_matrix = Matrix::Matrix(GEMLI_items@gene_expression) + barcodes = GEMLI_items@barcodes + predicted_lineages = GEMLI_items@predicted_lineages + predicted_lineage_table = GEMLI_items[['predicted_lineage_table']] + cell_fate_analysis = GEMLI_items[['cell_fate_analysis']] + } else { + stop('Object GEMLI_items should be either of class list or GEMLI') + } + markers_by_cvsq_of_lineage_means <- function(data_matrix, lineage_dict, valid_lineage_sizes=(2:5), use_median=T) + { + cv_sq <- function(data_matrix) + { + sd = apply(data_matrix, 1, sd, na.rm = TRUE) + mean = apply(data_matrix, 1, mean, na.rm = TRUE) + noise = (sd/mean)**2 + return(noise) + } + lineage_dict_filt = lineage_dict[intersect(colnames(data_matrix), names(lineage_dict))] + valid_lineage_dict = lineage_dict_filt[as.character(lineage_dict_filt) %in% names(table(lineage_dict_filt))[table(lineage_dict_filt) %in% valid_lineage_sizes]] + lineage_center = matrix(NA, ncol=length(unique(valid_lineage_dict)), nrow=nrow(data_matrix)); colnames(lineage_center) = unique(valid_lineage_dict); rownames(lineage_center) = rownames(data_matrix) + for (lineage in as.character(unique(valid_lineage_dict))) + { + if (use_median){lineage_center[,lineage] = apply(data_matrix[,names(valid_lineage_dict)[valid_lineage_dict==lineage]], 1, quantile, probs = 0.5, na.rm = TRUE)} + else {lineage_center[,lineage] = rowMeans(data_matrix[,names(valid_lineage_dict)[valid_lineage_dict==lineage]])} + } + x = rowMeans(lineage_center); y = cv_sq(lineage_center); x = log2(x); y = log2(y) + filter = is.na(x) | is.na(y) | is.infinite(x) | is.infinite(y) | (x==0); x=x[!filter]; y=y[!filter]; loess_means = loess(y ~ x, span=0.75, control=loess.control(surface="direct")) + filter = names(which(!filter)) + lineage_center_variation = loess_means$residuals + return(sort(lineage_center_variation[filter], decreasing=T)) + } + + if (ground_truth) {lineage_dict = barcodes} else { + if (length(predicted_lineages)>0){lineage_dict = predicted_lineages} else { + lineage_dict = predicted_lineage_table$clone.ID; names(lineage_dict) = predicted_lineage_table$cell.ID}} + if (hasArg(cell_fate)){match<-cell_fate_analysis[cell_fate_analysis$cell.fate ==cell_fate,] + lineage_dict=lineage_dict[names(lineage_dict)%in% match$cell.ID]} + + lineage_center_variation = markers_by_cvsq_of_lineage_means(data_matrix, lineage_dict, valid_lineage_sizes=valid_lineage_sizes, use_median=use_median) + + data_matrix_control = matrix(NA, ncol=20, nrow=nrow(data_matrix)); rownames(data_matrix_control) = rownames(data_matrix) + for (i in c(1:20)) + { + lineage_dict_sampled = lineage_dict; names(lineage_dict_sampled) = sample(names(lineage_dict)) + tmp = markers_by_cvsq_of_lineage_means(as.matrix(data_matrix), lineage_dict_sampled, valid_lineage_sizes=valid_lineage_sizes, use_median=use_median) + data_matrix_control[names(tmp),i] = tmp + } + markers_pvalue = rowSums(data_matrix_control[intersect(rownames(data_matrix_control), names(lineage_center_variation)),]>lineage_center_variation[intersect(rownames(data_matrix_control), names(lineage_center_variation))], na.rm=T)/20 + + shared_genes = intersect(names(lineage_center_variation), names(markers_pvalue)) + marker_table = data.frame(cbind(lineage_center_variation[shared_genes], markers_pvalue[shared_genes])); rownames(marker_table) = shared_genes; colnames(marker_table) = c("var","p") + marker_table = marker_table[with(marker_table, order(p, -var)),] + + GEMLI_items[["memory_genes"]] = marker_table + return(GEMLI_items) +} diff --git a/GEMLI_package_v1/R/potential_markers.R b/GEMLI_package_v1/R/potential_markers.R new file mode 100644 index 0000000..b7172e3 --- /dev/null +++ b/GEMLI_package_v1/R/potential_markers.R @@ -0,0 +1,58 @@ +#potential_markers <- function(data_matrix) # check +#{ + #means = rowMeans(data_matrix) + #variation = (apply(data_matrix, 1, sd, na.rm=T) / apply(data_matrix, 1, mean, na.rm=T))**2 + #filter = names(which(!(is.na(means) | is.na(variation) | is.infinite(means) | is.infinite(variation) | (means==0)))); x=means[filter]; y=variation[filter] + #linear_fit = lm(log2(y) ~ log2(x)) + #variation_residuals = residuals(linear_fit) + #means = means[filter] + #mean_quantiles = quantile(means, seq(0.01,1,0.01)) + #variation_quantiles = quantile(variation_residuals, seq(0.01,1,0.01)) + #memory_genes = names(which((means>=mean_quantiles[98]) | (means>=mean_quantiles[90] & variation_residuals>=variation_quantiles[40]) | (means>=mean_quantiles[80] & variation_residuals>=variation_quantiles[80]) |(means>=mean_quantiles[60] & variation_residuals>=variation_quantiles[90]))) + #return(memory_genes) +#} + +potential_markers <- function(data_matrix) { + # Calculate row means for a sparse matrix + means <- Matrix::rowMeans(data_matrix) + + # Calculate standard deviation for each row in a sparse matrix + row_sd_sparse <- function(sparse_matrix) { + row_means <- Matrix::rowMeans(sparse_matrix) + row_squares_means <- Matrix::rowMeans(sparse_matrix^2) + sqrt(row_squares_means - row_means^2) + } + sds <- row_sd_sparse(data_matrix) + + # Calculate coefficient of variation squared + variation <- (sds / means)^2 + + # Filter out rows with NA, Inf, or means == 0 + filter <- which(!is.na(means) & !is.na(variation) & !is.infinite(means) & !is.infinite(variation) & means != 0) + x <- means[filter] + y <- variation[filter] + + # Perform linear regression on the log2-transformed data + linear_fit <- lm(log2(y) ~ log2(x)) + + # Get residuals from the linear model + variation_residuals <- residuals(linear_fit) + + # Filter means and variation residuals + means <- means[filter] + + # Calculate quantiles + mean_quantiles <- quantile(means, seq(0.01, 1, 0.01)) + variation_quantiles <- quantile(variation_residuals, seq(0.01, 1, 0.01)) + + # Identify memory genes based on quantiles + memory_genes <- names(which( + (means >= mean_quantiles[98]) | + (means >= mean_quantiles[90] & variation_residuals >= variation_quantiles[40]) | + (means >= mean_quantiles[80] & variation_residuals >= variation_quantiles[80]) | + (means >= mean_quantiles[60] & variation_residuals >= variation_quantiles[90]) + )) + + return(memory_genes) +} + diff --git a/GEMLI_package_v1/R/predict_lineages.R b/GEMLI_package_v1/R/predict_lineages.R new file mode 100644 index 0000000..eed3526 --- /dev/null +++ b/GEMLI_package_v1/R/predict_lineages.R @@ -0,0 +1,69 @@ +predict_lineages <- function(GEMLI_items, repetitions=100, sample_size=(2/3), desired_cluster_size=c(2,3), + verbose=T) { + if (class(GEMLI_items)=='list') { + data_matrix = Matrix::Matrix(GEMLI_items[['gene_expression']]) + } else if (class(GEMLI_items)=='GEMLI') { + data_matrix = Matrix::Matrix(GEMLI_items@gene_expression) + } else { + stop('Object GEMLI_items should be either of class list or GEMLI') + } + + cat('Define marker genes') + marker_genes = potential_markers(data_matrix) + + cat('\nCompute predictions') + #function to apply in parallel + myfun <- function(idx, seed, verbose) { + set.seed(seed) + if (verbose) cat(' sample genes') + marker_genes_sample = sample(marker_genes, round(length(marker_genes) * sample_size, 0)) + cell_clusters <- quantify_clusters_iterative(data_matrix, marker_genes_sample, N=2, verbose) + clustersize_dict = table(paste0( + unlist(lapply(1:ncol(cell_clusters), function(i) rep(i, nrow(cell_clusters)))), + '_', as.numeric(cell_clusters) + )) + smallest_clusters = names(clustersize_dict)[clustersize_dict %in% desired_cluster_size] + smallest_clusters = smallest_clusters[!grepl('_NA$', smallest_clusters)] + best_prediction = matrix(FALSE, nrow=ncol(data_matrix), ncol=ncol(data_matrix)) + rownames(best_prediction) = colnames(data_matrix) + colnames(best_prediction) = colnames(data_matrix) + if (verbose) cat(', loop clusters') + cells_in_cluster = lapply(smallest_clusters, function(cluster) { + col <- as.numeric(strsplit(cluster, '_')[[1]][[1]]) + val <- as.numeric(strsplit(cluster, '_')[[1]][[2]]) + ans = na.omit(rownames(best_prediction)[cell_clusters[, col] == val]) + ans + }) + for (i in 1:length(cells_in_cluster)) { + best_prediction[cells_in_cluster[[i]], cells_in_cluster[[i]]] = T + } + diag(best_prediction) = F + best_prediction + } + + #compute + set.seed(42) #for reproducibility + results_list <- vector('list', repetitions) + for (i in 1:repetitions) { + if (verbose) cat(paste0('\n Repetition', i, ': ')) + results_list[[i]] <- myfun(i, 42 +i, verbose) + } + + #combine results + cat('\nCombine results') + results = matrix(0, nrow=ncol(data_matrix), ncol=ncol(data_matrix)) + rownames(results) = colnames(data_matrix) + colnames(results) = colnames(data_matrix) + for (mat in results_list) { + results <- results + mat + } + + #store results + if (class(GEMLI_items) == 'list') { + GEMLI_items[["prediction"]] = results + } else if (class(GEMLI_items) == 'GEMLI') { + GEMLI_items <- addPrediction(GEMLI_items, results) + } + + return(GEMLI_items) +} diff --git a/GEMLI_package_v1/R/predict_lineages_multiple_sizes.R b/GEMLI_package_v1/R/predict_lineages_multiple_sizes.R new file mode 100644 index 0000000..6e8718c --- /dev/null +++ b/GEMLI_package_v1/R/predict_lineages_multiple_sizes.R @@ -0,0 +1,49 @@ +predict_lineages_multiple_sizes <- function(GEMLI_items, repetitions=10, sample_size=(2/3), minimal_maximal_cluster_size=c(2,50), cutoff=5) # check +{ + if (class(GEMLI_items)=='list') { + data_matrix = Matrix::Matrix(GEMLI_items[['gene_expression']]) + } else if (class(GEMLI_items)=='GEMLI') { + data_matrix = Matrix::Matrix(GEMLI_items@gene_expression) + } else { + stop('Object GEMLI_items should be either of class list or GEMLI') + } + # split out the minimal_maximal_cluster_size into vectors encompassing all combis between the min and max value + # for each of the vectors then generate the lineage prediction and combine them into one table + desired_sizes<-list() + for (m in 2:minimal_maximal_cluster_size[2]){desired_sizes[[m-1]]<-c(1:m)} + GEMLI_prediction_list<-list() + for (j in 1:length(desired_sizes)) { + progress((100*j)/length(desired_sizes)) + sub_desired_cluster_size<-desired_sizes[[j]] + marker_genes = potential_markers(data_matrix) + results = data.matrix(matrix(0, nrow=ncol(data_matrix), ncol=ncol(data_matrix))); rownames(results) = colnames(data_matrix); colnames(results) = colnames(data_matrix) + for (i in seq(1,repetitions)) + { + marker_genes_sample = sample(intersect(marker_genes, rownames(data_matrix)), round(length(intersect(marker_genes, rownames(data_matrix)))*sample_size,0)) + cell_clusters = quantify_clusters_iterative(data_matrix, marker_genes_sample, N=2) + cell_clusters_unique_name = cell_clusters; for (colname in 1:ncol(cell_clusters)){cell_clusters_unique_name[!is.na(cell_clusters_unique_name[,colname]),colname] = paste0(colname,'_',cell_clusters_unique_name[!is.na(cell_clusters_unique_name[,colname]),colname])} + clustersize_dict = table(cell_clusters_unique_name) + + # This is where the desired_cluster_size comes into play + smallest_clusters = names(clustersize_dict)[clustersize_dict %in% sub_desired_cluster_size] + best_prediction = data.matrix(matrix(F, nrow=ncol(data_matrix), ncol=ncol(data_matrix))); rownames(best_prediction) = colnames(data_matrix); colnames(best_prediction) = colnames(data_matrix) + for (cluster in smallest_clusters){cells_in_cluster = rownames(best_prediction)[rowSums(cell_clusters_unique_name==cluster, na.rm=T)>0]; best_prediction[cells_in_cluster,cells_in_cluster] <- T} + diag(best_prediction) = F + results = results + best_prediction + } + # Transform the prediction matrix into a lineage info +network = (results >= cutoff) + network_edges = as.matrix(network) + network_graph = igraph::graph.adjacency(network_edges, mode="undirected", weighted=NULL) + prediction_dict = igraph::clusters(network_graph)$membership + family_table = cbind(names(prediction_dict), as.vector(prediction_dict)) + colnames(family_table) = c("cell.ID", "clone.ID") + GEMLI_prediction_list[[j]] = family_table + } + + # combine in one dataframe that can be used for cluster stability calculation + for(u in 1:length(GEMLI_prediction_list)){colnames(GEMLI_prediction_list[[u]]) <- c("cell.ID",paste0("K", u)) } + GEMLI_prediction_multiple_sizes_output <- Reduce(function(x,y)merge(x,y,by="cell.ID"), GEMLI_prediction_list) + GEMLI_items[['prediction_multiple_sizes']]<-GEMLI_prediction_multiple_sizes_output + return(GEMLI_items) +} diff --git a/GEMLI_package_v1/R/predict_lineages_with_known_markers.R b/GEMLI_package_v1/R/predict_lineages_with_known_markers.R new file mode 100644 index 0000000..6176955 --- /dev/null +++ b/GEMLI_package_v1/R/predict_lineages_with_known_markers.R @@ -0,0 +1,27 @@ +predict_lineages_with_known_markers <- function(GEMLI_items, repetitions=100, sample_size=(2/3), desired_cluster_size=c(2,3)) +{ + if (class(GEMLI_items)=='list') { + data_matrix = Matrix::Matrix(GEMLI_items[['gene_expression']]) + } else if (class(GEMLI_items)=='GEMLI') { + data_matrix = Matrix::Matrix(GEMLI_items@gene_expression) + } else { + stop('Object GEMLI_items should be either of class list or GEMLI') + } + norm_data = norm_data = data_matrix + marker_genes = GEMLI_items[['known_markers']] + results = data.matrix(matrix(0, nrow=ncol(norm_data), ncol=ncol(norm_data))); rownames(results) = colnames(norm_data); colnames(results) = colnames(norm_data) + for (i in seq(1,repetitions)) + { + marker_genes_sample = sample(intersect(marker_genes, rownames(norm_data)), round(length(intersect(marker_genes, rownames(norm_data)))*sample_size,0)) + cell_clusters = quantify_clusters_iterative(norm_data, marker_genes_sample, N=2) + cell_clusters_unique_name = cell_clusters; for (colname in 1:ncol(cell_clusters)){cell_clusters_unique_name[!is.na(cell_clusters_unique_name[,colname]),colname] = paste0(colname,'_',cell_clusters_unique_name[!is.na(cell_clusters_unique_name[,colname]),colname])} + clustersize_dict = table(cell_clusters_unique_name) + smallest_clusters = names(clustersize_dict)[clustersize_dict %in% desired_cluster_size] + best_prediction = data.matrix(matrix(F, nrow=ncol(norm_data), ncol=ncol(norm_data))); rownames(best_prediction) = colnames(norm_data); colnames(best_prediction) = colnames(norm_data) + for (cluster in smallest_clusters){cells_in_cluster = rownames(best_prediction)[rowSums(cell_clusters_unique_name==cluster, na.rm=T)>0]; best_prediction[cells_in_cluster,cells_in_cluster] <- T} + diag(best_prediction) = F + results = results + best_prediction + } + GEMLI_items[["prediction"]] = results + return(GEMLI_items) +} diff --git a/GEMLI_package_v1/R/prediction_to_lineage_information.R b/GEMLI_package_v1/R/prediction_to_lineage_information.R new file mode 100644 index 0000000..9fc2d5b --- /dev/null +++ b/GEMLI_package_v1/R/prediction_to_lineage_information.R @@ -0,0 +1,13 @@ +library(igraph) +prediction_to_lineage_information <- function(GEMLI_items, cutoff=50, output_as_dict=T) # check +{ + lineage_predictions_matrix = GEMLI_items[["prediction"]] + network = (lineage_predictions_matrix >= cutoff) + network_edges = as.matrix(network) + network_graph = igraph::graph.adjacency(network_edges, mode="undirected", weighted=NULL) + family_dict = igraph::clusters(network_graph)$membership + GEMLI_items[["predicted_lineages"]] = family_dict + family_table = cbind(names(family_dict), as.vector(family_dict)); colnames(family_table) = c("cell.ID", "clone.ID") + GEMLI_items[["predicted_lineage_table"]] = family_table + return(GEMLI_items) +} diff --git a/GEMLI_package_v1/R/quantify_clusters_iterative.R b/GEMLI_package_v1/R/quantify_clusters_iterative.R new file mode 100644 index 0000000..5a674f5 --- /dev/null +++ b/GEMLI_package_v1/R/quantify_clusters_iterative.R @@ -0,0 +1,34 @@ +quantify_clusters_iterative <- function(data_matrix, marker_genes, N = 2, verbose=T) { + iterate <- TRUE + i <- 2 + if (verbose) cat(', get correlation') + corr_expr_raw <- (1 - calculate_correlations(data_matrix[marker_genes, , drop=F]) ) / 2 + cell_clusters <- Matrix::Matrix(0, nrow = ncol(data_matrix), ncol = 1, sparse = F) + rownames(cell_clusters) <- colnames(data_matrix) + cell_clusters[, 1] <- rep(1, ncol(data_matrix)) + + if (verbose) cat(', iterate') + while (iterate) { + cell_clusters <- cbind(cell_clusters, Matrix::Matrix(0, nrow = nrow(cell_clusters), sparse = F)) + unique_clusters <- setdiff(unique(cell_clusters[, (i - 1)]), 0) + + for (cluster in unique_clusters) { + cluster_indices <- which(cell_clusters[, (i - 1)] == cluster) + cells_in_cluster <- rownames(cell_clusters)[cluster_indices] + + if (length(cells_in_cluster) >= 4) { + clustering <- cutree(fastcluster::hclust(as.dist(corr_expr_raw[cells_in_cluster, cells_in_cluster]), method = "ward.D2"), k = N) + cell_clusters[cluster_indices, i] <- as.vector(clustering) + max(c(0, cell_clusters[, i]), na.rm = TRUE) + } + } + dim(cell_clusters) + + if (sum(cell_clusters[, i], na.rm = TRUE) == 0) { + iterate <- FALSE + } + i <- i + 1 + } + lapply(c("corr_expr_raw", "unique_clusters", "cluster_indices", "cells_in_cluster", "clustering"), function(obj) if (exists(obj)) rm(list = obj)) + cell_clusters[cell_clusters == 0] <- NA + return(cell_clusters) +} diff --git a/GEMLI_package_v1/R/suggest_network_trimming_to_size.R b/GEMLI_package_v1/R/suggest_network_trimming_to_size.R new file mode 100644 index 0000000..9922a7c --- /dev/null +++ b/GEMLI_package_v1/R/suggest_network_trimming_to_size.R @@ -0,0 +1,60 @@ +suggest_network_trimming_to_size <- function(GEMLI_items, max_size=4, cutoff=70, max_edge_width=5, display_orphan=F, include_labels=F, ground_truth=F, layout_style="fr") +{ + if (class(GEMLI_items)=='list') { + barcodes = GEMLI_items[['barcodes']] + } else if (class(GEMLI_items)=='GEMLI') { + barcodes = GEMLI_items@barcodes + } else { + stop('Object GEMLI_items should be either of class list or GEMLI') + } + lineage_predictions_matrix_original = GEMLI_items[["prediction"]] + predicted_lineages_original = GEMLI_items[["predicted_lineages"]] + GEMLI_items_in_trimming = GEMLI_items + predicted_lineages = prediction_to_lineage_information(GEMLI_items, cutoff, output_as_dict=T)$predicted_lineages + if (ground_truth) {lineage_dict = barcodes} else {lineage_dict = prediction_to_lineage_information(GEMLI_items, cutoff, output_as_dict=T)$predicted_lineages} + while (sum(table(GEMLI_items_in_trimming$predicted_lineages)>max_size)!=0) + { predicted_lineages = GEMLI_items_in_trimming$predicted_lineages; + lineage_predictions_matrix = GEMLI_items_in_trimming$prediction + oversized_families = names(table(predicted_lineages))[table(predicted_lineages)>max_size] + for (oversized_family in oversized_families) + { + oversized_familiy_scores = lineage_predictions_matrix[names(which(predicted_lineages==oversized_family)), names(which(predicted_lineages==oversized_family))] + weakest_link = min(oversized_familiy_scores[oversized_familiy_scores!=0]) + oversized_familiy_scores[oversized_familiy_scores==weakest_link] <- 0 + lineage_predictions_matrix[names(which(predicted_lineages==oversized_family)), names(which(predicted_lineages==oversized_family))] = oversized_familiy_scores + } + GEMLI_items_in_trimming$prediction = lineage_predictions_matrix + GEMLI_items_in_trimming$predicted_lineages = prediction_to_lineage_information(GEMLI_items_in_trimming, cutoff, output_as_dict=T)$predicted_lineages + } + # visualization + par(mar=c(0,0,3.5,0)) + layout(mat = matrix(c(1, 2), nrow = 2, ncol = 1), heights = c(3, 1)) + base_colors = rep(c('#a50026','#d73027','#f46d43','#fdae61','#fee090','#e0f3f8','#abd9e9','#74add1','#4575b4','#313695','#40004b','#762a83','#9970ab','#c2a5cf','#e7d4e8','#d9f0d3','#a6dba0','#5aae61','#1b7837','#00441b','#543005','#8c510a','#bf812d','#dfc27d','#f6e8c3','#e0e0e0','#bababa','#878787','#4d4d4d','#1a1a1a'), 100) + network_edges = as.matrix(lineage_predictions_matrix_original) + network_edges[lineage_predictions_matrix[rownames(network_edges),rownames(network_edges)]==0] <- ((-1) * network_edges[lineage_predictions_matrix[rownames(network_edges),rownames(network_edges)]==0]) + network_edges[abs(network_edges)cutoff)])), col = c("black", "black"), bty = "n", lty=1:1, lwd=c(max(edge_width),(min(edge_width)+ (max_edge_width*0.1))), title = "Confidence", title.adj =0, horiz=F, xpd=TRUE, inset=c(.15,0),ncol=1) + # Vertex_color + if (include_labels==T) {legend("topright", legend=c("Color - prediction","Number - cell ID"), bty = "n", title = "Vertex ", title.adj =0.5, horiz=F, xpd=TRUE, inset=c(.05, 0),ncol=1)} else + {legend("topright", legend=c("Color - prediction"), bty = "n", title = "Vertex ", title.adj =0.5, horiz=F, xpd=TRUE, inset=c(.05, 0),ncol=1)} +} diff --git a/GEMLI_package_v1/R/test_lineages.R b/GEMLI_package_v1/R/test_lineages.R new file mode 100644 index 0000000..0381adf --- /dev/null +++ b/GEMLI_package_v1/R/test_lineages.R @@ -0,0 +1,29 @@ +test_lineages <- function(GEMLI_items, valid_fam_sizes=(1:5), max_interval=100, plot_results=F) +{ + if (class(GEMLI_items)=='list') { + lineage_predictions_matrix = GEMLI_items[['prediction']] + lineage_dict_bc = GEMLI_items[['barcodes']] + } else if (class(GEMLI_items)=='GEMLI') { + lineage_predictions_matrix = GEMLI_items@prediction + lineage_dict_bc = GEMLI_items@barcodes + } else { + stop('Object GEMLI_items should be either of class list or GEMLI') + } + valid_family_dict = lineage_dict_bc[as.character(lineage_dict_bc) %in% names(table(lineage_dict_bc))[table(lineage_dict_bc) %in% valid_fam_sizes]] + cell_with_annotation = intersect(rownames(lineage_predictions_matrix), names(valid_family_dict)) + family_dict_filt = valid_family_dict[cell_with_annotation] + real_family_matrix = outer(family_dict_filt[cell_with_annotation], family_dict_filt[cell_with_annotation], FUN='=='); diag(real_family_matrix) = F + results_repeated_annotated = lineage_predictions_matrix[cell_with_annotation, cell_with_annotation] + if (is.na(max_interval)) {intervals = unique(round(seq(0,1,0.1)*max(results_repeated_annotated),0))} else {intervals = unique(round(seq(0,1,0.1)*max_interval,0))} + output_matrix = matrix(NA, ncol=4, nrow=length(intervals)); colnames(output_matrix) = c('precision','TP','FP','sensitivity'); rownames(output_matrix) = intervals + for (interval in intervals){output_matrix[as.character(interval),1:3] = c(sum(real_family_matrix & (results_repeated_annotated>=interval)) / sum(results_repeated_annotated>=interval), sum(real_family_matrix & (results_repeated_annotated>=interval)), sum((!real_family_matrix) & (results_repeated_annotated>=interval)))} + # replace this with an apply function + output_matrix[,"sensitivity"] = output_matrix[,"TP"]/output_matrix["0","TP"] + output_matrix = output_matrix[,c('TP','FP','precision','sensitivity')] + if (plot_results) + { + par(mar=c(4.5, 4.5, 3.5, 4.5)); plot(as.numeric(rownames(output_matrix)), output_matrix[,"precision"], type="o", pch=16, lwd=3, col="darkred", xlab="confidence level", ylab="precision (red)", log="", main="testing lineage prediction", ylim=c(0,1)); par(new=T); plot(as.numeric(rownames(output_matrix)), output_matrix[,"sensitivity"], type="o", pch=16, lwd=3, axes=F, bty="n", xlab="", ylab="", col="grey", log="", ylim=c(0,1)); axis(side=4, at=pretty(range(output_matrix[,"sensitivity"]))); mtext("sensitivity (grey)", side=4, line=3) + } + GEMLI_items[['testing_results']] = output_matrix + return(GEMLI_items) +} diff --git a/GEMLI_package_v1/R/trim_network_to_size.R b/GEMLI_package_v1/R/trim_network_to_size.R new file mode 100644 index 0000000..9600337 --- /dev/null +++ b/GEMLI_package_v1/R/trim_network_to_size.R @@ -0,0 +1,19 @@ +trim_network_to_size <- function(GEMLI_items, max_size=4, cutoff=70) +{ + GEMLI_items_in_trimming = GEMLI_items + while (sum(table(GEMLI_items_in_trimming$predicted_lineages)>max_size)!=0) + { + predicted_lineages = GEMLI_items_in_trimming$predicted_lineages; lineage_predictions_matrix = GEMLI_items_in_trimming$prediction + oversized_families = names(table(predicted_lineages))[table(predicted_lineages)>max_size] + for (oversized_family in oversized_families) + { + oversized_familiy_scores = lineage_predictions_matrix[names(which(predicted_lineages==oversized_family)), names(which(predicted_lineages==oversized_family))] + weakest_link = min(oversized_familiy_scores[oversized_familiy_scores!=0]) + oversized_familiy_scores[oversized_familiy_scores==weakest_link] <- 0 + lineage_predictions_matrix[names(which(predicted_lineages==oversized_family)), names(which(predicted_lineages==oversized_family))] = oversized_familiy_scores + } + GEMLI_items_in_trimming$prediction = lineage_predictions_matrix + GEMLI_items_in_trimming = prediction_to_lineage_information(GEMLI_items_in_trimming, cutoff) + } + return(GEMLI_items_in_trimming) +} diff --git a/GEMLI_package_v1/R/visualize_as_network.R b/GEMLI_package_v1/R/visualize_as_network.R new file mode 100644 index 0000000..c947150 --- /dev/null +++ b/GEMLI_package_v1/R/visualize_as_network.R @@ -0,0 +1,69 @@ +visualize_as_network <- function(GEMLI_items, cutoff=70, max_edge_width=5, display_orphan=F, include_labels=F, ground_truth=F, highlight_FPs=F, layout_style='fr', cell_type_colors=F, title.cex=1, legend.cex=1) +{ + if (class(GEMLI_items)=='list') { + barcodes = GEMLI_items[['barcodes']] + } else if (class(GEMLI_items)=='GEMLI') { + barcodes = GEMLI_items@barcodes + } else { + stop('Object GEMLI_items should be either of class list or GEMLI') + } + par(mar=c(1,1,5,1)) + if (cell_type_colors) {layout(mat = matrix(c(1, 2, 3, 0), nrow = 2, ncol = 2), heights = c(3, 1), widths = c(3,1))} else {layout(mat = matrix(c(1, 2), nrow = 2, ncol = 1), heights = c(3, 1))} + # title + lineage_predictions_matrix = GEMLI_items[["prediction"]] + if (ground_truth) {lineage_dict = barcodes} else {lineage_dict = prediction_to_lineage_information(GEMLI_items, cutoff, output_as_dict=T)$predicted_lineages} + base_colors = rep(c('#a50026','#d73027','#f46d43','#fdae61','#fee090','#e0f3f8','#abd9e9','#74add1','#4575b4','#313695','#40004b','#762a83','#9970ab','#c2a5cf','#e7d4e8','#d9f0d3','#a6dba0','#5aae61','#1b7837','#00441b','#543005','#8c510a','#bf812d','#dfc27d','#f6e8c3','#e0e0e0','#bababa','#878787','#4d4d4d','#1a1a1a'), 100) + if (cell_type_colors) { if (length(GEMLI_items[['cell_type_color']])!=0){ + base_colors = GEMLI_items[['cell_type_color']]$color + } else { + cell.type<-unique(GEMLI_items[['cell_type']]$cell.type) + color<-base_colors[rank(unique(GEMLI_items[['cell_type']]$cell.type))] + GEMLI_items[['cell_type_color']] = data.frame(cell.type, color) + }} else {base_colors = base_colors + } + network_edges = as.matrix(lineage_predictions_matrix) + network_edges[network_edgescutoff)])), col = c("black", "black"), bty = "n", lty=1:1, lwd=c(max(edge_width),(min(edge_width)+ (max_edge_width*0.1))), title = "Confidence", title.adj =0, horiz=F, xpd=TRUE, ncol=1, cex=legend.cex) + # Vertex_color + if ((ground_truth==T) & (cell_type_colors==F) & (include_labels==F)) {legend("bottomright", legend=c("Color - ground truth","White - no ground truth"), bty = "n", title = "Color ", title.adj =0.5, horiz=F, xpd=TRUE, inset=c(.05, 0),ncol=1, cex=legend.cex)} + if ((ground_truth==F) & (cell_type_colors==F) & (include_labels==F)) {legend("bottom", legend=c("prediction",""), bty = "n", title = " Color by", title.adj =0.5, horiz=F, xpd=TRUE,ncol=1, cex=legend.cex)} + if ((ground_truth==T) & (cell_type_colors==F) & (include_labels==T)) {legend("topright", legend=c("Color - ground truth","White - no ground truth","Number - cell ID"), bty = "n", title = "Vertex ", title.adj =0.5, horiz=F, xpd=TRUE, inset=c(.05, 0),ncol=1, cex=legend.cex)} + if ((ground_truth==F) & (cell_type_colors==F) & (include_labels==T)) {legend("top", legend=c("Color - prediction","Number - cell ID"), bty = "n", title = " Vertex", title.adj =0.5, horiz=F, xpd=TRUE, inset=c(0, 0),ncol=1, cex=legend.cex)} + if (cell_type_colors) { + plot(NULL ,xaxt='n',yaxt='n',bty='n',ylab='',xlab='', xlim=0:1, ylim=0:1) + legend("left", legend = GEMLI_items[['cell_type_color']]$cell.type, pch = 16, col = GEMLI_items[['cell_type_color']]$color, title = "Cell type", bty = "o", horiz=F, xpd=TRUE, inset=c(0, 0),ncol=1, cex=legend.cex)} + par(mar=c(5.1, 4.1, 4.1, 2.1)) +} diff --git a/GEMLI_package_v1/data/GEMLI_example_barcode_information.RData b/GEMLI_package_v1/data/GEMLI_example_barcode_information.RData new file mode 100644 index 0000000..6f2502a Binary files /dev/null and b/GEMLI_package_v1/data/GEMLI_example_barcode_information.RData differ diff --git a/GEMLI_package_v1/data/GEMLI_example_data_matrix.RData b/GEMLI_package_v1/data/GEMLI_example_data_matrix.RData new file mode 100644 index 0000000..e530c12 Binary files /dev/null and b/GEMLI_package_v1/data/GEMLI_example_data_matrix.RData differ diff --git a/GEMLI_package_v1/man/DEG_volcano_plot.Rd b/GEMLI_package_v1/man/DEG_volcano_plot.Rd new file mode 100644 index 0000000..7cb7f94 --- /dev/null +++ b/GEMLI_package_v1/man/DEG_volcano_plot.Rd @@ -0,0 +1,51 @@ +\name{DEG_volcano_plot} +\alias{DEG_volcano_plot} +%- Also NEED an '\alias' for EACH other topic documented here. +\title{ +DEG_volcano_plot +} +\description{ +This function plots a simple volcano plot for the differential expressed genes (DEG) called using the function 'cell_fate_DEG_calling'. +} +\usage{ +DEG_volcano_plot(GEMLI_items, name1, name2) +} +\arguments{ + \item{ + GEMLI_items}{GEMLI_items is a list of GEMLI inputs and outputs. '. To run 'DEG_volcano_plot' it should contain a 'DEG' element. The 'DEG' element is the output of the 'cell_fate_DEG_calling' function. It is a dataframe with the columns 'p_val', 'avg_log2FC', 'pct1', 'pct.2', 'p_val-adj'. + } + \item{ + name1}{'name1' is a character vector specifying the first population of cells analysed for DEG calling. It will appears in the title and legend of the volcano plot. It should correspond to the 'ident1' parameter of the 'cell.fate_DEG_calling' function used to generate the 'GEMLI_items' 'DEG' element. + } + \item{ + name2}{'name2' is a character vector specifying the second population of cells analysed for DEG calling. See 'name1' parameter. + } +} +\value{ +'DEG_volcano_plot' plots a volcano plot for the DEG called using the function 'cell_fate_DEG_calling'. +} +\references{ +A.S. Eisele*, M. Tarbier*, A.A. Dormann, V. Pelechano, D.M. Suter | "Gene-expression memory-based prediction of cell lineages from scRNA-seq datasets" | Nature Communications 15, 2744 (2024). https://doi.org/10.1038/s41467-024-47158-y +} +\author{ +Marcel Tarbier and Almut Eisele +} +\examples{ +##---- Should be DIRECTLY executable !! ---- +##-- ==> Define data, use random, +##-- or do help(data=index) for the standard data sets. + +## The function is currently defined as +function (x) +{ + } +} +% Add one or more standard keywords, see file 'KEYWORDS' in the +% R documentation directory (show via RShowDoc("KEYWORDS")): +% \keyword{ ~kwd1 } +% \keyword{ ~kwd2 } +% Use only one keyword per line. +% For non-standard keywords, use \concept instead of \keyword: +% \concept{ ~cpt1 } +% \concept{ ~cpt2 } +% Use only one concept per line. diff --git a/GEMLI_package_v1/man/GEMLI-class.Rd b/GEMLI_package_v1/man/GEMLI-class.Rd new file mode 100644 index 0000000..b036b5c --- /dev/null +++ b/GEMLI_package_v1/man/GEMLI-class.Rd @@ -0,0 +1,33 @@ +\name{GEMLI-class} +\alias{GEMLI-class} +\title{Class "GEMLI"} +\description{ + The \code{GEMLI} class represents a data structure for storing gene expression data, optional barcodes, prediction data, and testing results. +} +\details{ + The \code{GEMLI} class contains the following slots: + \itemize{ + \item{\code{gene_expression}:} A \code{Matrix} object representing the gene expression data. It can be a matrix of counts or a sparse matrix. + \item{\code{barcodes}:} A character vector of barcodes associated with the cells. It defaults to an empty character vector if not provided. + \item{\code{prediction}:} A matrix containing prediction results. It defaults to an empty matrix if not provided. + \item{\code{testing_results}:} A matrix containing testing results. It defaults to an empty matrix if not provided. + } + + This class is designed to support the analysis and manipulation of gene expression data along with associated metadata. +} +\section{Slots}{ + \describe{ + \item{\code{gene_expression}}{A \code{Matrix} object.} + \item{\code{barcodes}}{A character vector.} + \item{\code{prediction}}{A matrix.} + \item{\code{testing_results}}{A matrix.} + } +} +\examples{ + # Example of creating a GEMLI object + gemli_obj <- GEMLI(gene_expression = matrix(rnorm(100), nrow = 10)) + + # Displaying the object + show(gemli_obj) +} + diff --git a/GEMLI_package_v1/man/GEMLI-method.Rd b/GEMLI_package_v1/man/GEMLI-method.Rd new file mode 100644 index 0000000..d1d90d4 --- /dev/null +++ b/GEMLI_package_v1/man/GEMLI-method.Rd @@ -0,0 +1,25 @@ +\name{show,GEMLI-method} +\alias{show,GEMLI-method} +\title{Show Method for GEMLI Objects} +\description{ + Displays a summary of the \code{GEMLI} object. +} +\usage{ + \S4method{show}{GEMLI}(object) +} +\arguments{ + \item{object}{ + An object of class \code{GEMLI}. + } +} +\details{ + The \code{show} method provides a summary of the \code{GEMLI} object, including the number of genes and cells in the gene expression matrix, and the availability of barcodes, predictions, and testing results. +} +\value{ + Prints a summary to the console. No return value. +} +\examples{ + gemli_obj <- GEMLI(gene_expression = matrix(rnorm(100), nrow = 10)) + show(gemli_obj) +} + diff --git a/GEMLI_package_v1/man/GEMLI.Rd b/GEMLI_package_v1/man/GEMLI.Rd new file mode 100644 index 0000000..15477ae --- /dev/null +++ b/GEMLI_package_v1/man/GEMLI.Rd @@ -0,0 +1,40 @@ +\name{GEMLI} +\alias{GEMLI} +\title{Create a GEMLI Object} +\description{ + Initializes a \code{GEMLI} object with gene expression data, optional barcodes, prediction, and testing results. +} +\usage{ + GEMLI(gene_expression, barcodes = character(0), prediction = NULL, testing_results = NULL) +} +\arguments{ + \item{gene_expression}{ + A matrix of gene expression data, a \code{Seurat} object, or a \code{dgCMatrix}. + If a \code{Seurat} object is provided, the function will extract the count matrix. + } + \item{barcodes}{ + A character vector of barcodes. Default is an empty character vector. + } + \item{prediction}{ + A matrix containing prediction data. Default is \code{NULL}, which will be replaced by an empty matrix. + } + \item{testing_results}{ + A matrix containing testing results. Default is \code{NULL}, which will be replaced by an empty matrix. + } +} +\details{ + This function validates inputs for \code{gene_expression}, \code{barcodes}, \code{prediction}, and \code{testing_results}. It converts the gene expression data to a \code{Matrix} object if necessary, and ensures that \code{Seurat} is available if required. + + If \code{prediction} or \code{testing_results} are \code{NULL}, they are replaced with empty matrices. +} +\value{ + Returns an object of class \code{GEMLI} with the provided or default values. +} +\seealso{ + \code{\link{GEMLI-class}} for details on the \code{GEMLI} class. +} +\examples{ + # Example usage of the GEMLI function + gemli_obj <- GEMLI(gene_expression = matrix(rnorm(100), nrow = 10)) +} + diff --git a/GEMLI_package_v1/man/addBarcodes,GEMLI-method.Rd b/GEMLI_package_v1/man/addBarcodes,GEMLI-method.Rd new file mode 100644 index 0000000..f3c4109 --- /dev/null +++ b/GEMLI_package_v1/man/addBarcodes,GEMLI-method.Rd @@ -0,0 +1,28 @@ +\name{addBarcodes,gemli-method} +\alias{addBarcodes,GEMLI-method} +\title{Add Barcodes Method for GEMLI Objects} +\description{ + Adds or updates the barcodes in a \code{GEMLI} object. +} +\usage{ + \S4method{addBarcodes}{GEMLI}(object, barcodes) +} +\arguments{ + \item{object}{ + An object of class \code{GEMLI}. + } + \item{barcodes}{ + A character vector of barcodes to be added or updated. + } +} +\details{ + This method updates the \code{barcodes} slot of the \code{GEMLI} object. It checks that \code{barcodes} is a character vector. +} +\value{ + Returns the updated \code{GEMLI} object. +} +\examples{ + gemli_obj <- GEMLI(gene_expression = matrix(rnorm(100), nrow = 10)) + gemli_obj <- addBarcodes(gemli_obj, barcodes = c("barcode1", "barcode2")) +} + diff --git a/GEMLI_package_v1/man/addBarcodes.Rd b/GEMLI_package_v1/man/addBarcodes.Rd new file mode 100644 index 0000000..516315f --- /dev/null +++ b/GEMLI_package_v1/man/addBarcodes.Rd @@ -0,0 +1,23 @@ +\name{addBarcodes} +\alias{addBarcodes} +\title{Generic Function to Add or Update Barcodes} +\description{ + A generic function to add or update barcodes in an object. +} +\usage{ + addBarcodes(object, barcodes) +} +\arguments{ + \item{object}{ + An object in which barcodes will be added or updated. + } + \item{barcodes}{ + A character vector of barcodes to be added or updated. + } +} +\details{ + This is a generic function that will dispatch to the appropriate method depending on the class of the object. +} +\value{ + The result depends on the specific method for the class of the object. +} diff --git a/GEMLI_package_v1/man/addPrediction.Rd b/GEMLI_package_v1/man/addPrediction.Rd new file mode 100644 index 0000000..8641386 --- /dev/null +++ b/GEMLI_package_v1/man/addPrediction.Rd @@ -0,0 +1,29 @@ +\name{addPrediction} +\alias{addPrediction} +\alias{addPrediction,GEMLI-method} +\title{Add Prediction Method for GEMLI Objects} +\description{ + A generic function to add or update the prediction matrix in a \code{GEMLI} object. +} +\usage{ + \S4method{addPrediction}{GEMLI}(object, prediction) +} +\arguments{ + \item{object}{ + An object of class \code{GEMLI}. + } + \item{prediction}{ + A matrix containing prediction data. + } +} +\details{ + This method updates the \code{prediction} slot of the \code{GEMLI} object. It checks that \code{prediction} is a matrix. +} +\value{ + Returns the updated \code{GEMLI} object. +} +\examples{ + gemli_obj <- GEMLI(gene_expression = matrix(rnorm(100), nrow = 10)) + gemli_obj <- addPrediction(gemli_obj, prediction = matrix(rnorm(20), nrow = 2)) +} + diff --git a/GEMLI_package_v1/man/addTestingResults.Rd b/GEMLI_package_v1/man/addTestingResults.Rd new file mode 100644 index 0000000..5a9cf12 --- /dev/null +++ b/GEMLI_package_v1/man/addTestingResults.Rd @@ -0,0 +1,29 @@ +\name{addTestingResults} +\alias{addTestingResults} +\alias{addTestingResults,GEMLI-method} +\title{Add Testing Results Method for GEMLI Objects} +\description{ + A generic function to add or update the testing results matrix in a \code{GEMLI} object. +} +\usage{ + \S4method{addTestingResults}{GEMLI}(object, testing_results) +} +\arguments{ + \item{object}{ + An object of class \code{GEMLI}. + } + \item{testing_results}{ + A matrix containing testing results. + } +} +\details{ + This method updates the \code{testing_results} slot of the \code{GEMLI} object. It checks that \code{testing_results} is a matrix. +} +\value{ + Returns the updated \code{GEMLI} object. +} +\examples{ + gemli_obj <- GEMLI(gene_expression = matrix(rnorm(100), nrow = 10)) + gemli_obj <- addTestingResults(gemli_obj, testing_results = matrix(rnorm(30), nrow = 3)) +} + diff --git a/GEMLI_package_v1/man/calculate_correlations.Rd b/GEMLI_package_v1/man/calculate_correlations.Rd new file mode 100644 index 0000000..10c4fc2 --- /dev/null +++ b/GEMLI_package_v1/man/calculate_correlations.Rd @@ -0,0 +1,45 @@ +\name{calculate_correlations} +\alias{calculate_correlations} +%- Also NEED an '\alias' for EACH other topic documented here. +\title{ +calculate_correlations +} +\description{ +This function provides a fast way to calculate Spearmans ranked correlation or Pearsons correlation. +} +\usage{ +calculate_correlations(data) +} +\arguments{ + \item{ + data}{'data_matrix' is a gene expression matrix where rownames are genes (features) and column names are cell IDs (samples). + } +} +\value{ +The output is a cell by cell matrix with each value representing the rho value of Spearmans ranked correlation. +} +\references{ +A.S. Eisele*, M. Tarbier*, A.A. Dormann, V. Pelechano, D.M. Suter | "Gene-expression memory-based prediction of cell lineages from scRNA-seq datasets" | Nature Communications 15, 2744 (2024). https://doi.org/10.1038/s41467-024-47158-y +} +\author{ +Marcel Tarbier and Almut Eisele +} +\examples{ +##---- Should be DIRECTLY executable !! ---- +##-- ==> Define data, use random, +##-- or do help(data=index) for the standard data sets. + +## The function is currently defined as +function (x) +{ + } +} +% Add one or more standard keywords, see file 'KEYWORDS' in the +% R documentation directory (show via RShowDoc("KEYWORDS")): +% \keyword{ ~kwd1 } +% \keyword{ ~kwd2 } +% Use only one keyword per line. +% For non-standard keywords, use \concept instead of \keyword: +% \concept{ ~cpt1 } +% \concept{ ~cpt2 } +% Use only one concept per line. diff --git a/GEMLI_package_v1/man/cell_fate_DEG_calling.Rd b/GEMLI_package_v1/man/cell_fate_DEG_calling.Rd new file mode 100644 index 0000000..b5eb01d --- /dev/null +++ b/GEMLI_package_v1/man/cell_fate_DEG_calling.Rd @@ -0,0 +1,57 @@ +\name{cell_fate_DEG_calling} +\alias{cell_fate_DEG_calling} +%- Also NEED an '\alias' for EACH other topic documented here. +\title{ +cell_fate_DEG_calling +} +\description{ +This function calls differential expressed genes (DEG) between cells of specific cell types being members of asymmetric cell lineages (containing members of two or more cell types of interest) or members of symmetric cell lineages (contains members of only one cell type of interest). DEG calling is performed using Seurats FindMarkers function. +} +\usage{ +cell_fate_DEG_calling(GEMLI_items, ident1, ident2, min.pct=0.05, logfc.threshold=0.1) +} +\arguments{ + \item{ + GEMLI_items}{GEMLI_items is a list of GEMLI inputs and outputs. To run 'cell_fate_DEG_calling' it should contain a 'gene_expression' as well as a 'cell_fate_analysis' element. The 'gene_expression' element is a quality controlled and normalised gene expression matrix where rownames are genes (features) and column names are cell IDs (cell barcodes). The 'cell_fate_analysis' element is a data frame with column 'cell.ID, 'clone.ID', 'cell.type' and 'cell.fate' generated by the 'extract_cell_fate_lineages' function. + } + \item{ + ident1}{'ident1' specifies the first 'cell.fate' of the GEMLI_items 'cell_fate_analysis' element to be used for DEG calling. It is a character vector which can encompass several cell fates. Cell fates contain the lineage type (sym or asym for symmetric or asymmetric respectively) and the cell type separated by an underscore. Cell fates can for example be 'sym_DCIS' or 'asym_inv_tumor'. + } + \item{ + ident2}{'ident2' specifies the second 'cell.fate' of the GEMLI_items 'cell_fate_analysis' element to be used for DEG calling. See ident1 for the format. + } + \item{ + min.pct}{'min.pct' is the min.pct parameter of Seurats FindMarker function. It is the minimum fraction of cells in either of the compared cell populations in which a gene should be expressed in order to be tested. the default value is 0.05. + } + \item{ + logfc.threshold}{'logfc.threshold' is the logfc.threshold parameter of Seurats FindMarker function. It limits the DEG calling to genes which show, on average, at least x-fold differences (log-scale) between the two compared cell populations. + } +} +\value{ +'cell_fate_DEG_calling' yields a data frame which is added to the 'GEMLI_items' under the name 'DEG'. The data frame is the output of Seurats FindMarker function and contains the column 'p_val', 'avg_log2FC', 'pct.1', 'pct2' and 'p_val_adj'. +} +\references{ +A.S. Eisele*, M. Tarbier*, A.A. Dormann, V. Pelechano, D.M. Suter | "Gene-expression memory-based prediction of cell lineages from scRNA-seq datasets" | Nature Communications 15, 2744 (2024). https://doi.org/10.1038/s41467-024-47158-y +} +\author{ +Almut Eisele and Marcel Tarbier +} +\examples{ +##---- Should be DIRECTLY executable !! ---- +##-- ==> Define data, use random, +##-- or do help(data=index) for the standard data sets. + +## The function is currently defined as +function (x) +{ + } +} +% Add one or more standard keywords, see file 'KEYWORDS' in the +% R documentation directory (show via RShowDoc("KEYWORDS")): +% \keyword{ ~kwd1 } +% \keyword{ ~kwd2 } +% Use only one keyword per line. +% For non-standard keywords, use \concept instead of \keyword: +% \concept{ ~cpt1 } +% \concept{ ~cpt2 } +% Use only one concept per line. diff --git a/GEMLI_package_v1/man/cell_type_composition_plot.Rd b/GEMLI_package_v1/man/cell_type_composition_plot.Rd new file mode 100644 index 0000000..735e732 --- /dev/null +++ b/GEMLI_package_v1/man/cell_type_composition_plot.Rd @@ -0,0 +1,56 @@ +\name{cell_type_composition_plot} +\alias{cell_type_composition_plot} +%- Also NEED an '\alias' for EACH other topic documented here. +\title{ +cell_type_composition_plot +} +\description{ +This function generates simple plots of the cell type composition of predicted or ground truth lineages. +} +\usage{ +cell_type_composition_plot(GEMLI_items, ground_truth=F, cell_type_colors=F, type, intersections=NULL) +} +\arguments{ + \item{ + GEMLI_items}{GEMLI_items is a list of GEMLI inputs and outputs. To run 'cell_type_composition_plot' it should contain a 'barcodes' (composition of ground truth) or 'predicted_lineage_table' (composition of predicted lineages) element. The 'predicted_lineage_table' is the output of the 'prediction_to_lineage_info' function. + } + \item{ + ground_truth}{==T/TRUE indicates that the composition of ground truth lineages is analyzed. If 'ground_truth'==F, the compositionof predicted lineages is analyzed. Default is F. + } + \item{ + cell_type_colors}{'cell_type_colors'==T/TRUE specifies that custom colors for every cell type stored in GEMLI_items 'cell_type_colors' elment should be used. Default is F. + } + \item{ + type}{'type' specifies which of three plots is generated. Type can be 'bubble', 'upsetR', or 'plain'. type='plain' will output a simple table of the number of lineages for different cell type combinations. type='upsetR' will generate an upsetR plot showing the number of lineages for different cell type combinations. type='bubble' will generate a bubble plot of the cell type composition of individual lineages. This is especially meaningful when analyzing multicellular structures. + } + \item{ + intersections}{'intersections' AlmutNeeded} +} +\value{ +'cell_type_composition_plot' yields one of three possible plot types (see 'type') specifying the cell type composition of ground truth or predicted lineages. +} +\references{ +A.S. Eisele*, M. Tarbier*, A.A. Dormann, V. Pelechano, D.M. Suter | "Gene-expression memory-based prediction of cell lineages from scRNA-seq datasets" | Nature Communications 15, 2744 (2024). https://doi.org/10.1038/s41467-024-47158-y +} +\author{ +Almut Eisele and Marcel Tarbier +} +\examples{ +##---- Should be DIRECTLY executable !! ---- +##-- ==> Define data, use random, +##-- or do help(data=index) for the standard data sets. + +## The function is currently defined as +function (x) +{ + } +} +% Add one or more standard keywords, see file 'KEYWORDS' in the +% R documentation directory (show via RShowDoc("KEYWORDS")): +% \keyword{ ~kwd1 } +% \keyword{ ~kwd2 } +% Use only one keyword per line. +% For non-standard keywords, use \concept instead of \keyword: +% \concept{ ~cpt1 } +% \concept{ ~cpt2 } +% Use only one concept per line. diff --git a/GEMLI_package_v1/man/cluster_stability_plot.Rd b/GEMLI_package_v1/man/cluster_stability_plot.Rd new file mode 100644 index 0000000..d6318f1 --- /dev/null +++ b/GEMLI_package_v1/man/cluster_stability_plot.Rd @@ -0,0 +1,45 @@ +\name{cluster_stability_plot} +\alias{cluster_stability_plot} +%- Also NEED an '\alias' for EACH other topic documented here. +\title{ +cluster_stability_plot +} +\description{ +This function calculates a cluster stability index for lineage predictions allowing for different cluster sizes and generates a plot of cluster stability index vs cluster size. For multicellular structures, the cluster size at which the cluster stability index plateaus allows to estimate the maximal size of the multicellular structures present in the data. This is the cluster size till which lineage predictions will have a high precision, and the cluster size at which recovery will be maximal. +} +\usage{ +cluster_stability_plot(GEMLI_items) +} +\arguments{ + \item{ + GEMLI_items}{GEMLI_items is a list of GEMLI inputs and outputs. To run 'cluster_stability_plot' it should contain a prediction matrix named 'prediction_multiple_sizes' that is generated and added to the items list by the function 'predict lineages_multiple_sizes'. + } +} +\value{ +'cluster_stability_plot' yields a plot of cluster stability index vs cluster size based on which the size of multicellular structures present in the single-cell RNA-sequencing dataset can be estimated. +} +\references{ +A.S. Eisele*, M. Tarbier*, A.A. Dormann, V. Pelechano, D.M. Suter | "Gene-expression memory-based prediction of cell lineages from scRNA-seq datasets" | Nature Communications 15, 2744 (2024). https://doi.org/10.1038/s41467-024-47158-y +} +\author{ +Almut Eisele and Marcel Tarbier +} +\examples{ +##---- Should be DIRECTLY executable !! ---- +##-- ==> Define data, use random, +##-- or do help(data=index) for the standard data sets. + +## The function is currently defined as +function (x) +{ + } +} +% Add one or more standard keywords, see file 'KEYWORDS' in the +% R documentation directory (show via RShowDoc("KEYWORDS")): +% \keyword{ ~kwd1 } +% \keyword{ ~kwd2 } +% Use only one keyword per line. +% For non-standard keywords, use \concept instead of \keyword: +% \concept{ ~cpt1 } +% \concept{ ~cpt2 } +% Use only one concept per line. diff --git a/GEMLI_package_v1/man/data_matrix.Rd b/GEMLI_package_v1/man/data_matrix.Rd new file mode 100644 index 0000000..c0178e1 --- /dev/null +++ b/GEMLI_package_v1/man/data_matrix.Rd @@ -0,0 +1,20 @@ +\name{data_matrix} +\docType{data} +\alias{data_matrix} +\title{Example dataset for GEMLI} +\description{ + Example of gene expression table. +} +\usage{ +data(GEMLI_example_data_matrix) +} +\format{ + A data frame with 32285 genes (rows) and 311 cells (columns). +} +\examples{ +\dontrun{ + data(GEMLI_example_data_matrix) + summary(data_matrix) +} +} +\keyword{datasets} diff --git a/GEMLI_package_v1/man/extract_cell_fate_lineages.Rd b/GEMLI_package_v1/man/extract_cell_fate_lineages.Rd new file mode 100644 index 0000000..3467e7c --- /dev/null +++ b/GEMLI_package_v1/man/extract_cell_fate_lineages.Rd @@ -0,0 +1,54 @@ +\name{extract_cell_fate_lineages} +\alias{extract_cell_fate_lineages} +%- Also NEED an '\alias' for EACH other topic documented here. +\title{ +extract_cell_fate_lineages +} +\description{ +This function extracts symmetric (with all members in one considered cell type) and asymmetric cell lineages (with members in two or more of considered cell types). The function generates the input for the cell_fate_DEG_calling function. +} +\usage{ +extract_cell_fate_lineages(GEMLI, selection, unique=FALSE, threshold) +} +\arguments{ + \item{ + GEMLI}{GEMLI is a list of GEMLI inputs and outputs. To run 'extract_cell_type_lineages' it should contain a 'predicted_lineage_table' as well as a 'cell_type' table. The 'predicted_lineage_table' is generated using the function 'prediction_to_lineage_information'. The 'cell_type' table is a data frame with column 'cell.ID' and celltype'. + } + \item{ + selection}{'selection' specifies the cell types to be considered for the extraction of symmetric and asymmetric lineages. It is a vector of minimal two characters specifying the cell types to be considered. + } + \item{ + unique}{'unique'=TRUE specifies that extracted lineages should contain only the cell types given in the 'selection' parameter. If 'unique'=FALSE, also other cell types, not considered in the lineage selection, can be present in the extraccted lineages. The default value is FALSE. + } + \item{ + threshold}{'threshold' specifies the minimal percentage of a given cell type asymmetric lineages should contain in order to be considered. It is a vector of percentages (numbers) which give the percentages for different cell types in the order in which they are given in the 'selection' parameter. Threshold values for all cell types have to be met for an asymmetric lineage to be kept. + } +} +\value{ +'extract_cell_fate_lineages' yields a data frame which is added to the 'GEMLI_items' under the name 'cell_fate_analysis'. The data frame contains the column 'cell.ID', 'clone.ID', 'cell.type' and 'cell.fate'. the 'cell.fate' column contains the lable 'asym' for selected asymmetric lineages and 'sym' for selected symmetric lineages, followed by the cell type of the specific cell, separated by an underscore (e.g. 'sym_DCIS', or 'asym_inv_tumor'). The function generates the input for the function 'cell_fate_DEG_calling'. +} +\references{ +A.S. Eisele*, M. Tarbier*, A.A. Dormann, V. Pelechano, D.M. Suter | "Gene-expression memory-based prediction of cell lineages from scRNA-seq datasets" | Nature Communications 15, 2744 (2024). https://doi.org/10.1038/s41467-024-47158-y +} +\author{ +Almut Eisele and Marcel Tarbier +} +\examples{ +##---- Should be DIRECTLY executable !! ---- +##-- ==> Define data, use random, +##-- or do help(data=index) for the standard data sets. + +## The function is currently defined as +function (x) +{ + } +} +% Add one or more standard keywords, see file 'KEYWORDS' in the +% R documentation directory (show via RShowDoc("KEYWORDS")): +% \keyword{ ~kwd1 } +% \keyword{ ~kwd2 } +% Use only one keyword per line. +% For non-standard keywords, use \concept instead of \keyword: +% \concept{ ~cpt1 } +% \concept{ ~cpt2 } +% Use only one concept per line. diff --git a/GEMLI_package_v1/man/lineage_dict_bc.Rd b/GEMLI_package_v1/man/lineage_dict_bc.Rd new file mode 100644 index 0000000..be4fd23 --- /dev/null +++ b/GEMLI_package_v1/man/lineage_dict_bc.Rd @@ -0,0 +1,20 @@ +\name{lineage_dict_bc} +\docType{data} +\alias{lineage_dict_bc} +\title{Example dataset for GEMLI} +\description{ + Named integer with 156 values and cell ids as names. +} +\usage{ +data(GEMLI_example_barcode_information) +} +\format{ + A data frame with 32285 genes (rows) and 311 cells (columns). +} +\examples{ +\dontrun{ + data(GEMLI_example_barcode_information) + summary(lineage_dict_bc) +} +} +\keyword{datasets} diff --git a/GEMLI_package_v1/man/memory_gene_calling.Rd b/GEMLI_package_v1/man/memory_gene_calling.Rd new file mode 100644 index 0000000..ee2d69b --- /dev/null +++ b/GEMLI_package_v1/man/memory_gene_calling.Rd @@ -0,0 +1,57 @@ +\name{memory_gene_calling} +\alias{memory_gene_calling} +%- Also NEED an '\alias' for EACH other topic documented here. +\title{ +memory_gene_calling +} +\description{ +This function identifies memory genes (or lineage markers) based on gene expression variability across lineages (either from predictions or from ground truth) from single-cell RNA-sequencing data. +} +\usage{ +memory_gene_calling(GEMLI_items, valid_lineage_sizes=2:5, use_median=T, ground_truth=F, cell_fate) +} +\arguments{ + \item{ + GEMLI_items}{'GEMLI_items' is a list of GEMLI inputs and outputs. To run 'memory_gene_calling' GEMLI_items must contain a 'gene_expression' element. If it is run on bracodes 'GEMLI_items' needs to contain a 'barcodes' element. To run 'memory_gene_calling' on predictions 'GEMLI_items' needs to contain a 'predicted_lineage_table' elment. It is also possible to run 'memory_gene_calling' on a symmetric or asymmetric lineage type of specific cell types. In this case, the GEMLI_items must contain a 'cell_fate_analysis' element. The ‘gene_expression’ element is a quality controlled and normalised gene expression matrix where rownames are genes (features) and column names are cell IDs (cell barcodes). The 'predicted_lineage_table' is generated using the function prediction_to_lineage_information. The 'barcodes' element is a named vector of lineage ground truth (names=cell.ID, value=clone.ID). The 'cell_fate_analysis' element is generated using the 'extract_cell_fate_lineages' function. + } + \item{ + valid_lineage_sizes}{'valid_lineage_sizes' specifies the range of lineage sizes to be included. Depending on the question to be investigated it can be beneficial to either restrict this to small lineages or large lineages respectively. Default is small lineage from 2 to 5 cells (2:5). + } + \item{ + use_median}{'use_median' specifies whether the median of lineages should be used rather than the mean. This makes the approach more robust to outliers. Default is 'true'/'T'. + } + \item{ + ground_truth}{'ground_truth' specifies whether to call memory genes on ground truth instead of predictions. Default is 'false'/'F'. + } + \item{ + cell_fate}{'cell_fate', when present, specifies to call memory genes on specific symmetric or asymmetric lineages. It is a vector of the 'cell.fate' in the 'cell_fate_analysis' GEMLI_items element of the lineages to be used for memory gene calling. Cell fates start with sym or asym, followed by the cell type, separated by an underscore. + } +} +\value{ +'memory_gene_calling' yields a table of gene names or IDs of potential memory genes (row names), as well as their variability ('var') across lineages and a p-values ('p'). This table is stored un the GEMLI_items list as element 'memory_genes'. +} +\references{ +A.S. Eisele*, M. Tarbier*, A.A. Dormann, V. Pelechano, D.M. Suter | "Gene-expression memory-based prediction of cell lineages from scRNA-seq datasets" | Nature Communications 15, 2744 (2024). https://doi.org/10.1038/s41467-024-47158-y +} +\author{ +Marcel Tarbier and Almut Eisele +} +\examples{ +##---- Should be DIRECTLY executable !! ---- +##-- ==> Define data, use random, +##-- or do help(data=index) for the standard data sets. + +## The function is currently defined as +function (x) +{ + } +} +% Add one or more standard keywords, see file 'KEYWORDS' in the +% R documentation directory (show via RShowDoc("KEYWORDS")): +% \keyword{ ~kwd1 } +% \keyword{ ~kwd2 } +% Use only one keyword per line. +% For non-standard keywords, use \concept instead of \keyword: +% \concept{ ~cpt1 } +% \concept{ ~cpt2 } +% Use only one concept per line. diff --git a/GEMLI_package_v1/man/potential_markers.Rd b/GEMLI_package_v1/man/potential_markers.Rd new file mode 100644 index 0000000..2b237d6 --- /dev/null +++ b/GEMLI_package_v1/man/potential_markers.Rd @@ -0,0 +1,32 @@ +\name{potential_markers} +\alias{potential_markers} +%- Also NEED an '\alias' for EACH other topic documented here. +\title{ +potential_markers +} +\description{ +Add description here ! +} +\usage{ +potential_markers(data_matrix) +} +\arguments{ + \item{ + data_matrix}{Explain argument !} +} +\value{ +Explain output ! +} +\references{ +A.S. Eisele*, M. Tarbier*, A.A. Dormann, V. Pelechano, D.M. Suter | "Gene-expression memory-based prediction of cell lineages from scRNA-seq datasets" | Nature Communications 15, 2744 (2024). https://doi.org/10.1038/s41467-024-47158-y +} +\author{ +Marcel Tarbier and Almut Eisele +} +\examples{ +##---- Should be DIRECTLY executable !! ---- +##-- ==> Define data, use random, +##-- or do help(data=index) for the standard data sets. + +## The function is currently defined as +} diff --git a/GEMLI_package_v1/man/predict_lineages.Rd b/GEMLI_package_v1/man/predict_lineages.Rd new file mode 100644 index 0000000..36c3baf --- /dev/null +++ b/GEMLI_package_v1/man/predict_lineages.Rd @@ -0,0 +1,57 @@ +\name{predict_lineages} +\alias{predict_lineages} +%- Also NEED an '\alias' for EACH other topic documented here. +\title{ +predict_lineages +} +\description{ +This function predicts cell lineages from from single-cell RNA-sequencing data. It identifies potential lineage markers based on mean gene expression and gene expression variability and uses these markers in a repeated iterative clustering approach. Subsets of these genes are used to cluster cells until the desired cluster size is reached. This clustering is repeated many times for random subsets. The result is a cell by cell matrix that lists how many times each cell pair clustered together which translates into a confidence score. Cell pairs with high confidence scores are likely to be members of the same lineage. +} +\usage{ +predict_lineages(GEMLI_items, repetitions=100, sample_size=(2/3), desired_cluster_size=c(2,3), verbose=T) +} +\arguments{ + \item{ + GEMLI_items}{GEMLI_items is a list of GEMLI inputs and outputs. To run 'predict_lineages' it should contain a gene expression matrix named 'gene_expression'. This is a quality controlled and normalized gene expression matrix where rownames are genes (features) and column names are cell IDs (samples). + } + \item{ + repetitions}{'repetitions' specifies how many times the input matrix will be clustered using random subsamples of potential markers. A higher number of iterations leads to more robust results. 10 iterations is considered to be the minimum, but 100 iterations are strongly recommended (default value). Runtime is linear with regard to the number of iterations. + } + \item{ + sample_size}{'sample_size' is a value between 0 and 1 and specifies the fraction of potential markers that are used in each clustering. values between 0.5 and 0.67 (default value) are recommended. + } + \item{ + desired_cluster_size}{'desired_cluster_size' specifies the number of cells per cluster to be achieved in each clustering. The input is a list of values, e.g. c(2,3,4) or (2:4). The desired_cluster_size parameter should generally be small. Values between 2 and 4 are recommended (default). + } + \item{ + verbose}{'verbose' if you want to print information on screen for each repetition. + } +} +\value{ +'predict_lineages' yields a cell by cell matrix containing confidence scores. Cell pairs with high confidence scores are likely to be members of the same lineage. This matrix is added to the 'GEMLI_items' under the name 'prediction'. +} +\references{ +A.S. Eisele*, M. Tarbier*, A.A. Dormann, V. Pelechano, D.M. Suter | "Gene-expression memory-based prediction of cell lineages from scRNA-seq datasets" | Nature Communications 15, 2744 (2024). https://doi.org/10.1038/s41467-024-47158-y +} +\author{ +Marcel Tarbier and Almut Eisele +} +\examples{ +##---- Should be DIRECTLY executable !! ---- +##-- ==> Define data, use random, +##-- or do help(data=index) for the standard data sets. + +## The function is currently defined as +function (x) +{ + } +} +% Add one or more standard keywords, see file 'KEYWORDS' in the +% R documentation directory (show via RShowDoc("KEYWORDS")): +% \keyword{ ~kwd1 } +% \keyword{ ~kwd2 } +% Use only one keyword per line. +% For non-standard keywords, use \concept instead of \keyword: +% \concept{ ~cpt1 } +% \concept{ ~cpt2 } +% Use only one concept per line. diff --git a/GEMLI_package_v1/man/predict_lineages_multiple_sizes.Rd b/GEMLI_package_v1/man/predict_lineages_multiple_sizes.Rd new file mode 100644 index 0000000..31eccc0 --- /dev/null +++ b/GEMLI_package_v1/man/predict_lineages_multiple_sizes.Rd @@ -0,0 +1,57 @@ +\name{predict_lineages_multiple_sizes} +\alias{predict_lineages_multiple_sizes} +%- Also NEED an '\alias' for EACH other topic documented here. +\title{ +predict_lineages_multiple_sizes +} +\description{ +This function predicts cell lineages from single-cell RNA-sequencing data with sizes ranging from a minimal to a maximal value. The prediction of lineages is performed as for the ‘predict_lineages’ function not only for a single desired lineage size, but independently for all cluster sizes in between a minimal and maximal value. For each prediction, the predicted lineage information is generated as for the ‘prediction_to_lineage_information function’. The function generates the input for the function ‘cluster_stability_plot’, which allows to estimate the size of multicellular structures present in the single-cell RNA-sequencing data. + } +\usage{ +predict_lineages_multiple_sizes(GEMLI_items, repetitions=10, sample_size=(2/3), minimal_maximal_cluster_size=c(2,50), cutoff=5) +} +\arguments{ + \item{ + GEMLI_items}{GEMLI_items is a list of GEMLI inputs and outputs. To run 'predict_lineages' it should contain a gene expression matrix named 'gene_expression'. This is a quality controlled and normalized gene expression matrix where rownames are genes (features) and column names are cell IDs (samples). + } + \item{ + repetitions}{'repetitions' specifies how many times the input matrix will be clustered using random subsamples of potential markers. A higher number of iterations leads to more robust results. 10 iterations is considered to be the minimum, but 100 iterations are strongly recommended (default value). Runtime is linear with regard to the number of iterations. + } + \item{ + sample_size}{'sample_size' is a value between 0 and 1 and specifies the fraction of potential markers that are used in each clustering. values between 0.5 and 0.67 (default value) are recommended. + } + \item{ + minimal_maximal_cluster_size}{'minimal_maximal_cluster_size' gives the minimal and maximal number of cells per cluster for which independent lineage predictions are run. The input is a vector of two values (minimal, maximal), e.g. c(2,50). The maximal value chosen should correspond to a value close or above the maximal expected size of multicellular structures present in the single-cell RNA-sequencing data. The default value is c(2,50). + } + \item{ + cutoff}{'cutoff' specifies the confidence score at which a cell pair is considered to be part of the same lineage. High values (e.g. 70-100) provide high precision but lower sensitivity. Low values (e.g. 30-60) provide higher sensitivity but lower precision. + } +} +\value{ + 'predict_lineages_multiple_sizes' yields a cell by lineage size matrix containing lineage IDs. This matrix is added to the 'GEMLI_items' under the name 'prediction_multiple_sizes'. The function generates the input for the function 'cluster_stability_plot', which allows to estimate the size of multicellular structures present in the single-cell RNA-sequencing data +} +\references{ +A.S. Eisele*, M. Tarbier*, A.A. Dormann, V. Pelechano, D.M. Suter | "Gene-expression memory-based prediction of cell lineages from scRNA-seq datasets" | Nature Communications 15, 2744 (2024). https://doi.org/10.1038/s41467-024-47158-y +} +\author{ +Almut Eisele and Marcel Tarbier +} +\examples{ +##---- Should be DIRECTLY executable !! ---- +##-- ==> Define data, use random, +##-- or do help(data=index) for the standard data sets. + +## The function is currently defined as +function (x) +{ + } +} +% Add one or more standard keywords, see file 'KEYWORDS' in the +% R documentation directory (show via RShowDoc("KEYWORDS")): +% \keyword{ ~kwd1 } +% \keyword{ ~kwd2 } +% Use only one keyword per line. +% For non-standard keywords, use \concept instead of \keyword: +% \concept{ ~cpt1 } +% \concept{ ~cpt2 } +% Use only one concept per line. diff --git a/GEMLI_package_v1/man/predict_lineages_with_known_markers.Rd b/GEMLI_package_v1/man/predict_lineages_with_known_markers.Rd new file mode 100644 index 0000000..5e7cd6c --- /dev/null +++ b/GEMLI_package_v1/man/predict_lineages_with_known_markers.Rd @@ -0,0 +1,54 @@ +\name{predict_lineages_with_known_markers} +\alias{predict_lineages_with_known_markers} +%- Also NEED an '\alias' for EACH other topic documented here. +\title{ +predict_lineages_with_known_markers +} +\description{ +This function predicts cell lineages from from single-cell RNA-sequencing data when lineage markers are already knwon, e.g. from a barcoding experiment. Subsets of these genes are used to cluster cells until the desired cluster size is reached. This clustering is repeated many times for random subsets. The result is a cell by cell matrix that lists how many times each cell pair clustered together which translates into a confidence score. Cell pairs with high confidence scores are likely to be members of the same lineage. +} +\usage{ +predict_lineages_with_known_markers(GEMLI_items, repetitions=100, sample_size=(2/3), desired_cluster_size=c(2,3)) +} +\arguments{ + \item{ + GEMLI_items}{GEMLI_items is a list of GEMLI inputs and outputs. To run 'predict_lineages' it should contain a gene expression matrix named 'gene_expression'. This is a quality controlled and normalized gene expression matrix where rownames are genes (features) and column names are cell IDs (samples). In addition it needs to contain a vector of known marker genes names named 'known_markers'. + } + \item{ + repetitions}{'repetitions' specifies how many times the input matrix will be clustered using random subsamples of potential markers. A higher number of iterations leads to more robust results. 10 iterations is considered to be the minimum, but 100 iterations are strongly recommended (default value). Runtime is linear with regard to the number of iterations. + } + \item{ + sample_size}{'sample_size' is a value between 0 and 1 and specifies the fraction of potential markers that are used in each clustering. values between 0.5 and 0.67 (default value) are recommended. + } + \item{ + desired_cluster_size}{'desired_cluster_size' specifies the number of cells per cluster to be achieved in each clustering. The input is a lsit of values, e.g. c(2,3,4) or (2:4). The desired_cluster_size parameter should generally be small. Values between 2 and 4 are recommended (default). + } +} +\value{ +'predict_lineages' yields a cell by cell matrix containing confidence scores. Cell pairs with high confidence scores are likely to be members of the same lineage. This matrix is added to the 'GEMLI_items' under the name 'prediction'. +} +\references{ +A.S. Eisele*, M. Tarbier*, A.A. Dormann, V. Pelechano, D.M. Suter | "Gene-expression memory-based prediction of cell lineages from scRNA-seq datasets" | Nature Communications 15, 2744 (2024). https://doi.org/10.1038/s41467-024-47158-y +} +\author{ +Marcel Tarbier and Almut Eisele +} +\examples{ +##---- Should be DIRECTLY executable !! ---- +##-- ==> Define data, use random, +##-- or do help(data=index) for the standard data sets. + +## The function is currently defined as +function (x) +{ + } +} +% Add one or more standard keywords, see file 'KEYWORDS' in the +% R documentation directory (show via RShowDoc("KEYWORDS")): +% \keyword{ ~kwd1 } +% \keyword{ ~kwd2 } +% Use only one keyword per line. +% For non-standard keywords, use \concept instead of \keyword: +% \concept{ ~cpt1 } +% \concept{ ~cpt2 } +% Use only one concept per line. diff --git a/GEMLI_package_v1/man/prediction_to_lineage_information.Rd b/GEMLI_package_v1/man/prediction_to_lineage_information.Rd new file mode 100644 index 0000000..e435a36 --- /dev/null +++ b/GEMLI_package_v1/man/prediction_to_lineage_information.Rd @@ -0,0 +1,50 @@ +\name{prediction_to_lineage_information} +\alias{prediction_to_lineage_information} +%- Also NEED an '\alias' for EACH other topic documented here. +\title{ +prediction_to_lineage_information +} +\description{ +This transforms a cell by cell matrix of confidence scores as created by the 'predict_lineages' function into a table (or vector, if specified) of predicted lineages. +} +\usage{ +prediction_to_lineage_information(GEMLI_items, cutoff, output_as_dict = T) +} +\arguments{ + \item{ + GEMLI_items}{'GEMLI_items' is a list of GEMLI inputs and outputs. To run 'test_lineages' it should contain a prediction matrix named 'prediction' that is generated and added to the items list by the function 'predict lineages'. + } + \item{ + cutoff}{'cutoff' specifies the confidence score at which a cell pair is considered to be part of the same lineage. High values (e.g. 70-100) provide high precision but lower sensitivity. Low values (e.g. 30-60) provide higher sensitivity but lower precision. The default value is 50. + } + \item{ + output_as_dict}{'output_as_dict' AlmutNeeded } +} +\value{ +The output is a matrix that lists cell IDs in the first column and the predicted lineage in the second column. This is added to the 'GEMLI_items' under the name 'predicted_lineage_table'. It also generates and adds the result as a vector that contains the predicted lineage as values and the cell IDs as names under the name 'predicted_lineages'. +} +\references{ +A.S. Eisele*, M. Tarbier*, A.A. Dormann, V. Pelechano, D.M. Suter | "Gene-expression memory-based prediction of cell lineages from scRNA-seq datasets" | Nature Communications 15, 2744 (2024). https://doi.org/10.1038/s41467-024-47158-y +} +\author{ +Marcel Tarbier and Almut Eisele +} +\examples{ +##---- Should be DIRECTLY executable !! ---- +##-- ==> Define data, use random, +##-- or do help(data=index) for the standard data sets. + +## The function is currently defined as +function (x) +{ + } +} +% Add one or more standard keywords, see file 'KEYWORDS' in the +% R documentation directory (show via RShowDoc("KEYWORDS")): +% \keyword{ ~kwd1 } +% \keyword{ ~kwd2 } +% Use only one keyword per line. +% For non-standard keywords, use \concept instead of \keyword: +% \concept{ ~cpt1 } +% \concept{ ~cpt2 } +% Use only one concept per line. diff --git a/GEMLI_package_v1/man/quantify_clusters_iterative.Rd b/GEMLI_package_v1/man/quantify_clusters_iterative.Rd new file mode 100644 index 0000000..cb148e3 --- /dev/null +++ b/GEMLI_package_v1/man/quantify_clusters_iterative.Rd @@ -0,0 +1,54 @@ +\name{quantify_clusters_iterative} +\alias{quantify_clusters_iterative} +%- Also NEED an '\alias' for EACH other topic documented here. +\title{ +quantify_clusters_iterative +} +\description{ +This function clusters the input matrix repeatedly until a desired cluster size is reached. +} +\usage{ +quantify_clusters_iterative(data_matrix, marker_genes, N=2, verbose=T) +} +\arguments{ + \item{ + data_matrix}{'data_matrix' is a quality controlled and normalized gene expression matrix where rownames are genes (features) and column names are cell IDs (samples). + } + \item{ + marker_genes}{'marker_genes' is a vector of gene names or IDs of potential lineage marker genes. It is automatically created in the 'predict_lineages' function or can be computed manually using the 'potential_markers' function. + } + \item{ + N}{'N' describes in how many branches the data matrix is split in each clustering step. Higher numbers speed up the clustering but can negatively impact the result of the prediction. It is highly recommended to keep the default value, 2. + } + \item{ + verbose}{'verbose' if you want to print information on screen for each repetition. + } +} +\value{ +The output is a matrix that indicates which cells (rows) cluster together in each iteration (colums). It is used in the 'predict_lineages' function. +} +\references{ +A.S. Eisele*, M. Tarbier*, A.A. Dormann, V. Pelechano, D.M. Suter | "Gene-expression memory-based prediction of cell lineages from scRNA-seq datasets" | Nature Communications 15, 2744 (2024). https://doi.org/10.1038/s41467-024-47158-y +} +\author{ +Marcel Tarbier and Almut Eisele +} +\examples{ +##---- Should be DIRECTLY executable !! ---- +##-- ==> Define data, use random, +##-- or do help(data=index) for the standard data sets. + +## The function is currently defined as +function (x) +{ + } +} +% Add one or more standard keywords, see file 'KEYWORDS' in the +% R documentation directory (show via RShowDoc("KEYWORDS")): +% \keyword{ ~kwd1 } +% \keyword{ ~kwd2 } +% Use only one keyword per line. +% For non-standard keywords, use \concept instead of \keyword: +% \concept{ ~cpt1 } +% \concept{ ~cpt2 } +% Use only one concept per line. diff --git a/GEMLI_package_v1/man/suggest_network_trimming_to_size.Rd b/GEMLI_package_v1/man/suggest_network_trimming_to_size.Rd new file mode 100644 index 0000000..810d7a7 --- /dev/null +++ b/GEMLI_package_v1/man/suggest_network_trimming_to_size.Rd @@ -0,0 +1,66 @@ +\name{suggest_network_trimming_to_size} +\alias{suggest_network_trimming_to_size} +%- Also NEED an '\alias' for EACH other topic documented here. +\title{ +suggest_network_trimming_to_size +} +\description{ +This function visualizes the lineage prediction as a network and highlights edges that could be removed based on a maximum lineage size cutoff. In lineages that exceed this size, the weakest links (least likely shared lineages) are suggested to be trimmed until the desired maximum size is reached. Trimming suggestions are highlighted in red. +} +\usage{ +suggest_network_trimming_to_size(GEMLI_items, max_size=4, cutoff=70, max_edge_width=5, display_orphan=F, include_labels=F, ground_truth=F, layout_style="fr") +} +\arguments{ + \item{ + GEMLI_items}{'GEMLI_items' is a list of GEMLI inputs and outputs. To run 'suggest_network_trimming_to_size' it should contain a prediction matrix named 'prediction' that is generated and added to the items list by the function 'predict lineages'. It also needs to contain a vector of lineages (values are the lineages and names are the cell IDs) from predictions named 'predicted_lineages'. This vector can be added to the 'GEMLI_items' using the 'prediction_to_lineage_information' function. + } + \item{ + max_size}{'max_size' specifies maximum size of lineages. + } + \item{ + cutoff}{'cutoff' specifies the confidence score at which a cell pair is considered to be part of the same lineage. High values (e.g. 70-100) provide high precision but lower sensitivity. Low values (e.g. 30-60) provide higher sensitivity but lower precision. Default value is 70. + } + \item{ + max_edge_width}{'max_edge_width' specifies the maximum width of edges in the network visualization. All edge weights above the defined 'cutoff' will be scaled between 0.1*'max_edge_with' and 'max_edge_with'. Default value is 5. + } + \item{ + display_orphan}{'display_orphan' defines whether cells without connections should be displayed. This commonly leads the network plot getting less readable. Therefore the suggestion and the default are 'false'/'F'. + } + \item{ + include_labels}{'include_labels' defines whether nodes should be numbered. Cell IDs are not shown for readability. + } + \item{ + ground_truth}{If the 'GEMLI_items' list contains a 'barcodes' vector with orthogonal lineage information 'ground_truth' can be set 'true'/'T' to color cells according to their lineage. Default is 'false'/'F'. + } + \item{ + layout_style}{Depending on the number of cells, and the size of lineages, different layout styles can improve readability. In the suggest_network_trimming_to_size two layout algorithms can be chosen: Fruchterman-Reingold ("fr") and Kamada-Kawai ("kk"). Default is "fr". + } +} +\value{ +This function has no output. It is for visualization only. +} +\references{ +A.S. Eisele*, M. Tarbier*, A.A. Dormann, V. Pelechano, D.M. Suter | "Gene-expression memory-based prediction of cell lineages from scRNA-seq datasets" | Nature Communications 15, 2744 (2024). https://doi.org/10.1038/s41467-024-47158-y +} +\author{ +Marcel Tarbier and Almut Eisele +} +\examples{ +##---- Should be DIRECTLY executable !! ---- +##-- ==> Define data, use random, +##-- or do help(data=index) for the standard data sets. + +## The function is currently defined as +function (x) +{ + } +} +% Add one or more standard keywords, see file 'KEYWORDS' in the +% R documentation directory (show via RShowDoc("KEYWORDS")): +% \keyword{ ~kwd1 } +% \keyword{ ~kwd2 } +% Use only one keyword per line. +% For non-standard keywords, use \concept instead of \keyword: +% \concept{ ~cpt1 } +% \concept{ ~cpt2 } +% Use only one concept per line. diff --git a/GEMLI_package_v1/man/test_lineages.Rd b/GEMLI_package_v1/man/test_lineages.Rd new file mode 100644 index 0000000..955c71c --- /dev/null +++ b/GEMLI_package_v1/man/test_lineages.Rd @@ -0,0 +1,54 @@ +\name{test_lineages} +\alias{test_lineages} +%- Also NEED an '\alias' for EACH other topic documented here. +\title{ +test_lineages +} +\description{ +This function tests the results of lineage assignments by comparing it to lineage assignments from cell barcoding. +} +\usage{ +test_lineages(GEMLI_items, valid_fam_sizes=(1:5), max_interval=100, plot_results=F) +} +\arguments{ + \item{ + GEMLI_items}{'GEMLI_items' is a list of GEMLI inputs and outputs. To run 'test_lineages' it should contain a prediction matrix named 'prediction' that is generated and added to the items list by the function 'predict lineages'. It also needs to contain a ground truth names 'barcodes' provided as a named vector (values are the lineages and names are the cell IDs). + } + \item{ + valid_fam_sizes}{'valid_fam_sizes' specifies a resonable range for lineage sizes. + } + \item{ + max_interval}{'max_interval' is the number of repetitions used in the 'predict_lineages' function. The default value is 100. + } + \item{ + plot_results}{'plot_results' specifies whether the results of the test are visualized (plotted). + } +} +\value{ +The output is a table that lists the number of false positives (FP) and true positives (TP), the precision (TP/PP, where PP is the number of predicted positives which is the sum of TP and FP), and the sensitivity (TP/P, where P is the number of real positives which is the sum of TP and FN - false negatives). +} +\references{ +A.S. Eisele*, M. Tarbier*, A.A. Dormann, V. Pelechano, D.M. Suter | "Gene-expression memory-based prediction of cell lineages from scRNA-seq datasets" | Nature Communications 15, 2744 (2024). https://doi.org/10.1038/s41467-024-47158-y +} +\author{ +Marcel Tarbier and Almut Eisele +} +\examples{ +##---- Should be DIRECTLY executable !! ---- +##-- ==> Define data, use random, +##-- or do help(data=index) for the standard data sets. + +## The function is currently defined as +function (x) +{ + } +% Add one or more standard keywords, see file 'KEYWORDS' in the +% R documentation directory (show via RShowDoc("KEYWORDS")): +% \keyword{ ~kwd1 } +% \keyword{ ~kwd2 } +% Use only one keyword per line. +% For non-standard keywords, use \concept instead of \keyword: +% \concept{ ~cpt1 } +% \concept{ ~cpt2 } +% Use only one concept per line. +} diff --git a/GEMLI_package_v1/man/trim_network_to_size.Rd b/GEMLI_package_v1/man/trim_network_to_size.Rd new file mode 100644 index 0000000..35a4939 --- /dev/null +++ b/GEMLI_package_v1/man/trim_network_to_size.Rd @@ -0,0 +1,51 @@ +\name{trim_network_to_size} +\alias{trim_network_to_size} +%- Also NEED an '\alias' for EACH other topic documented here. +\title{ +trim_network_to_size +} +\description{ +This function trims predicted lineages that exceed a maximum lineage size. In lineages that exceed this size, the weakest links (least likely shared lineages) are trimmed until the desired maximum size is reached. +} +\usage{ +trim_network_to_size(GEMLI_items, max_size = 4, cutoff = 70) +} +\arguments{ + \item{ + GEMLI_items}{'GEMLI_items' is a list of GEMLI inputs and outputs. To run 'suggest_network_trimming_to_size' it should contain a prediction matrix named 'prediction' that is generated and added to the items list by the function 'predict lineages'. It also needs to contain a vector of lineages (values are the lineages and names are the cell IDs) from predictions named 'predicted_lineages'. This vector can be added to the 'GEMLI_items' using the 'prediction_to_lineage_information' function. + } + \item{ + max_size}{'max_size' specifies maximum size of lineages. + } + \item{ + cutoff}{'cutoff' specifies the confidence score at which a cell pair is considered to be part of the same lineage. High values (e.g. 70-100) provide high precision but lower sensitivity. Low values (e.g. 30-60) provide higher sensitivity but lower precision. Default value is 70. + } +} +\value{ +This function return a 'GEMLI_items' list in which the prediction matrix ('prediction'), the prediction table ('predicted_lineage_table') and the prediction vector ('predicted_lineages') have been trimmed to the specified size. +} +\references{ +A.S. Eisele*, M. Tarbier*, A.A. Dormann, V. Pelechano, D.M. Suter | "Gene-expression memory-based prediction of cell lineages from scRNA-seq datasets" | Nature Communications 15, 2744 (2024). https://doi.org/10.1038/s41467-024-47158-y +} +\author{ +Marcel Tarbier and Almut Eisele +} +\examples{ +##---- Should be DIRECTLY executable !! ---- +##-- ==> Define data, use random, +##-- or do help(data=index) for the standard data sets. + +## The function is currently defined as +function (x) +{ + } +} +% Add one or more standard keywords, see file 'KEYWORDS' in the +% R documentation directory (show via RShowDoc("KEYWORDS")): +% \keyword{ ~kwd1 } +% \keyword{ ~kwd2 } +% Use only one keyword per line. +% For non-standard keywords, use \concept instead of \keyword: +% \concept{ ~cpt1 } +% \concept{ ~cpt2 } +% Use only one concept per line. diff --git a/GEMLI_package_v1/man/visualize_as_network.Rd b/GEMLI_package_v1/man/visualize_as_network.Rd new file mode 100644 index 0000000..b58340a --- /dev/null +++ b/GEMLI_package_v1/man/visualize_as_network.Rd @@ -0,0 +1,75 @@ +\name{visualize_as_network} +\alias{visualize_as_network} +%- Also NEED an '\alias' for EACH other topic documented here. +\title{ +visualize_as_network +} +\description{ +This function visualizes the lineage prediction as a network. +} +\usage{ +visualize_as_network(GEMLI_items, cutoff=70, max_edge_width=5, display_orphan=F, include_labels=F, ground_truth=F, highlight_FPs=F, layout_style="fr", cell_type_colors=F, title.cex=1, legend.cex=1) +} +\arguments{ + \item{ + GEMLI_items}{GEMLI_items is a list of GEMLI inputs and outputs. To run 'visualize_as_network' it should contain a prediction matrix named 'prediction' that is generated and added to the items list by the function 'predict lineages'. + } + \item{ + cutoff}{'cutoff' specifies the confidence score at which a cell pair is considered to be part of the same lineage. High values (e.g. 70-100) provide high precision but lower sensitivity. Low values (e.g. 30-60) provide higher sensitivity but lower precision. Default value is 70. + } + \item{ + max_edge_width}{'max_edge_width' specifies the maximum width of edges in the network visualization. All edge weights above the defined 'cutoff' will be scaled between 0.1*'max_edge_with' and 'max_edge_with'. Default value is 5. + } + \item{ + display_orphan}{'display_orphan' defines whether cells without connections should be displayed. This commonly leads the network plot getting less readable. Therefore the suggestion and the default are 'false'/'F'. + } + \item{ + include_labels}{'include_labels' defines whether nodes should be numbered. Cell IDs are not shown for readability. + } + \item{ + ground_truth}{If the 'GEMLI_items' list contains a 'barcodes' vector with orthogonal lineage information 'ground_truth' can be set 'true'/'T' to color cells according to their lineage. Default is 'false'/'F'. + } + \item{ + highlight_FPs}{If the 'GEMLI_items' list contains a 'barcodes' vector with orthogonal lineage information and 'ground_truth' is 'true'/'T' connections between cell that are false positives will be highlighted in red. It can be set tp 'false'/'F' to not highlight false predictions. Default is 'false'/'F'. + } + \item{ + layout_style}{Depending on the number of cells, and the size of lineages, different layout styles can improve readability. Currently three different network layout algorithms canbe chosen: Fruchterman-Reingold ("fr"), Kamada-Kawai ("kk"), and grid ("grid"). Default is "fr". + } + \item{ + cell_type_colors}{'cell_type_colors' can be set 'true'/'T' to color cells by assigned cell type. For such coloring the GEMLI items list must contain a cell_type element (dataframe with column 'cell.ID' and 'cell.type'). Specific colors will be assigned to specific cell types if a GEMLI items list elemnt 'cell_type_color' is added (dataframe with column 'cell-type' and 'color'). If no 'cell_type_color' element is present, random colors will be assigned to each cell type. Default is cell_type_colors = 'F'. + } + \item{ + title.cex}{'title.cex' Character expansion for title size. + } + \item{ + legend.cex}{'legend.cex' Character expansion for legend size. + } +} +\value{ +WIP +} +\references{ +A.S. Eisele*, M. Tarbier*, A.A. Dormann, V. Pelechano, D.M. Suter | "Gene-expression memory-based prediction of cell lineages from scRNA-seq datasets" | Nature Communications 15, 2744 (2024). https://doi.org/10.1038/s41467-024-47158-y +} +\author{ +Marcel Tarbier and Almut Eisele +} +\examples{ +##---- Should be DIRECTLY executable !! ---- +##-- ==> Define data, use random, +##-- or do help(data=index) for the standard data sets. + +## The function is currently defined as +function (x) +{ + } +} +% Add one or more standard keywords, see file 'KEYWORDS' in the +% R documentation directory (show via RShowDoc("KEYWORDS")): +% \keyword{ ~kwd1 } +% \keyword{ ~kwd2 } +% Use only one keyword per line. +% For non-standard keywords, use \concept instead of \keyword: +% \concept{ ~cpt1 } +% \concept{ ~cpt2 } +% Use only one concept per line. diff --git a/GEMLI_package_v1/vignettes/Example1_simple_lineage_predictions.Rmd b/GEMLI_package_v1/vignettes/Example1_simple_lineage_predictions.Rmd new file mode 100644 index 0000000..7ec9c76 --- /dev/null +++ b/GEMLI_package_v1/vignettes/Example1_simple_lineage_predictions.Rmd @@ -0,0 +1,104 @@ +--- +title: "Example1_simple_lineage_predictions" +output: html_vignette +date: "2023-06-02" +vignette: > + %\VignetteIndexEntry{Example1_simple_lineage_predictions} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} + +--- + +```{r, echo = FALSE, message=FALSE} +knitr::opts_chunk$set(message = FALSE, warning = FALSE) +``` + + +For our first example we'll be looking at mouse embryonic stem cells that have been barcoded and cultured for 48h. We'll be working with a subset of this data for fast processing. In our subset we find 'family sizes' ranging from just two up to five related cells. + +## Load package and example data + +First we load the example data. + +```{r, eval=T, echo=T} +library(GEMLI) +library(igraph) +library(HiClimR) + +load('GEMLI_example_data_matrix.RData') +load('GEMLI_example_barcode_information.RData') + +``` + +## Create a GEMLI items list + +GEMLI's inputs and outputs are stored in a list of objects with predefined names. To run GEMLI you need at least a quality controlled and normalized gene expression matrix (rows = genes/features, colums = cells/samples). In this example we also provide a ground truth for lineages stemming from a barcoding experiment (values = barcode ID, names = cell IDs). + +```{r, eval=T, echo=T} +# Making GEMLI list and storing data +GEMLI_items = list() +GEMLI_items[['gene_expression']] = data_matrix +GEMLI_items[['barcodes']] = lineage_dict_bc + +# A brief look at the loaded data +GEMLI_items[['gene_expression']][9:14,1:5] +GEMLI_items[['barcodes']][1:5] + +``` + +## Perform lineage predictions + +We can then identify cell lineages through repeated iterative clustering (this may take 2-3min). The predict_lineages function takes our GEMLI_items as input. It outputs a matrix of all cells against all cells with values corresponding to a confidence score that they are part of the same lineage. + +```{r, eval=T, echo=T} +# Perform lineage predictions +GEMLI_items = predict_lineages(GEMLI_items) + +# A brief look at the result +GEMLI_items[['prediction']][1:5,15:19] +``` + +## Test lineage prediction + +Since we have barcoding data for this dataset we can test the predicted lineages against our ground truth. The test_lineage_prediction function again takes our GEMLI_items as input. It's important that a predcition has been run first with predict_lineages. It outputs the number of true positive predictions (TP), false positive predictions (FP), as well as precision and sensitivity for various confidence intervals. The output can be visualized by setting plot_results to true/T. + +```{r, eval=T, echo=T, fig.height=6, fig.width=6} +GEMLI_items = test_lineages(GEMLI_items) + +# A brief look at the resulting table +GEMLI_items$testing_results + +# And a run with plotting of the result +GEMLI_items = test_lineages(GEMLI_items, plot_results=T) +``` + +## Visualize predictions as network + +We can also investigate our predictions by visualizing them as a network with the visualize_as_network function. Here we need to set a cutoff that defines which predictions we want to consider. It represents a confidence score and high values yield fewer predictions with high precision while low values yield more predictions with lower precision. + +```{r, eval=T, echo=T, fig.height=6, fig.width=6} + +visualize_as_network(GEMLI_items, cutoff=90) +visualize_as_network(GEMLI_items, cutoff=50) +``` + + +If a ground truth e.g. from barcoding is avalable we can set ground_truth to true/T to highlight false predictions with red edges. Cells without barcode information will be displayed in white. + +```{r, eval=T, echo=T, fig.height=6, fig.width=6} +visualize_as_network(GEMLI_items, cutoff=90, ground_truth=T) +visualize_as_network(GEMLI_items, cutoff=50, ground_truth=T) +``` + +## Extract lineage information + +Now we can extract the lineage information with the prediction_to_lineage_information function. Again we need to set a cutoff that defines which predictions we want to consider. The function outputs both a lineage table and a 'dictionary', a vector that has the lineage number as values and the cell IDs as names. + +```{r, eval=T, echo=T} +GEMLI_items = prediction_to_lineage_information(GEMLI_items, cutoff=50) + +# A brief look at the result +GEMLI_items$predicted_lineage_table[1:5,] + +``` + diff --git a/GEMLI_package_v1/vignettes/Example2_predicting_multicellular_structures.Rmd b/GEMLI_package_v1/vignettes/Example2_predicting_multicellular_structures.Rmd new file mode 100644 index 0000000..29d3ed1 --- /dev/null +++ b/GEMLI_package_v1/vignettes/Example2_predicting_multicellular_structures.Rmd @@ -0,0 +1,98 @@ + +--- +title: "Example2_predicting_multicellular_structures" +output: html_vignette +date: "2023-06-05" +vignette: > + %\VignetteIndexEntry{Example2_predicting_multicellular_structures} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} + +--- + +```{r, echo = FALSE, message=FALSE} +knitr::opts_chunk$set(message = FALSE, warning = FALSE) +``` + +In this example we'll be visualizing GEMLI results from a lineage-annotated dataset of murine intestinal crypts. Intestinal crypts originate from one or few intestinal stem cells, and are composed of a number of different cell types. In the data we find cells from individual crypts that range from just three up to forty-four related cells. The scRNA-seq dataset is derived from Bues et al. 2022 (PMID 35165449) and is publically available under Gene Expression Omnibus accession number GSE148093. + + +## Load package and example data + +First we load the example data. Here we already predicted the lineages using GEMLI and therefore do not include a count matrix, but rather start with the predictions right away. + +```{r, eval=T, echo=T} +library(GEMLI) +library(igraph) +library(HiClimR) + +load('GEMLI_crypts_example_data_matrix.RData') +load('GEMLI_crypts_example_barcode_information.RData') + +``` + +## Create a GEMLI items list + +We then create a GEMLI items list. This list is used to store the data, and create and store the outputs of GEMLI (for details check example one). + +```{r, eval=T, echo=T} + +GEMLI_items_crypts = list() +GEMLI_items_crypts[['prediction']] = Crypts +GEMLI_items_crypts[['barcodes']] = Crypts_bc_dict + +``` + +## Visualize predictions as network + +To visualize large lineages we'll use three different network layout algorithms: Fruchterman-Reingold, Kamada-Kawai, and grid. Each of them has advantages and disadvantages. + +(1) Fruchterman-Reingold dispalys the cells of individual predicted crypts close together with ample space between crypts. This makes it hard to see connections within individual crypts but allows to get a good overview of individual structures. + +(2) Kamada-Kawai spaces individual cells well, so we can see individual connections between them. It may, however, happen that two different predicted crypts are partially overlayed, as can be seen for dark red and bright green lineages on the right side of the plot. + +(3) When the network is layed out as a grid, one gets generally a good overview of the predicted lineages and their connections, but it's hard to see which connections belongs to which cell in the same row. + +```{r, eval=T, echo=T, fig.height=6, fig.width=6} + +visualize_as_network(GEMLI_items_crypts, cutoff=70, display_orphan=F, max_edge_width=1, ground_truth=T, include_labels=F, layout_style="fr") +visualize_as_network(GEMLI_items_crypts, cutoff=70, display_orphan=F, max_edge_width=1, ground_truth=T, include_labels=F, layout_style="kk") +visualize_as_network(GEMLI_items_crypts, cutoff=70, display_orphan=F, max_edge_width=1, ground_truth=T, include_labels=F, layout_style="grid") + +``` + +## Adding cell type information to the GEMLI items list + +The cells of individual intestinal crypts can be assigned to different cell types. This information can be added to the GEMLI items list as 'cell_type' slot in the form of a dataframe with column 'cell.ID' and 'cell.type'. + +```{r, eval=T, echo=T} + +load('GEMLI_crypts_example_cell_type_annotation.RData') +GEMLI_items_crypts[['cell_type']] = Crypts_annotation + +``` + +## Color prediction network visualization by cell type + +The visualization of the lineage predictions can now be colored by the cell type annotation. This allows to see the composition of individual intestinal crypts. + +```{r, eval=T, echo=T, fig.height=6, fig.width=6} + +visualize_as_network(GEMLI_items_crypts, cutoff=70, max_edge_width=5, display_orphan=F, include_labels=F, ground_truth=T, highlight_FPs=T, layout_style="kk", cell_type_colors=T) + +``` + +Specific colors can be assigned to specific cell types by adding a dataframe with column 'cell.type' and 'color' in the GEMLI items list slot 'cell_type_color'. This can also allow to highlight just one or two selected cell types. + +```{r, eval=T, echo=T, fig.height=6, fig.width=6} + +# Adding custom color as cell_type_color element to GEMLI_items +cell.type <- unique(GEMLI_items_crypts[['cell_type']]$cell.type) +color <- c("#5386BD", "skyblue1", "darkgreen", "gold", "red", "darkred", "black") +Cell_type_color <- data.frame(cell.type, color) +GEMLI_items_crypts[['cell_type_color']] = Cell_type_color + +# Make a visualization network +visualize_as_network(GEMLI_items_crypts, cutoff=70, max_edge_width=5, display_orphan=F, include_labels=F, ground_truth=T, highlight_FPs=T, layout_style="kk", cell_type_colors=T) + +``` diff --git a/GEMLI_package_v1/vignettes/Example3_cell_fate_analysis.Rmd b/GEMLI_package_v1/vignettes/Example3_cell_fate_analysis.Rmd new file mode 100644 index 0000000..b688ba0 --- /dev/null +++ b/GEMLI_package_v1/vignettes/Example3_cell_fate_analysis.Rmd @@ -0,0 +1,80 @@ + +--- +title: "Example3_cell_fate_analysis" +output: html_document +date: "2023-06-05" +vignette: > + %\VignetteIndexEntry{Example3_cell_fate_analysis} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r, echo = FALSE, message=FALSE} +knitr::opts_chunk$set(message = FALSE, warning = FALSE) +``` + +For our third example we'll be looking at cell fate decisions in a scRNA-seq dataset of human breast cancer encompassing both ductal carcinoma in situ (DCIS) and invasive tumor (inv_tumor) cells. We'll be working with a subset of this data for fast processing. No ground truth is available. We will study the fate transition from DCIS to invasive tumor cells. The data is derived from a public 10X Genomics dataset associated to the following preprint bioRxiv 2022.10.06.510405; doi: https://doi.org/10.1101/2022.10.06.510405 and downloaded from https://www.10xgenomics.com/products/xenium-in-situ/preview-dataset-human-breast (April 2023). + +## Load example data + +First we load the example data. We already predicted the lineages and extracted the predicted lineages using GEMLI. Furthermore we load the previously generated cell type information for all cells in the dataset. + +```{r, eval=T, echo=T} +library(GEMLI) +library(igraph) +library(HiClimR) +library(dplyr) +library(ggplot2) +library(Seurat) +library(ggrepel) + +load('GEMLI_cancer_example_norm_count.RData') +load('GEMLI_cancer_example_predicted_lineages.RData') +load('GEMLI_cancer_example_cell_type_annotation.RData') + +``` + + +## Create a GEMLI items list + +We then create a GEMLI items list. This list is used to store the data, and create and store the outputs of GEMLI (for details check example one). + +```{r, eval=T, echo=T} + +GEMLI_items = list() +GEMLI_items[['gene_expression']] = Cancer_norm_count +GEMLI_items[['predicted_lineage_table']] = Cancer_predicted_lineages +GEMLI_items[['cell_type']] = Cancer_annotation + +``` + + +## Extract symmetric and asymmetric cell lineages + +We extract now predicted cell lineages with members in only one cell type (symmetric) or in two or more cell types (asymmetric). To analyze the transition from DCIS to invasive breast cancer we will extract symmetric DCIS, asymmetric DCIS and invasive tumor, and symmetric inv_tumor lineages. To exclude lineages with a too large asymmetry, we set a threshold to extract asymmetric lineages containing at least 10% of each cell type. The function output is stored in GEMLI_items 'cell_fate_analysis' item. It is a data frame with a column cell.fate with label sym or asym and cell type separated by an underscore. This cell.fate designation allows to subsequently analyze only a specific cell type in asymmetric cell lineages. + +```{r, eval=T, echo=T} + +GEMLI_items<-extract_cell_fate_lineages(GEMLI_items, selection=c("inv_tumor", "DCIS"), unique=FALSE, threshold=c(10,10)) + +# A brief look at the result +GEMLI_items[['cell_fate_analysis']][1:10,] +table(GEMLI_items[['cell_fate_analysis']]$cell.fate) + +``` + +## Call and visualize DEG for cells in specific lineage types + +Based on the symmetric and asymmetric lineages we extracted, we will now call differentially expressed genes (DEG) specific for cells of specific cell types in specific lineages types. To analyze the transition from DCIS to invasive breast cancer, we notable call DEG for DCIS cells in asymmetric and symmetric lineages. These are genes specific to DCIS cells at the start of the transition. + +```{r, eval=T, echo=T} + +GEMLI_items<-cell_fate_DEG_calling(GEMLI_items, ident1="sym_DCIS", ident2="asym_DCIS", min.pct=0.05, logfc.threshold=0.1) + +# A brief look at the result +GEMLI_items[['DEG']][1:10,] + +# Volcano plot of the DEG analysis +DEG_volcano_plot(GEMLI_items, name1="Sym_DCIS", name2="Asym_DCIS") + +```