diff --git a/DESCRIPTION b/DESCRIPTION index 6cf5912..7c22620 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: FRASER Type: Package Title: Find RAre Splicing Events in RNA-Seq Data -Version: 2.5.6 -Date: 2025-10-24 +Version: 2.5.7 +Date: 2025-10-29 Authors@R: c( person("Christian", "Mertes", role=c("aut", "cre"), email="mertes@in.tum.de", comment=c(ORCID="0000-0002-1091-205X")), @@ -120,6 +120,7 @@ Collate: getURLs.R helper-functions.R mergeExternalData.R + mergeFds.R saveHDF5Objects.R RcppExports.R autoencoder.R diff --git a/NAMESPACE b/NAMESPACE index 80dbf37..fe7debe 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -61,6 +61,7 @@ export(makeSimulatedFraserDataSet) export(mapSeqlevels) export(mergeCounts) export(mergeExternalData) +export(mergeFDS) export(name) export(nonSplicedReads) export(pVals) @@ -168,6 +169,7 @@ importFrom(GenomeInfoDb,keepStandardChromosomes) importFrom(GenomeInfoDb,seqinfo) importFrom(GenomeInfoDb,seqlengths) importFrom(GenomeInfoDb,seqlevels) +importFrom(GenomeInfoDb,seqlevelsInUse) importFrom(GenomeInfoDb,seqlevelsStyle) importFrom(GenomeInfoDb,seqnames) importFrom(GenomeInfoDb,sortSeqlevels) diff --git a/NEWS b/NEWS index 8ee7ded..de7c940 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,7 @@ +CHANGES IN VERSION 2.4.7 +------------------------- + o Added mergeFDS function, which merges two fds FraserDataSet objects. + CHANGES IN VERSION 2.4.6 ------------------------- o Fixing a bug in the annotation, where we miss the annotation for junction, diff --git a/R/mergeFds.R b/R/mergeFds.R new file mode 100644 index 0000000..00d20e5 --- /dev/null +++ b/R/mergeFds.R @@ -0,0 +1,144 @@ +#' Merge two FraserDataSet objects with different splice sites and junctions +#' +#' This function merges two FraserDataSet objects that have different +#' splice sites and junctions but no overlapping samples. It combines +#' only the rawCountsJ and rawCountsSS assays and recalculates startID +#' and endID for the merged rowRanges. +#' +#' @param fds1 First FraserDataSet object. +#' @param fds2 Second FraserDataSet object. +#' @param join_type Type of join for rowRanges: "outer" (default) or "inner". +#' @param fds_name Name for the merged FraserDataSet. +#' @param workingDir Directory where to store HDF5 and RDS files. Defaults to +#' \code{FRASER_output} in the current working directory. +#' @return A merged FraserDataSet object containing combined samples and assays. +#' @examples +#' # merged_fds <- mergeFDS(fds1, fds2, join_type="outer") +#' @export +#' @importFrom GenomeInfoDb seqlevelsInUse +#' @importFrom SummarizedExperiment rowRanges rowData assays +mergeFDS <- function(fds1, fds2, join_type = c("outer", "inner"), fds_name="merged_fds", workingDir="FRASER_output") { + join_type <- match.arg(join_type) + # create a True/False flag for join types + all_flag <- if (join_type == "outer") TRUE else FALSE + + create_junction_counts <- function(fds){ + chr_levels <- seqlevelsInUse(fds) + + row_ranges <- as.data.table(rowRanges(fds)) + + junction_counts <- as.data.table(assays(fds)$rawCountsJ) + junction_counts <- cbind(junction_counts, row_ranges[, c("seqnames", "start", "end", "width", "strand", "startID", "endID")]) + # Enforce order + junction_counts[, seqnames := factor(as.character(seqnames), levels = chr_levels)] + setorder(junction_counts, seqnames, start, end) + return (junction_counts) + } + + create_split_counts <- function(fds){ + + chr_levels <- seqlevelsInUse(fds) + + row_ranges <- as.data.table(rowRanges(fds)) + + splice_sites <- rowData(nonSplicedReads(fds)) + splice_site_counts <- as.data.table(assays(fds)$rawCountsSS) + splice_site_counts <- as.data.table(cbind(splice_sites, splice_site_counts)) + + splice_site_lookup <- unique(rbind( + row_ranges[, .(spliceSiteID = startID, seqnames, position = start - 1)], # donor positions + row_ranges[, .(spliceSiteID = endID, seqnames, position = end)] # acceptor positions + )) + + splice_site_lookup[, start := position] + splice_site_lookup[, end := start + 1] + splice_site_lookup[, width := 2] + splice_site_lookup[, position := NULL] + + splice_site_counts <- merge(splice_site_counts, splice_site_lookup, by = "spliceSiteID", all.x = TRUE) + splice_site_counts[, seqnames := factor(as.character(seqnames), levels = chr_levels)] + setorder(splice_site_counts, seqnames, start) + + return (splice_site_counts) + + } + + junction_counts_1 <- create_junction_counts(fds1) + junction_counts_2 <- create_junction_counts(fds2) + + # Join the junction counts based on join_type + junction_counts_merged <- merge(junction_counts_1, junction_counts_2, by=c("start", "end", "seqnames", "width", "strand"), all=all_flag) + + # If outer join, missing numeric counts should become 0. + # For inner join we do NOT coerce NAs to 0 (there shouldn't be rows + # coming from only one side when all=FALSE), so skip replacement. + if (all_flag) { + junction_counts_merged[is.na(junction_counts_merged)] <- 0 + } + positions <- unique(rbind( + junction_counts_merged[, .(seqnames, pos = start)], + junction_counts_merged[, .(seqnames, pos = end)] + )) + setorder(positions, seqnames, pos) + positions[, spliceSiteID := .I] + + junction_counts_merged <- merge(junction_counts_merged, positions, by.x = c("seqnames", "start"), by.y = c("seqnames", "pos"), all.x = TRUE) + setnames(junction_counts_merged, "spliceSiteID", "startID") + + junction_counts_merged <- merge(junction_counts_merged, positions, by.x = c("seqnames", "end"), by.y = c("seqnames", "pos"), all.x = TRUE) + setnames(junction_counts_merged, "spliceSiteID", "endID") + + junction_counts_merged[ ,c("startID.x","startID.y", "endID.x", "endID.y") := NULL] + + splice_site_counts_1 <- create_split_counts(fds1) + splice_site_counts_2 <- create_split_counts(fds2) + + splice_site_counts_merged <- merge(splice_site_counts_1, splice_site_counts_2, by=c("start", "end", "seqnames", "width", "type"), all=all_flag) + splice_site_counts_merged[ ,c("spliceSiteID.x","spliceSiteID.y") := NULL] + + # When outer, fill NA -> 0 (samples absent in one side) + if (all_flag) { + splice_site_counts_merged[is.na(splice_site_counts_merged)] <- 0 + } + + splice_site_counts_merged <- merge(splice_site_counts_merged, positions, by.x = c("seqnames", "start"), by.y = c("seqnames", "pos"), all.x = TRUE) + positions[, pos := pos - 1] # to merge donor values + splice_site_counts_merged <- merge(splice_site_counts_merged, positions, by.x = c("seqnames", "start"), by.y = c("seqnames", "pos"), all.x = TRUE) + + # Combine spliceSiteID.x and spliceSiteID.y into a single column. + # For each row, use spliceSiteID.x if it’s not NA; otherwise take spliceSiteID.y. + # This keeps whichever ID exists after merging. + splice_site_counts_merged[, spliceSiteID := fcoalesce(spliceSiteID.x, spliceSiteID.y)] + + # Set the rest to NULL + splice_site_counts_merged[ ,c("spliceSiteID.x","spliceSiteID.y") := NULL] + + # when using inner join, ensure we only keep splice sites that are in positions + if (!all_flag) { + splice_site_counts_merged <- splice_site_counts_merged[spliceSiteID %in% positions$spliceSiteID] + junction_counts_merged <- junction_counts_merged[startID %in% positions$spliceSiteID & endID %in% positions$spliceSiteID] + } + + # Subset both colData + # --- Combine colData --- + # Get intersection of column names + shared_cols <- intersect(colnames(colData(fds1)), colnames(colData(fds2))) + + cd1 <- colData(fds1)[, shared_cols, drop = FALSE] + cd2 <- colData(fds2)[, shared_cols, drop = FALSE] + + merged_colData <- rbind(cd1, cd2) + + # --- Create merged FraserDataSet --- + merged_fds <- FraserDataSet( + colData = as.data.table(merged_colData), + junctions = junction_counts_merged, + spliceSites = splice_site_counts_merged, + name = fds_name, + workingDir = workingDir + ) + + merged_fds <- calculatePSIValues(merged_fds, types=fitMetrics(merged_fds)) + + return(merged_fds) +} diff --git a/man/mergeFDS.Rd b/man/mergeFDS.Rd new file mode 100644 index 0000000..bb7cd21 --- /dev/null +++ b/man/mergeFDS.Rd @@ -0,0 +1,38 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/mergeFds.R +\name{mergeFDS} +\alias{mergeFDS} +\title{Merge two FraserDataSet objects with different splice sites and junctions} +\usage{ +mergeFDS( + fds1, + fds2, + join_type = c("outer", "inner"), + fds_name = "merged_fds", + workingDir = "FRASER_output" +) +} +\arguments{ +\item{fds1}{First FraserDataSet object.} + +\item{fds2}{Second FraserDataSet object.} + +\item{join_type}{Type of join for rowRanges: "outer" (default) or "inner".} + +\item{fds_name}{Name for the merged FraserDataSet.} + +\item{workingDir}{Directory where to store HDF5 and RDS files. Defaults to +\code{FRASER_output} in the current working directory.} +} +\value{ +A merged FraserDataSet object containing combined samples and assays. +} +\description{ +This function merges two FraserDataSet objects that have different +splice sites and junctions but no overlapping samples. It combines +only the rawCountsJ and rawCountsSS assays and recalculates startID +and endID for the merged rowRanges. +} +\examples{ +# merged_fds <- mergeFDS(fds1, fds2, join_type="outer") +} diff --git a/tests/testthat/test_merge_fds.R b/tests/testthat/test_merge_fds.R new file mode 100644 index 0000000..5ce0649 --- /dev/null +++ b/tests/testthat/test_merge_fds.R @@ -0,0 +1,19 @@ +context("Test merge fds function") + +test_that("mergeFDS", { + fds <- createTestFraserDataSet() + + fds_1 <- fds[, 1] + fds_2 <- fds[, c(2,3)] + + merged_fds <- mergeFDS(fds_1, fds_2) + + # Check number of samples + expect_equal(ncol(merged_fds), ncol(fds)) + + # Check number of junctions (features) + expect_equal(nrow(merged_fds), nrow(fds)) + + # Check number of splice sites + expect_equal(nrow(assay(merged_fds, "rawCountsSS")), nrow(assay(fds, "rawCountsSS"))) +}) \ No newline at end of file