From ca408b6ef3efbf3c124cd73114740e29cfd3ae01 Mon Sep 17 00:00:00 2001 From: AtaJadidAhari Date: Mon, 27 Oct 2025 17:40:17 +0100 Subject: [PATCH 1/5] add mergeFds function --- DESCRIPTION | 1 + R/mergeFds.R | 126 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 127 insertions(+) create mode 100644 R/mergeFds.R diff --git a/DESCRIPTION b/DESCRIPTION index 6cf5912..e6741c5 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -120,6 +120,7 @@ Collate: getURLs.R helper-functions.R mergeExternalData.R + mergeFds.R saveHDF5Objects.R RcppExports.R autoencoder.R diff --git a/R/mergeFds.R b/R/mergeFds.R new file mode 100644 index 0000000..fa10ae7 --- /dev/null +++ b/R/mergeFds.R @@ -0,0 +1,126 @@ +#' Merge two FraserDataSet objects with different splice sites and junctions +#' +#' This function merges two FraserDataSet objects that have different +#' splice sites and junctions but no overlapping samples. It combines +#' only the rawCountsJ and rawCountsSS assays and recalculates startID +#' and endID for the merged rowRanges. +#' +#' @param fds1 First FraserDataSet object. +#' @param fds2 Second FraserDataSet object. +#' @param join_type Type of join for rowRanges: "outer" (default) or "inner". +#' @param fds_name Name for the merged FraserDataSet. +#' @return A merged FraserDataSet object containing combined samples and assays. +#' @examples +#' # merged_fds <- mergeFDS(fds1, fds2, join_type="outer") +#' @export +#' @importFrom data.table as.data.table setorder setnames fcoalesce +#' @importFrom GenomeInfoDb seqlevelsInUse +#' @importFrom SummarizedExperiment rowRanges rowData assays +mergeFDS <- function(fds1, fds2, join_type = c("outer", "inner"), fds_name="merged_fds") { + join_type <- match.arg(join_type) + + create_junction_counts <- function(fds){ + chr_levels <- seqlevelsInUse(fds) + + row_ranges <- as.data.table(rowRanges(fds)) + + junction_counts <- as.data.table(assays(fds)$rawCountsJ) + junction_counts <- cbind(junction_counts, row_ranges[, c("seqnames", "start", "end", "width", "strand", "startID", "endID")]) + # Enforce order + junction_counts[, seqnames := factor(as.character(seqnames), levels = chr_levels)] + setorder(junction_counts, seqnames, start) + return (junction_counts) + } + + create_split_counts <- function(fds){ + + chr_levels <- seqlevelsInUse(fds) + + row_ranges <- as.data.table(rowRanges(fds)) + + splice_sites <- rowData(nonSplicedReads(fds)) + splice_site_counts <- as.data.table(assays(fds)$rawCountsSS) + splice_site_counts <- as.data.table(cbind(splice_sites, splice_site_counts)) + + splice_site_lookup <- unique(rbind( + row_ranges[, .(spliceSiteID = startID, seqnames, position = start - 1)], # donor positions + row_ranges[, .(spliceSiteID = endID, seqnames, position = end)] # acceptor positions + )) + + splice_site_lookup[, start := position] + splice_site_lookup[, end := start + 1] + splice_site_lookup[, width := 2] + splice_site_lookup[, position := NULL] + + splice_site_counts <- merge(splice_site_counts, splice_site_lookup, by = "spliceSiteID", all.x = TRUE) + splice_site_counts[, seqnames := factor(as.character(seqnames), levels = chr_levels)] + setorder(splice_site_counts, seqnames, start) + + return (splice_site_counts) + + } + + + junction_counts_1 <- create_junction_counts(fds1) + junction_counts_2 <- create_junction_counts(fds2) + + + + junction_counts_merged <- merge(junction_counts_1, junction_counts_2, by=c("start", "end", "seqnames", "width", "strand"), all=T, ) + junction_counts_merged[is.na(junction_counts_merged)] <- 0 + + positions <- unique(rbind( + junction_counts_merged[, .(seqnames, pos = start)], + junction_counts_merged[, .(seqnames, pos = end)] + )) + setorder(positions, seqnames, pos) + positions[, spliceSiteID := .I] + + + junction_counts_merged <- merge(junction_counts_merged, positions, by.x = c("seqnames", "start"), by.y = c("seqnames", "pos"), all.x = TRUE) + setnames(junction_counts_merged, "spliceSiteID", "startID") + + junction_counts_merged <- merge(junction_counts_merged, positions, by.x = c("seqnames", "end"), by.y = c("seqnames", "pos"), all.x = TRUE) + setnames(junction_counts_merged, "spliceSiteID", "endID") + + junction_counts_merged[ ,c("startID.x","startID.y", "endID.x", "endID.y") := NULL] + + splice_site_counts_1 <- create_split_counts(fds1) + splice_site_counts_2 <- create_split_counts(fds2) + + splice_site_counts_merged <- merge(splice_site_counts_1, splice_site_counts_2, by=c("start", "end", "seqnames", "width", "type"), all=T, ) + splice_site_counts_merged[ ,c("spliceSiteID.x","spliceSiteID.y") := NULL] + splice_site_counts_merged[is.na(splice_site_counts_merged)] <- 0 + + + splice_site_counts_merged <- merge(splice_site_counts_merged, positions, by.x = c("seqnames", "start"), by.y = c("seqnames", "pos"), all.x = TRUE) + positions[, pos := pos - 1] # to merge Donnor values + splice_site_counts_merged <- merge(splice_site_counts_merged, positions, by.x = c("seqnames", "start"), by.y = c("seqnames", "pos"), all.x = TRUE) + + splice_site_counts_merged[, spliceSiteID := fcoalesce(spliceSiteID.x, spliceSiteID.y)] + splice_site_counts_merged[ ,c("spliceSiteID.x","spliceSiteID.y") := NULL] + + + # Subset both colData + # --- Combine colData --- + # Get intersection of column names + shared_cols <- intersect(colnames(colData(fds1)), colnames(colData(fds2))) + + cd1 <- colData(fds1)[, shared_cols, drop = FALSE] + cd2 <- colData(fds2)[, shared_cols, drop = FALSE] + + merged_colData <- rbind(cd1, cd2) + + # --- Create merged FraserDataSet --- + merged_fds <- FraserDataSet( + colData = as.data.table(merged_colData), + junctions = junction_counts_merged, + spliceSites = splice_site_counts_merged, + name = paste0(fds_name) + ) + + merged_fds <- calculatePSIValues(merged_fds, types=fitMetrics(merged_fds)) + + + return(merged_fds) +} \ No newline at end of file From e8087d8feefbb4901697c698c2c274f0afee38d1 Mon Sep 17 00:00:00 2001 From: AtaJadidAhari Date: Tue, 28 Oct 2025 00:13:54 +0100 Subject: [PATCH 2/5] add documents for mergeFds --- NAMESPACE | 2 ++ R/mergeFds.R | 3 +-- man/mergeFDS.Rd | 29 +++++++++++++++++++++++++++++ 3 files changed, 32 insertions(+), 2 deletions(-) create mode 100644 man/mergeFDS.Rd diff --git a/NAMESPACE b/NAMESPACE index 80dbf37..fe7debe 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -61,6 +61,7 @@ export(makeSimulatedFraserDataSet) export(mapSeqlevels) export(mergeCounts) export(mergeExternalData) +export(mergeFDS) export(name) export(nonSplicedReads) export(pVals) @@ -168,6 +169,7 @@ importFrom(GenomeInfoDb,keepStandardChromosomes) importFrom(GenomeInfoDb,seqinfo) importFrom(GenomeInfoDb,seqlengths) importFrom(GenomeInfoDb,seqlevels) +importFrom(GenomeInfoDb,seqlevelsInUse) importFrom(GenomeInfoDb,seqlevelsStyle) importFrom(GenomeInfoDb,seqnames) importFrom(GenomeInfoDb,sortSeqlevels) diff --git a/R/mergeFds.R b/R/mergeFds.R index fa10ae7..e1e4761 100644 --- a/R/mergeFds.R +++ b/R/mergeFds.R @@ -13,7 +13,6 @@ #' @examples #' # merged_fds <- mergeFDS(fds1, fds2, join_type="outer") #' @export -#' @importFrom data.table as.data.table setorder setnames fcoalesce #' @importFrom GenomeInfoDb seqlevelsInUse #' @importFrom SummarizedExperiment rowRanges rowData assays mergeFDS <- function(fds1, fds2, join_type = c("outer", "inner"), fds_name="merged_fds") { @@ -123,4 +122,4 @@ mergeFDS <- function(fds1, fds2, join_type = c("outer", "inner"), fds_name="merg return(merged_fds) -} \ No newline at end of file +} diff --git a/man/mergeFDS.Rd b/man/mergeFDS.Rd new file mode 100644 index 0000000..9fc3998 --- /dev/null +++ b/man/mergeFDS.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/mergeFds.R +\name{mergeFDS} +\alias{mergeFDS} +\title{Merge two FraserDataSet objects with different splice sites and junctions} +\usage{ +mergeFDS(fds1, fds2, join_type = c("outer", "inner"), fds_name = "merged_fds") +} +\arguments{ +\item{fds1}{First FraserDataSet object.} + +\item{fds2}{Second FraserDataSet object.} + +\item{join_type}{Type of join for rowRanges: "outer" (default) or "inner".} + +\item{fds_name}{Name for the merged FraserDataSet.} +} +\value{ +A merged FraserDataSet object containing combined samples and assays. +} +\description{ +This function merges two FraserDataSet objects that have different +splice sites and junctions but no overlapping samples. It combines +only the rawCountsJ and rawCountsSS assays and recalculates startID +and endID for the merged rowRanges. +} +\examples{ +# merged_fds <- mergeFDS(fds1, fds2, join_type="outer") +} From f0a3f1a36972d501355f2a103994314a39ad069a Mon Sep 17 00:00:00 2001 From: AtaJadidAhari Date: Tue, 28 Oct 2025 16:33:08 +0100 Subject: [PATCH 3/5] add tests for mergeFDS --- tests/testthat/test_merge_fds.R | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 tests/testthat/test_merge_fds.R diff --git a/tests/testthat/test_merge_fds.R b/tests/testthat/test_merge_fds.R new file mode 100644 index 0000000..5ce0649 --- /dev/null +++ b/tests/testthat/test_merge_fds.R @@ -0,0 +1,19 @@ +context("Test merge fds function") + +test_that("mergeFDS", { + fds <- createTestFraserDataSet() + + fds_1 <- fds[, 1] + fds_2 <- fds[, c(2,3)] + + merged_fds <- mergeFDS(fds_1, fds_2) + + # Check number of samples + expect_equal(ncol(merged_fds), ncol(fds)) + + # Check number of junctions (features) + expect_equal(nrow(merged_fds), nrow(fds)) + + # Check number of splice sites + expect_equal(nrow(assay(merged_fds, "rawCountsSS")), nrow(assay(fds, "rawCountsSS"))) +}) \ No newline at end of file From 77a2305571732875aa7d0930aa117c84c53b4147 Mon Sep 17 00:00:00 2001 From: AtaJadidAhari Date: Wed, 29 Oct 2025 08:41:13 +0100 Subject: [PATCH 4/5] bump version --- DESCRIPTION | 4 ++-- NEWS | 4 ++++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index e6741c5..7c22620 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: FRASER Type: Package Title: Find RAre Splicing Events in RNA-Seq Data -Version: 2.5.6 -Date: 2025-10-24 +Version: 2.5.7 +Date: 2025-10-29 Authors@R: c( person("Christian", "Mertes", role=c("aut", "cre"), email="mertes@in.tum.de", comment=c(ORCID="0000-0002-1091-205X")), diff --git a/NEWS b/NEWS index 8ee7ded..de7c940 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,7 @@ +CHANGES IN VERSION 2.4.7 +------------------------- + o Added mergeFDS function, which merges two fds FraserDataSet objects. + CHANGES IN VERSION 2.4.6 ------------------------- o Fixing a bug in the annotation, where we miss the annotation for junction, From bbd3c125481c06b3c0bed00943e84c6ae189f410 Mon Sep 17 00:00:00 2001 From: AtaJadidAhari Date: Wed, 29 Oct 2025 12:51:03 +0100 Subject: [PATCH 5/5] fix reviews --- R/mergeFds.R | 45 ++++++++++++++++++++++++++++++++------------- man/mergeFDS.Rd | 11 ++++++++++- 2 files changed, 42 insertions(+), 14 deletions(-) diff --git a/R/mergeFds.R b/R/mergeFds.R index e1e4761..00d20e5 100644 --- a/R/mergeFds.R +++ b/R/mergeFds.R @@ -9,14 +9,18 @@ #' @param fds2 Second FraserDataSet object. #' @param join_type Type of join for rowRanges: "outer" (default) or "inner". #' @param fds_name Name for the merged FraserDataSet. +#' @param workingDir Directory where to store HDF5 and RDS files. Defaults to +#' \code{FRASER_output} in the current working directory. #' @return A merged FraserDataSet object containing combined samples and assays. #' @examples #' # merged_fds <- mergeFDS(fds1, fds2, join_type="outer") #' @export #' @importFrom GenomeInfoDb seqlevelsInUse #' @importFrom SummarizedExperiment rowRanges rowData assays -mergeFDS <- function(fds1, fds2, join_type = c("outer", "inner"), fds_name="merged_fds") { +mergeFDS <- function(fds1, fds2, join_type = c("outer", "inner"), fds_name="merged_fds", workingDir="FRASER_output") { join_type <- match.arg(join_type) + # create a True/False flag for join types + all_flag <- if (join_type == "outer") TRUE else FALSE create_junction_counts <- function(fds){ chr_levels <- seqlevelsInUse(fds) @@ -27,7 +31,7 @@ mergeFDS <- function(fds1, fds2, join_type = c("outer", "inner"), fds_name="merg junction_counts <- cbind(junction_counts, row_ranges[, c("seqnames", "start", "end", "width", "strand", "startID", "endID")]) # Enforce order junction_counts[, seqnames := factor(as.character(seqnames), levels = chr_levels)] - setorder(junction_counts, seqnames, start) + setorder(junction_counts, seqnames, start, end) return (junction_counts) } @@ -59,15 +63,18 @@ mergeFDS <- function(fds1, fds2, join_type = c("outer", "inner"), fds_name="merg } - junction_counts_1 <- create_junction_counts(fds1) junction_counts_2 <- create_junction_counts(fds2) + # Join the junction counts based on join_type + junction_counts_merged <- merge(junction_counts_1, junction_counts_2, by=c("start", "end", "seqnames", "width", "strand"), all=all_flag) - - junction_counts_merged <- merge(junction_counts_1, junction_counts_2, by=c("start", "end", "seqnames", "width", "strand"), all=T, ) - junction_counts_merged[is.na(junction_counts_merged)] <- 0 - + # If outer join, missing numeric counts should become 0. + # For inner join we do NOT coerce NAs to 0 (there shouldn't be rows + # coming from only one side when all=FALSE), so skip replacement. + if (all_flag) { + junction_counts_merged[is.na(junction_counts_merged)] <- 0 + } positions <- unique(rbind( junction_counts_merged[, .(seqnames, pos = start)], junction_counts_merged[, .(seqnames, pos = end)] @@ -75,7 +82,6 @@ mergeFDS <- function(fds1, fds2, join_type = c("outer", "inner"), fds_name="merg setorder(positions, seqnames, pos) positions[, spliceSiteID := .I] - junction_counts_merged <- merge(junction_counts_merged, positions, by.x = c("seqnames", "start"), by.y = c("seqnames", "pos"), all.x = TRUE) setnames(junction_counts_merged, "spliceSiteID", "startID") @@ -87,18 +93,31 @@ mergeFDS <- function(fds1, fds2, join_type = c("outer", "inner"), fds_name="merg splice_site_counts_1 <- create_split_counts(fds1) splice_site_counts_2 <- create_split_counts(fds2) - splice_site_counts_merged <- merge(splice_site_counts_1, splice_site_counts_2, by=c("start", "end", "seqnames", "width", "type"), all=T, ) + splice_site_counts_merged <- merge(splice_site_counts_1, splice_site_counts_2, by=c("start", "end", "seqnames", "width", "type"), all=all_flag) splice_site_counts_merged[ ,c("spliceSiteID.x","spliceSiteID.y") := NULL] - splice_site_counts_merged[is.na(splice_site_counts_merged)] <- 0 + # When outer, fill NA -> 0 (samples absent in one side) + if (all_flag) { + splice_site_counts_merged[is.na(splice_site_counts_merged)] <- 0 + } splice_site_counts_merged <- merge(splice_site_counts_merged, positions, by.x = c("seqnames", "start"), by.y = c("seqnames", "pos"), all.x = TRUE) - positions[, pos := pos - 1] # to merge Donnor values + positions[, pos := pos - 1] # to merge donor values splice_site_counts_merged <- merge(splice_site_counts_merged, positions, by.x = c("seqnames", "start"), by.y = c("seqnames", "pos"), all.x = TRUE) + # Combine spliceSiteID.x and spliceSiteID.y into a single column. + # For each row, use spliceSiteID.x if it’s not NA; otherwise take spliceSiteID.y. + # This keeps whichever ID exists after merging. splice_site_counts_merged[, spliceSiteID := fcoalesce(spliceSiteID.x, spliceSiteID.y)] + + # Set the rest to NULL splice_site_counts_merged[ ,c("spliceSiteID.x","spliceSiteID.y") := NULL] + # when using inner join, ensure we only keep splice sites that are in positions + if (!all_flag) { + splice_site_counts_merged <- splice_site_counts_merged[spliceSiteID %in% positions$spliceSiteID] + junction_counts_merged <- junction_counts_merged[startID %in% positions$spliceSiteID & endID %in% positions$spliceSiteID] + } # Subset both colData # --- Combine colData --- @@ -115,11 +134,11 @@ mergeFDS <- function(fds1, fds2, join_type = c("outer", "inner"), fds_name="merg colData = as.data.table(merged_colData), junctions = junction_counts_merged, spliceSites = splice_site_counts_merged, - name = paste0(fds_name) + name = fds_name, + workingDir = workingDir ) merged_fds <- calculatePSIValues(merged_fds, types=fitMetrics(merged_fds)) - return(merged_fds) } diff --git a/man/mergeFDS.Rd b/man/mergeFDS.Rd index 9fc3998..bb7cd21 100644 --- a/man/mergeFDS.Rd +++ b/man/mergeFDS.Rd @@ -4,7 +4,13 @@ \alias{mergeFDS} \title{Merge two FraserDataSet objects with different splice sites and junctions} \usage{ -mergeFDS(fds1, fds2, join_type = c("outer", "inner"), fds_name = "merged_fds") +mergeFDS( + fds1, + fds2, + join_type = c("outer", "inner"), + fds_name = "merged_fds", + workingDir = "FRASER_output" +) } \arguments{ \item{fds1}{First FraserDataSet object.} @@ -14,6 +20,9 @@ mergeFDS(fds1, fds2, join_type = c("outer", "inner"), fds_name = "merged_fds") \item{join_type}{Type of join for rowRanges: "outer" (default) or "inner".} \item{fds_name}{Name for the merged FraserDataSet.} + +\item{workingDir}{Directory where to store HDF5 and RDS files. Defaults to +\code{FRASER_output} in the current working directory.} } \value{ A merged FraserDataSet object containing combined samples and assays.