From ca408b6ef3efbf3c124cd73114740e29cfd3ae01 Mon Sep 17 00:00:00 2001
From: AtaJadidAhari <atajadidahari@gmail.com>
Date: Mon, 27 Oct 2025 17:40:17 +0100
Subject: [PATCH 1/5] add mergeFds function

---
 DESCRIPTION  |   1 +
 R/mergeFds.R | 126 +++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 127 insertions(+)
 create mode 100644 R/mergeFds.R

diff --git a/DESCRIPTION b/DESCRIPTION
index 6cf5912..e6741c5 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -120,6 +120,7 @@ Collate:
     getURLs.R
     helper-functions.R
     mergeExternalData.R
+    mergeFds.R
     saveHDF5Objects.R
     RcppExports.R
     autoencoder.R
diff --git a/R/mergeFds.R b/R/mergeFds.R
new file mode 100644
index 0000000..fa10ae7
--- /dev/null
+++ b/R/mergeFds.R
@@ -0,0 +1,126 @@
+#' Merge two FraserDataSet objects with different splice sites and junctions
+#'
+#' This function merges two FraserDataSet objects that have different
+#' splice sites and junctions but no overlapping samples. It combines
+#' only the rawCountsJ and rawCountsSS assays and recalculates startID
+#' and endID for the merged rowRanges.
+#'
+#' @param fds1 First FraserDataSet object.
+#' @param fds2 Second FraserDataSet object.
+#' @param join_type Type of join for rowRanges: "outer" (default) or "inner".
+#' @param fds_name Name for the merged FraserDataSet.
+#' @return A merged FraserDataSet object containing combined samples and assays.
+#' @examples
+#' # merged_fds <- mergeFDS(fds1, fds2, join_type="outer")
+#' @export
+#' @importFrom data.table as.data.table setorder setnames fcoalesce
+#' @importFrom GenomeInfoDb seqlevelsInUse
+#' @importFrom SummarizedExperiment rowRanges rowData assays
+mergeFDS <- function(fds1, fds2, join_type = c("outer", "inner"), fds_name="merged_fds") {
+  join_type <- match.arg(join_type)
+  
+  create_junction_counts <- function(fds){
+    chr_levels <- seqlevelsInUse(fds)
+    
+    row_ranges <- as.data.table(rowRanges(fds))
+    
+    junction_counts <- as.data.table(assays(fds)$rawCountsJ)
+    junction_counts <- cbind(junction_counts, row_ranges[, c("seqnames", "start", "end", "width", "strand", "startID", "endID")])
+    # Enforce order
+    junction_counts[, seqnames := factor(as.character(seqnames), levels = chr_levels)]
+    setorder(junction_counts, seqnames, start)
+    return (junction_counts)
+  }
+  
+  create_split_counts <- function(fds){
+    
+    chr_levels <- seqlevelsInUse(fds)
+    
+    row_ranges <- as.data.table(rowRanges(fds))
+    
+    splice_sites <- rowData(nonSplicedReads(fds))
+    splice_site_counts <- as.data.table(assays(fds)$rawCountsSS)
+    splice_site_counts <- as.data.table(cbind(splice_sites, splice_site_counts))
+    
+    splice_site_lookup <- unique(rbind(
+      row_ranges[, .(spliceSiteID = startID, seqnames, position = start - 1)],  # donor positions
+      row_ranges[, .(spliceSiteID = endID, seqnames, position = end)]      # acceptor positions
+    )) 
+    
+    splice_site_lookup[, start := position]
+    splice_site_lookup[, end := start + 1]
+    splice_site_lookup[, width := 2]
+    splice_site_lookup[, position := NULL]
+    
+    splice_site_counts <- merge(splice_site_counts, splice_site_lookup, by = "spliceSiteID", all.x = TRUE)        
+    splice_site_counts[, seqnames := factor(as.character(seqnames), levels = chr_levels)]
+    setorder(splice_site_counts, seqnames, start)
+    
+    return (splice_site_counts)
+    
+  }
+  
+  
+  junction_counts_1 <- create_junction_counts(fds1)
+  junction_counts_2 <- create_junction_counts(fds2)
+  
+  
+  
+  junction_counts_merged <- merge(junction_counts_1, junction_counts_2, by=c("start", "end", "seqnames", "width", "strand"), all=T, )
+  junction_counts_merged[is.na(junction_counts_merged)] <- 0
+  
+  positions <- unique(rbind(
+    junction_counts_merged[, .(seqnames, pos = start)],
+    junction_counts_merged[, .(seqnames, pos = end)]
+  ))
+  setorder(positions, seqnames, pos)
+  positions[, spliceSiteID := .I]
+  
+  
+  junction_counts_merged <- merge(junction_counts_merged, positions, by.x = c("seqnames", "start"), by.y = c("seqnames", "pos"), all.x = TRUE)
+  setnames(junction_counts_merged, "spliceSiteID", "startID")
+  
+  junction_counts_merged <- merge(junction_counts_merged, positions, by.x = c("seqnames", "end"), by.y = c("seqnames", "pos"), all.x = TRUE)
+  setnames(junction_counts_merged, "spliceSiteID", "endID")
+  
+  junction_counts_merged[ ,c("startID.x","startID.y", "endID.x", "endID.y") := NULL]
+  
+  splice_site_counts_1 <- create_split_counts(fds1)
+  splice_site_counts_2 <- create_split_counts(fds2)
+  
+  splice_site_counts_merged <- merge(splice_site_counts_1, splice_site_counts_2, by=c("start", "end", "seqnames", "width", "type"), all=T, )
+  splice_site_counts_merged[ ,c("spliceSiteID.x","spliceSiteID.y") := NULL]
+  splice_site_counts_merged[is.na(splice_site_counts_merged)] <- 0
+  
+  
+  splice_site_counts_merged <- merge(splice_site_counts_merged, positions, by.x = c("seqnames", "start"), by.y = c("seqnames", "pos"), all.x = TRUE)
+  positions[, pos := pos - 1] # to merge Donnor values 
+  splice_site_counts_merged <- merge(splice_site_counts_merged, positions, by.x = c("seqnames", "start"), by.y = c("seqnames", "pos"), all.x = TRUE)
+  
+  splice_site_counts_merged[, spliceSiteID := fcoalesce(spliceSiteID.x, spliceSiteID.y)]
+  splice_site_counts_merged[ ,c("spliceSiteID.x","spliceSiteID.y") := NULL]
+  
+  
+  # Subset both colData
+  # --- Combine colData ---
+  # Get intersection of column names
+  shared_cols <- intersect(colnames(colData(fds1)), colnames(colData(fds2)))
+  
+  cd1 <- colData(fds1)[, shared_cols, drop = FALSE]
+  cd2 <- colData(fds2)[, shared_cols, drop = FALSE]
+  
+  merged_colData <- rbind(cd1, cd2)  
+  
+  # --- Create merged FraserDataSet ---
+  merged_fds <- FraserDataSet(
+    colData = as.data.table(merged_colData),
+    junctions = junction_counts_merged,
+    spliceSites = splice_site_counts_merged,
+    name = paste0(fds_name)
+  )
+  
+  merged_fds <- calculatePSIValues(merged_fds, types=fitMetrics(merged_fds))
+  
+  
+  return(merged_fds)
+}
\ No newline at end of file

From e8087d8feefbb4901697c698c2c274f0afee38d1 Mon Sep 17 00:00:00 2001
From: AtaJadidAhari <atajadidahari@gmail.com>
Date: Tue, 28 Oct 2025 00:13:54 +0100
Subject: [PATCH 2/5] add documents for mergeFds

---
 NAMESPACE       |  2 ++
 R/mergeFds.R    |  3 +--
 man/mergeFDS.Rd | 29 +++++++++++++++++++++++++++++
 3 files changed, 32 insertions(+), 2 deletions(-)
 create mode 100644 man/mergeFDS.Rd

diff --git a/NAMESPACE b/NAMESPACE
index 80dbf37..fe7debe 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -61,6 +61,7 @@ export(makeSimulatedFraserDataSet)
 export(mapSeqlevels)
 export(mergeCounts)
 export(mergeExternalData)
+export(mergeFDS)
 export(name)
 export(nonSplicedReads)
 export(pVals)
@@ -168,6 +169,7 @@ importFrom(GenomeInfoDb,keepStandardChromosomes)
 importFrom(GenomeInfoDb,seqinfo)
 importFrom(GenomeInfoDb,seqlengths)
 importFrom(GenomeInfoDb,seqlevels)
+importFrom(GenomeInfoDb,seqlevelsInUse)
 importFrom(GenomeInfoDb,seqlevelsStyle)
 importFrom(GenomeInfoDb,seqnames)
 importFrom(GenomeInfoDb,sortSeqlevels)
diff --git a/R/mergeFds.R b/R/mergeFds.R
index fa10ae7..e1e4761 100644
--- a/R/mergeFds.R
+++ b/R/mergeFds.R
@@ -13,7 +13,6 @@
 #' @examples
 #' # merged_fds <- mergeFDS(fds1, fds2, join_type="outer")
 #' @export
-#' @importFrom data.table as.data.table setorder setnames fcoalesce
 #' @importFrom GenomeInfoDb seqlevelsInUse
 #' @importFrom SummarizedExperiment rowRanges rowData assays
 mergeFDS <- function(fds1, fds2, join_type = c("outer", "inner"), fds_name="merged_fds") {
@@ -123,4 +122,4 @@ mergeFDS <- function(fds1, fds2, join_type = c("outer", "inner"), fds_name="merg
   
   
   return(merged_fds)
-}
\ No newline at end of file
+}
diff --git a/man/mergeFDS.Rd b/man/mergeFDS.Rd
new file mode 100644
index 0000000..9fc3998
--- /dev/null
+++ b/man/mergeFDS.Rd
@@ -0,0 +1,29 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/mergeFds.R
+\name{mergeFDS}
+\alias{mergeFDS}
+\title{Merge two FraserDataSet objects with different splice sites and junctions}
+\usage{
+mergeFDS(fds1, fds2, join_type = c("outer", "inner"), fds_name = "merged_fds")
+}
+\arguments{
+\item{fds1}{First FraserDataSet object.}
+
+\item{fds2}{Second FraserDataSet object.}
+
+\item{join_type}{Type of join for rowRanges: "outer" (default) or "inner".}
+
+\item{fds_name}{Name for the merged FraserDataSet.}
+}
+\value{
+A merged FraserDataSet object containing combined samples and assays.
+}
+\description{
+This function merges two FraserDataSet objects that have different
+splice sites and junctions but no overlapping samples. It combines
+only the rawCountsJ and rawCountsSS assays and recalculates startID
+and endID for the merged rowRanges.
+}
+\examples{
+# merged_fds <- mergeFDS(fds1, fds2, join_type="outer")
+}

From f0a3f1a36972d501355f2a103994314a39ad069a Mon Sep 17 00:00:00 2001
From: AtaJadidAhari <atajadidahari@gmail.com>
Date: Tue, 28 Oct 2025 16:33:08 +0100
Subject: [PATCH 3/5] add tests for mergeFDS

---
 tests/testthat/test_merge_fds.R | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)
 create mode 100644 tests/testthat/test_merge_fds.R

diff --git a/tests/testthat/test_merge_fds.R b/tests/testthat/test_merge_fds.R
new file mode 100644
index 0000000..5ce0649
--- /dev/null
+++ b/tests/testthat/test_merge_fds.R
@@ -0,0 +1,19 @@
+context("Test merge fds function")
+
+test_that("mergeFDS", {
+  fds <- createTestFraserDataSet()
+  
+  fds_1 <- fds[, 1]
+  fds_2 <- fds[, c(2,3)]
+  
+  merged_fds <- mergeFDS(fds_1, fds_2)
+  
+  # Check number of samples
+  expect_equal(ncol(merged_fds), ncol(fds))
+  
+  # Check number of junctions (features)
+  expect_equal(nrow(merged_fds), nrow(fds))
+  
+  # Check number of splice sites
+  expect_equal(nrow(assay(merged_fds, "rawCountsSS")), nrow(assay(fds, "rawCountsSS")))  
+})
\ No newline at end of file

From 77a2305571732875aa7d0930aa117c84c53b4147 Mon Sep 17 00:00:00 2001
From: AtaJadidAhari <atajadidahari@gmail.com>
Date: Wed, 29 Oct 2025 08:41:13 +0100
Subject: [PATCH 4/5] bump version

---
 DESCRIPTION | 4 ++--
 NEWS        | 4 ++++
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index e6741c5..7c22620 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,8 +1,8 @@
 Package: FRASER
 Type: Package
 Title: Find RAre Splicing Events in RNA-Seq Data
-Version: 2.5.6
-Date: 2025-10-24
+Version: 2.5.7
+Date: 2025-10-29
 Authors@R: c(
     person("Christian", "Mertes", role=c("aut", "cre"), 
             email="mertes@in.tum.de", comment=c(ORCID="0000-0002-1091-205X")),
diff --git a/NEWS b/NEWS
index 8ee7ded..de7c940 100644
--- a/NEWS
+++ b/NEWS
@@ -1,3 +1,7 @@
+CHANGES IN VERSION 2.4.7
+-------------------------
+    o Added mergeFDS function, which merges two fds FraserDataSet objects.
+
 CHANGES IN VERSION 2.4.6
 -------------------------
     o Fixing a bug in the annotation, where we miss the annotation for junction, 

From bbd3c125481c06b3c0bed00943e84c6ae189f410 Mon Sep 17 00:00:00 2001
From: AtaJadidAhari <atajadidahari@gmail.com>
Date: Wed, 29 Oct 2025 12:51:03 +0100
Subject: [PATCH 5/5] fix reviews

---
 R/mergeFds.R    | 45 ++++++++++++++++++++++++++++++++-------------
 man/mergeFDS.Rd | 11 ++++++++++-
 2 files changed, 42 insertions(+), 14 deletions(-)

diff --git a/R/mergeFds.R b/R/mergeFds.R
index e1e4761..00d20e5 100644
--- a/R/mergeFds.R
+++ b/R/mergeFds.R
@@ -9,14 +9,18 @@
 #' @param fds2 Second FraserDataSet object.
 #' @param join_type Type of join for rowRanges: "outer" (default) or "inner".
 #' @param fds_name Name for the merged FraserDataSet.
+#' @param workingDir Directory where to store HDF5 and RDS files. Defaults to
+#'                \code{FRASER_output} in the current working directory.
 #' @return A merged FraserDataSet object containing combined samples and assays.
 #' @examples
 #' # merged_fds <- mergeFDS(fds1, fds2, join_type="outer")
 #' @export
 #' @importFrom GenomeInfoDb seqlevelsInUse
 #' @importFrom SummarizedExperiment rowRanges rowData assays
-mergeFDS <- function(fds1, fds2, join_type = c("outer", "inner"), fds_name="merged_fds") {
+mergeFDS <- function(fds1, fds2, join_type = c("outer", "inner"), fds_name="merged_fds", workingDir="FRASER_output") {
   join_type <- match.arg(join_type)
+  # create a True/False flag for join types
+  all_flag <- if (join_type == "outer") TRUE else FALSE
   
   create_junction_counts <- function(fds){
     chr_levels <- seqlevelsInUse(fds)
@@ -27,7 +31,7 @@ mergeFDS <- function(fds1, fds2, join_type = c("outer", "inner"), fds_name="merg
     junction_counts <- cbind(junction_counts, row_ranges[, c("seqnames", "start", "end", "width", "strand", "startID", "endID")])
     # Enforce order
     junction_counts[, seqnames := factor(as.character(seqnames), levels = chr_levels)]
-    setorder(junction_counts, seqnames, start)
+    setorder(junction_counts, seqnames, start, end)
     return (junction_counts)
   }
   
@@ -59,15 +63,18 @@ mergeFDS <- function(fds1, fds2, join_type = c("outer", "inner"), fds_name="merg
     
   }
   
-  
   junction_counts_1 <- create_junction_counts(fds1)
   junction_counts_2 <- create_junction_counts(fds2)
   
+  # Join the junction counts based on join_type
+  junction_counts_merged <- merge(junction_counts_1, junction_counts_2, by=c("start", "end", "seqnames", "width", "strand"), all=all_flag)
   
-  
-  junction_counts_merged <- merge(junction_counts_1, junction_counts_2, by=c("start", "end", "seqnames", "width", "strand"), all=T, )
-  junction_counts_merged[is.na(junction_counts_merged)] <- 0
-  
+  # If outer join, missing numeric counts should become 0.
+  # For inner join we do NOT coerce NAs to 0 (there shouldn't be rows
+  # coming from only one side when all=FALSE), so skip replacement.
+  if (all_flag) {
+    junction_counts_merged[is.na(junction_counts_merged)] <- 0
+  }
   positions <- unique(rbind(
     junction_counts_merged[, .(seqnames, pos = start)],
     junction_counts_merged[, .(seqnames, pos = end)]
@@ -75,7 +82,6 @@ mergeFDS <- function(fds1, fds2, join_type = c("outer", "inner"), fds_name="merg
   setorder(positions, seqnames, pos)
   positions[, spliceSiteID := .I]
   
-  
   junction_counts_merged <- merge(junction_counts_merged, positions, by.x = c("seqnames", "start"), by.y = c("seqnames", "pos"), all.x = TRUE)
   setnames(junction_counts_merged, "spliceSiteID", "startID")
   
@@ -87,18 +93,31 @@ mergeFDS <- function(fds1, fds2, join_type = c("outer", "inner"), fds_name="merg
   splice_site_counts_1 <- create_split_counts(fds1)
   splice_site_counts_2 <- create_split_counts(fds2)
   
-  splice_site_counts_merged <- merge(splice_site_counts_1, splice_site_counts_2, by=c("start", "end", "seqnames", "width", "type"), all=T, )
+  splice_site_counts_merged <- merge(splice_site_counts_1, splice_site_counts_2, by=c("start", "end", "seqnames", "width", "type"), all=all_flag)
   splice_site_counts_merged[ ,c("spliceSiteID.x","spliceSiteID.y") := NULL]
-  splice_site_counts_merged[is.na(splice_site_counts_merged)] <- 0
   
+  # When outer, fill NA -> 0 (samples absent in one side)
+  if (all_flag) {
+      splice_site_counts_merged[is.na(splice_site_counts_merged)] <- 0
+  }
   
   splice_site_counts_merged <- merge(splice_site_counts_merged, positions, by.x = c("seqnames", "start"), by.y = c("seqnames", "pos"), all.x = TRUE)
-  positions[, pos := pos - 1] # to merge Donnor values 
+  positions[, pos := pos - 1] # to merge donor values
   splice_site_counts_merged <- merge(splice_site_counts_merged, positions, by.x = c("seqnames", "start"), by.y = c("seqnames", "pos"), all.x = TRUE)
   
+  # Combine spliceSiteID.x and spliceSiteID.y into a single column.
+  # For each row, use spliceSiteID.x if it’s not NA; otherwise take spliceSiteID.y.
+  # This keeps whichever ID exists after merging.
   splice_site_counts_merged[, spliceSiteID := fcoalesce(spliceSiteID.x, spliceSiteID.y)]
+
+  # Set the rest to NULL
   splice_site_counts_merged[ ,c("spliceSiteID.x","spliceSiteID.y") := NULL]
   
+  # when using inner join, ensure we only keep splice sites that are in positions
+  if (!all_flag) {
+    splice_site_counts_merged <- splice_site_counts_merged[spliceSiteID %in% positions$spliceSiteID]
+    junction_counts_merged <- junction_counts_merged[startID %in% positions$spliceSiteID & endID %in% positions$spliceSiteID]
+  }
   
   # Subset both colData
   # --- Combine colData ---
@@ -115,11 +134,11 @@ mergeFDS <- function(fds1, fds2, join_type = c("outer", "inner"), fds_name="merg
     colData = as.data.table(merged_colData),
     junctions = junction_counts_merged,
     spliceSites = splice_site_counts_merged,
-    name = paste0(fds_name)
+    name = fds_name,
+    workingDir = workingDir
   )
   
   merged_fds <- calculatePSIValues(merged_fds, types=fitMetrics(merged_fds))
   
-  
   return(merged_fds)
 }
diff --git a/man/mergeFDS.Rd b/man/mergeFDS.Rd
index 9fc3998..bb7cd21 100644
--- a/man/mergeFDS.Rd
+++ b/man/mergeFDS.Rd
@@ -4,7 +4,13 @@
 \alias{mergeFDS}
 \title{Merge two FraserDataSet objects with different splice sites and junctions}
 \usage{
-mergeFDS(fds1, fds2, join_type = c("outer", "inner"), fds_name = "merged_fds")
+mergeFDS(
+  fds1,
+  fds2,
+  join_type = c("outer", "inner"),
+  fds_name = "merged_fds",
+  workingDir = "FRASER_output"
+)
 }
 \arguments{
 \item{fds1}{First FraserDataSet object.}
@@ -14,6 +20,9 @@ mergeFDS(fds1, fds2, join_type = c("outer", "inner"), fds_name = "merged_fds")
 \item{join_type}{Type of join for rowRanges: "outer" (default) or "inner".}
 
 \item{fds_name}{Name for the merged FraserDataSet.}
+
+\item{workingDir}{Directory where to store HDF5 and RDS files. Defaults to
+\code{FRASER_output} in the current working directory.}
 }
 \value{
 A merged FraserDataSet object containing combined samples and assays.