From b0874d148316b5163f002ad4267263c2e37c8a1d Mon Sep 17 00:00:00 2001
From: Eric Talevich <52723+etal@users.noreply.github.com>
Date: Sat, 7 Feb 2026 13:06:10 -0800
Subject: [PATCH 1/3] skgenome: Replace custom interval arithmetic with
 bioframe (closes #982)

Use bioframe.subtract, bioframe.cluster, and bioframe.overlap to replace
custom Python implementations of genomic interval merging, subtraction,
and intersection. This reduces maintenance surface (~155 lines net) and
improves performance for WGS cases with many bins.

- subtract.py: replace _subtraction() generator with bioframe.subtract()
- merge.py: replace _nonoverlapping_groups + _squash_tuples with
  bioframe.cluster() for both merge() and flatten()
- gary.py: replace intersection() non-trim path with bioframe.overlap()
- intersect.py: delete unimplemented venn() stub
- Add bioframe>=0.7.2 as a core dependency

Public GenomicArray API (method names and signatures) is unchanged.
---
 environment-dev.yml   |   1 +
 requirements/core.txt |   1 +
 requirements/min.txt  |   1 +
 skgenome/gary.py      |  31 ++++-
 skgenome/intersect.py |  12 ---
 skgenome/merge.py     | 246 ++++++++++++------------------------
 skgenome/subtract.py  |  73 ++-----------
 7 files changed, 105 insertions(+), 260 deletions(-)

diff --git a/environment-dev.yml b/environment-dev.yml
index 7893bdf6..567bc4fb 100644
--- a/environment-dev.yml
+++ b/environment-dev.yml
@@ -5,6 +5,7 @@ channels:
 dependencies:
   - python=3.11
   # Core dependencies (from requirements/core.txt)
+  - bioframe>=0.7.2
   - biopython>=1.85
   - matplotlib>=3.9.0
   - numpy>=2.2.3
diff --git a/requirements/core.txt b/requirements/core.txt
index 235994aa..b39e641f 100644
--- a/requirements/core.txt
+++ b/requirements/core.txt
@@ -1,3 +1,4 @@
+bioframe >= 0.7.2
 biopython >= 1.85
 matplotlib >= 3.9.0
 numpy >= 2.2.3
diff --git a/requirements/min.txt b/requirements/min.txt
index b0637683..cbe4f8a2 100644
--- a/requirements/min.txt
+++ b/requirements/min.txt
@@ -1,3 +1,4 @@
+bioframe == 0.7.2
 biopython == 1.85
 matplotlib == 3.9.0
 numpy == 2.2.3
diff --git a/skgenome/gary.py b/skgenome/gary.py
index 7a17d7aa..be503008 100644
--- a/skgenome/gary.py
+++ b/skgenome/gary.py
@@ -6,11 +6,14 @@
 from collections import OrderedDict
 from typing import Any, TypeVar, TYPE_CHECKING

+import bioframe
 import numpy as np
 import pandas as pd

 from .chromsort import sorter_chrom
 from .intersect import by_ranges, into_ranges, iter_ranges, iter_slices, Numeric
+
+_BF_COLS = ("chromosome", "start", "end")
 from .merge import flatten, merge
 from .rangelabel import to_label
 from .subtract import subtract
@@ -703,19 +706,33 @@ def intersection(self, other: GenomicArray, mode: str = "outer") -> GenomicArray

         The extra fields of `self`, but not `other`, are retained in the output.
""" - # TODO options for which extra fields to keep - # by default, keep just the fields in 'table' if mode == "trim": - # Slower chunks = [ chunk.data for _, chunk in self.by_ranges(other, mode=mode, keep_empty=False) ] return self.as_dataframe(pd.concat(chunks)) - # Faster - slices = iter_slices(self.data, other.data, mode, False) - indices = np.concatenate(list(slices)) - return self.as_dataframe(self.data.loc[indices]) + if not len(self) or not len(other): + return self.as_dataframe(self.data.iloc[:0]) + pairs = bioframe.overlap( + self.data, + other.data, + how="inner", + cols1=_BF_COLS, + cols2=_BF_COLS, + return_index=True, + return_input=True, + ) + if mode == "inner": + # Keep only self bins fully contained within an other bin + pairs = pairs[ + (pairs["start"] >= pairs["start_"]) & (pairs["end"] <= pairs["end_"]) + ] + # Sort by other-index then self-index to match by_ranges ordering + pairs = pairs.sort_values(["index_", "index"]) + # Return self rows (with duplicates, one per overlap pair) + self_indices = pairs["index"].to_numpy() + return self.as_dataframe(self.data.loc[self_indices]) def merge( self, diff --git a/skgenome/intersect.py b/skgenome/intersect.py index a3d4b714..4f105ac2 100644 --- a/skgenome/intersect.py +++ b/skgenome/intersect.py @@ -241,15 +241,3 @@ def _irange_nested( region_mask[int(end_idx) :] = 0 yield region_mask, start_val, end_val - - -def venn(table, other, mode: str): - # TODO -- implement 'venn' via fjoin algorithm - # 'cut' table at all 'other' boundaries - # -> extra column '_venn_':int (0, 1, 2) - # 0=self only, 1=both, 2=other only - # -> 'cut' just drops the '_venn_' column - # -> 'subtract' drops 1 and 2? - # (is that faster? probably not) - # -> 'jaccard' does math with it... - return table diff --git a/skgenome/merge.py b/skgenome/merge.py index 3f1c300d..33fda176 100644 --- a/skgenome/merge.py +++ b/skgenome/merge.py @@ -12,22 +12,23 @@ import itertools from typing import TYPE_CHECKING -import numpy as np +import bioframe import pandas as pd from .chromsort import sorter_chrom -from .combiners import get_combiners, first_of +from .combiners import get_combiners if TYPE_CHECKING: - from collections.abc import Callable - from collections.abc import Iterable + from collections.abc import Callable, Iterable + +_BF_COLS = ("chromosome", "start", "end") def flatten( - table, + table: pd.DataFrame, combine: dict[str, Callable] | None = None, split_columns: Iterable[str] | None = None, -): +) -> pd.DataFrame: """Combine overlapping regions into single rows, similar to bedtools merge.""" if table.empty: return table @@ -36,122 +37,53 @@ def flatten( # NB: Input rows and columns should already be sorted like this table = table.sort_values(["chromosome", "start", "end"]) cmb = get_combiners(table, False, combine) - out = ( - table.groupby(by="chromosome", as_index=False, group_keys=False, sort=False)[ - table.columns - ] - .apply(_flatten_overlapping, cmb, split_columns) - .reset_index(drop=True) + clustered = bioframe.cluster( + table, min_dist=0, cols=_BF_COLS, return_cluster_ids=True ) + all_rows: list = [] + for _cluster_id, group in clustered.groupby("cluster", sort=False): + group_rows = group.drop( + columns=["cluster", "cluster_start", "cluster_end"], errors="ignore" + ) + all_rows.extend(_flatten_tuples(group_rows, cmb)) + out = pd.DataFrame.from_records(all_rows, columns=table.columns) return out.reindex( out.chromosome.apply(sorter_chrom).sort_values(kind="mergesort").index ) -def _flatten_overlapping( - table, combine: dict[str, Callable], 
-):
-    """Merge overlapping regions within a chromosome/strand.
-
-    Assume chromosome and (if relevant) strand are already identical, so only
-    start and end coordinates are considered.
-    """
-    if split_columns:
-        row_groups = (
-            tuple(_flatten_tuples_split(row_group, combine, split_columns))
-            for row_group in _nonoverlapping_groups(table, 0)
-        )
-    else:
-        row_groups = (
-            tuple(_flatten_tuples(row_group, combine))
-            for row_group in _nonoverlapping_groups(table, 0)
-        )
-    all_row_groups = itertools.chain(*row_groups)
-    return pd.DataFrame.from_records(list(all_row_groups), columns=table.columns)
-
-
-def _flatten_tuples(keyed_rows: Iterable, combine: dict[str, Callable]):
+def _flatten_tuples(
+    group_df: pd.DataFrame, combine: dict[str, Callable]
+) -> list[tuple]:
     """Divide multiple rows where they overlap.

-    Parameters
-    ----------
-    keyed_rows : iterable
-        pairs of (non-overlapping-group index, overlapping rows)
-    combine : dict
-        Mapping of field names to functions applied to combine overlapping
-        regions.
-
-    Returns
-    -------
-    DataFrame
+    Split at all coordinate breakpoints and combine extra fields.
     """
-    rows = [kr[1] for kr in keyed_rows]
-    first_row = rows[0]
+    rows = list(group_df.itertuples(index=False))
     if len(rows) == 1:
-        yield first_row
-    else:
-        # TODO speed this up! Bottleneck is in dictcomp
-        extra_cols = [x for x in first_row._fields[3:] if x in combine]
-        breaks = sorted(set(itertools.chain(*[(r.start, r.end) for r in rows])))
-        for bp_start, bp_end in itertools.pairwise(breaks):
-            # Find the row(s) overlapping this region
-            # i.e. any not already seen and not already passed
-            rows_in_play = [
-                row for row in rows if row.start <= bp_start and row.end >= bp_end
-            ]
-            # Combine the extra fields of the overlapping regions
-            extra_fields = {
-                key: combine[key]([getattr(r, key) for r in rows_in_play])
-                for key in extra_cols
-            }
-            yield first_row._replace(start=bp_start, end=bp_end, **extra_fields)
-
-
-def _flatten_tuples_split(keyed_rows, combine: dict, split_columns: Iterable | None):
-    """Divide multiple rows where they overlap.
-
-    Parameters
-    ----------
-    keyed_rows : iterable
-        pairs of (non-overlapping-group index, overlapping rows)
-    combine : dict
-        Mapping of field names to functions applied to combine overlapping
-        regions.
-    split_columns : list or tuple
-        Field names where numeric values should be subdivided a region.
-
-    Returns
-    -------
-    DataFrame
-    """
-    rows = [kr[1] for kr in keyed_rows]
+        return rows
     first_row = rows[0]
-    if len(rows) == 1:
-        yield first_row
-    else:
-        # TODO - use split_columns
-        extra_cols = [x for x in first_row._fields[3:] if x in combine]
-        breaks = sorted(set(itertools.chain(*[(r.start, r.end) for r in rows])))
-        for bp_start, bp_end in itertools.pairwise(breaks):
-            # Find the row(s) overlapping this region
-            # i.e. any not already seen and not already passed
-            rows_in_play = [
-                row for row in rows if row.start <= bp_start and row.end >= bp_end
-            ]
-            # Combine the extra fields of the overlapping regions
-            extra_fields = {
-                key: combine[key]([getattr(r, key) for r in rows_in_play])
-                for key in extra_cols
-            }
-            yield first_row._replace(start=bp_start, end=bp_end, **extra_fields)
+    extra_cols = [x for x in first_row._fields[3:] if x in combine]
+    breaks = sorted(set(itertools.chain(*[(r.start, r.end) for r in rows])))
+    result = []
+    for bp_start, bp_end in itertools.pairwise(breaks):
+        rows_in_play = [
+            row for row in rows if row.start <= bp_start and row.end >= bp_end
+        ]
+        extra_fields = {
+            key: combine[key]([getattr(r, key) for r in rows_in_play])
+            for key in extra_cols
+        }
+        result.append(first_row._replace(start=bp_start, end=bp_end, **extra_fields))
+    return result


 def merge(
-    table,
+    table: pd.DataFrame,
     bp: int = 0,
     stranded: bool = False,
     combine: dict[str, Callable] | None = None,
-):
+) -> pd.DataFrame:
     """Merge overlapping rows in a DataFrame."""
     if table.empty:
         return table
@@ -161,84 +93,42 @@ def merge(
     if stranded:
         groupkey = ["chromosome", "strand"]
     else:
-        # NB: same gene name can appear on alt. contigs
        groupkey = ["chromosome"]
     table = table.sort_values([*groupkey, "start", "end"])
     cmb = get_combiners(table, stranded, combine)
-    out = (
-        table.groupby(by=groupkey, as_index=False, group_keys=False, sort=False)[
-            table.columns
-        ]
-        .apply(_merge_overlapping, bp, cmb)
-        .reset_index(drop=True)
+    min_dist = max(0, -bp)
+    on = ["strand"] if stranded else None
+    clustered = bioframe.cluster(
+        table,
+        min_dist=min_dist,
+        cols=_BF_COLS,
+        on=on,
+        return_cluster_ids=True,
+        return_cluster_intervals=True,
     )
+    data_cols = [
+        c
+        for c in clustered.columns
+        if c not in ("cluster", "cluster_start", "cluster_end")
+    ]
+    result_rows = []
+    for _cluster_id, group in clustered.groupby("cluster", sort=False):
+        if len(group) == 1:
+            result_rows.append(group[data_cols].iloc[0])
+        else:
+            vals = {}
+            for col in data_cols:
+                if col == "start":
+                    vals[col] = group["cluster_start"].iloc[0]
+                elif col == "end":
+                    vals[col] = group["cluster_end"].iloc[0]
+                elif col in cmb:
+                    vals[col] = cmb[col](group[col].values)
+                else:
+                    vals[col] = group[col].iloc[0]
+            result_rows.append(pd.Series(vals))
+    out = pd.DataFrame(result_rows, columns=data_cols).reset_index(drop=True)
     # Re-sort chromosomes cleverly instead of lexicographically
     return out.reindex(
         out.chromosome.apply(sorter_chrom).sort_values(kind="mergesort").index
     )
-
-
-def _merge_overlapping(table, bp: int, combine: dict[str, Callable]):
-    """Merge overlapping regions within a chromosome/strand.
-
-    Assume chromosome and (if relevant) strand are already identical, so only
-    start and end coordinates are considered.
-    """
-    merged_rows = [
-        _squash_tuples(row_group, combine)
-        for row_group in _nonoverlapping_groups(table, bp)
-    ]
-    return pd.DataFrame.from_records(merged_rows, columns=merged_rows[0]._fields)
-
-
-def _nonoverlapping_groups(table, bp: int):
-    """Identify and enumerate groups of overlapping rows.
-
-    That is, increment the group ID after each non-negative gap between
-    intervals. Intervals (rows) will be merged if any bases overlap by at least
-    `bp`.
-    """
-    # Examples:
-    #
-    #  gap?   F  T  F  F  T  T  T  F
-    #  group  0  1  1  2  3  3  3  3  4
-    #
-    #  gap?   T  F  T  T  F  F  F  T
-    #  group  0  0  1  1  1  2  3  4  4
-    gap_sizes = table.start.to_numpy()[1:] - table.end.cummax().to_numpy()[:-1]
-    group_keys = np.r_[False, gap_sizes > (-bp)].cumsum()
-    # NB: pandas groupby seems like the obvious choice over itertools, but it is
-    # very slow -- probably because it traverses the whole table (i.e.
-    # chromosome) again to select groups, redoing the various inferences that
-    # would be worthwhile outside this inner loop. With itertools, we take
-    # advantage of the grouping and sorting already done, and don't repeat
-    # pandas' traversal and inferences.
-    # ENH: Find & use a lower-level, 1-pass pandas function
-    keyed_groups = zip(group_keys, table.itertuples(index=False), strict=True)
-    return (row_group for _key, row_group in itertools.groupby(keyed_groups, first_of))
-
-
-# Squash rows according to a given grouping criterion
-# XXX see also segfilter.py
-def _squash_tuples(keyed_rows, combine: dict[str, Callable]):
-    """Combine multiple rows into one NamedTuple.
-
-    Parameters
-    ----------
-    keyed_rows : iterable
-        pairs of (non-overlapping-group index, overlapping rows)
-    combine : dict
-
-    Returns
-    -------
-    namedtuple
-    """
-    rows = [kr[1] for kr in keyed_rows]  # list(rows)
-    firsttup = rows[0]
-    if len(rows) == 1:
-        return firsttup
-    newfields = {
-        key: combiner(pd.Series([getattr(r, key) for r in rows]))
-        for key, combiner in combine.items()
-    }
-    return firsttup._replace(**newfields)
diff --git a/skgenome/subtract.py b/skgenome/subtract.py
index 7e621d89..2a855832 100644
--- a/skgenome/subtract.py
+++ b/skgenome/subtract.py
@@ -7,73 +7,20 @@
 """

-import logging
+from __future__ import annotations

-import numpy as np
-import pandas as pd
+from typing import TYPE_CHECKING

-from .intersect import by_ranges
+import bioframe

+if TYPE_CHECKING:
+    import pandas as pd

-def subtract(table, other) -> pd.DataFrame:
+_BF_COLS = ("chromosome", "start", "end")
+
+
+def subtract(table: pd.DataFrame, other: pd.DataFrame) -> pd.DataFrame:
     """Subtract one set of regions from another, returning the one-way difference."""
     if not len(other):
         return table
-    return pd.DataFrame.from_records(_subtraction(table, other), columns=table.columns)
-
-
-def _subtraction(table, other):
-    for keeper, rows_to_exclude in by_ranges(other, table, "outer", True):
-        if len(rows_to_exclude):
-            logging.debug(
-                "  %s:%d-%d : Subtracting %d excluded regions",
-                keeper.chromosome,
-                keeper.start,
-                keeper.end,
-                len(rows_to_exclude),
-            )
-
-            keep_left = keeper.start < rows_to_exclude.start.iat[0]
-            keep_right = keeper.end > rows_to_exclude.end.iat[-1]
-            if keep_left and keep_right:
-                # Keep both original edges of the source region
-                # =========
-                #   --  --
-                starts = np.r_[keeper.start, rows_to_exclude.end.to_numpy()]
-                ends = np.r_[rows_to_exclude.start.to_numpy(), keeper.end]
-            elif keep_left:
-                # Exclusion overlaps only the right side
-                # =======
-                #    --  ---
-                starts = np.r_[keeper.start, rows_to_exclude.end.to_numpy()[:-1]]
-                ends = rows_to_exclude.start.to_numpy()
-            elif keep_right:
-                # Exclusion overlaps only the left side
-                #   ========
-                # ---   --
-                starts = rows_to_exclude.end.to_numpy()
-                ends = np.r_[rows_to_exclude.start.to_numpy()[1:], keeper.end]
-            elif len(rows_to_exclude) > 1:
-                # Exclusions overlap both edges
-                #    ======
-                # --   --   ---
-                starts = rows_to_exclude.end.to_numpy()[:-1]
-                ends = rows_to_exclude.start.to_numpy()[1:]
-            else:
-                # Exclusion covers the whole region
-                continue
-
-            for start, end in zip(starts, ends, strict=True):
-                if end > start:
-                    yield keeper._replace(start=start, end=end)
-                else:
-                    logging.debug("Discarding pair: (%d, %d)", start, end)
-
-        else:
-            logging.debug(
-                "  %s:%d-%d : No excluded regions",
-                keeper.chromosome,
-                keeper.start,
-                keeper.end,
-            )
-            yield keeper
+    return bioframe.subtract(table, other, cols1=_BF_COLS, cols2=_BF_COLS)

From a97718bee201074927ee43da2da886feb9d79a1e Mon Sep 17 00:00:00 2001
From: Eric Talevich <52723+etal@users.noreply.github.com>
Date: Sat, 7 Feb 2026 14:47:23 -0800
Subject: [PATCH 2/3] skgenome.gary: Implement .cut() and .squash() (#226,
 #227)

- cut(other): splits intervals at the boundary coordinates of another
  array. Each piece inherits all data from its source row.
- squash(by, combine): combines consecutive adjacent rows, optionally
  only when they share the same value in a given column (e.g. gene).

Both follow the established pattern of DataFrame-level functions
(cut.py, merge.py) wrapped by GenomicArray methods in gary.py.
---
 skgenome/cut.py     |  72 +++++++++++++++++++++++
 skgenome/gary.py    |  30 +++++++---
 skgenome/merge.py   |  71 +++++++++++++++++++++-
 test/test_genome.py | 140 ++++++++++++++++++++++++++++++++++++++++++--
 4 files changed, 299 insertions(+), 14 deletions(-)
 create mode 100644 skgenome/cut.py

diff --git a/skgenome/cut.py b/skgenome/cut.py
new file mode 100644
index 00000000..b7c23aa0
--- /dev/null
+++ b/skgenome/cut.py
@@ -0,0 +1,72 @@
+"""DataFrame-level cutting operations.
+
+Split genomic regions at the boundary coordinates defined by another set
+of regions, in the spirit of ``bedops --partition``.
+
+The functions here operate on pandas DataFrame and Series instances, not
+GenomicArray types.
+
+"""
+
+from __future__ import annotations
+
+import itertools
+import numpy as np
+import pandas as pd
+
+
+def cut(table: pd.DataFrame, other: pd.DataFrame) -> pd.DataFrame:
+    """Split intervals in ``table`` at the boundary coordinates of ``other``.
+
+    Each interval in ``table`` is split wherever a start or end coordinate
+    from ``other`` falls strictly within it. Every resulting piece inherits
+    all data columns from its source row.
+
+    Parameters
+    ----------
+    table : DataFrame
+        Genomic intervals with at least chromosome, start, end columns.
+    other : DataFrame
+        Intervals whose start/end positions define the cut points.
+
+    Returns
+    -------
+    DataFrame
+        A copy of ``table`` with rows split at the breakpoints from ``other``.
+ """ + if table.empty or other.empty: + return table + + # Collect breakpoints per chromosome from other + other_bp: dict[str, np.ndarray] = {} + for chrom, grp in other.groupby("chromosome", sort=False): + other_bp[chrom] = np.unique( + np.concatenate([grp["start"].to_numpy(), grp["end"].to_numpy()]) + ) + + result_chunks: list[pd.DataFrame] = [] + for chrom, chrom_table in table.groupby("chromosome", sort=False): + if chrom not in other_bp: + result_chunks.append(chrom_table) + continue + + breakpoints = other_bp[chrom] + cut_rows: list[tuple] = [] + for row in chrom_table.itertuples(index=False): + # Breakpoints strictly inside (row.start, row.end) + inner = breakpoints[(breakpoints > row.start) & (breakpoints < row.end)] + if len(inner) == 0: + cut_rows.append(row) + else: + bounds = np.concatenate([[row.start], inner, [row.end]]) + for seg_start, seg_end in itertools.pairwise(bounds): + cut_rows.append( + row._replace(start=int(seg_start), end=int(seg_end)) + ) + result_chunks.append( + pd.DataFrame.from_records(cut_rows, columns=chrom_table.columns) + ) + + if not result_chunks: + return table.iloc[:0] + return pd.concat(result_chunks, ignore_index=True) diff --git a/skgenome/gary.py b/skgenome/gary.py index be503008..ac280c56 100644 --- a/skgenome/gary.py +++ b/skgenome/gary.py @@ -11,10 +11,11 @@ import pandas as pd from .chromsort import sorter_chrom +from .cut import cut from .intersect import by_ranges, into_ranges, iter_ranges, iter_slices, Numeric _BF_COLS = ("chromosome", "start", "end") -from .merge import flatten, merge +from .merge import flatten, merge, squash from .rangelabel import to_label from .subtract import subtract from .subdivide import subdivide @@ -686,10 +687,9 @@ def sort_columns(self) -> None: # Genome arithmetic - def cut(self, other, combine=None): - """Split this array's regions at the boundaries in `other`.""" - # TODO - return NotImplemented + def cut(self: _GA, other: GenomicArray) -> _GA: + """Split this array's regions at the boundaries in ``other``.""" + return self.as_dataframe(cut(self.data, other.data)) def flatten( self, @@ -782,10 +782,22 @@ def resize_ranges( # Don't modify the original return self.as_dataframe(table.copy()) - def squash(self, combine=None): - """Combine some groups of rows, by some criteria, into single rows.""" - # TODO - return NotImplemented + def squash( + self: _GA, + by: str | None = None, + combine: dict[str, Callable] | None = None, + ) -> _GA: + """Combine consecutive adjacent rows into single rows. + + Parameters + ---------- + by : str or None + If given, only combine consecutive rows with the same value + in this column (e.g. ``"gene"``). + combine : dict or None + Column-to-function mappings for aggregation. 
+ """ + return self.as_dataframe(squash(self.data, by=by, combine=combine)) def subdivide( self, avg_size: int, min_size: int = 0, verbose: bool = False diff --git a/skgenome/merge.py b/skgenome/merge.py index 33fda176..d4ec351f 100644 --- a/skgenome/merge.py +++ b/skgenome/merge.py @@ -13,10 +13,11 @@ from typing import TYPE_CHECKING import bioframe +import numpy as np import pandas as pd from .chromsort import sorter_chrom -from .combiners import get_combiners +from .combiners import first_of, get_combiners if TYPE_CHECKING: from collections.abc import Callable, Iterable @@ -132,3 +133,71 @@ def merge( return out.reindex( out.chromosome.apply(sorter_chrom).sort_values(kind="mergesort").index ) + + +def squash( + table: pd.DataFrame, + by: str | None = None, + combine: dict[str, Callable] | None = None, +) -> pd.DataFrame: + """Combine consecutive adjacent rows into single rows. + + Parameters + ---------- + table : DataFrame + Genomic intervals with at least chromosome, start, end columns. + Must be sorted by chromosome, start, end. + by : str or None + If given, only combine consecutive rows that have the same value + in this column (e.g. ``"gene"``). If None, combine all adjacent + rows on the same chromosome. + combine : dict or None + Column-to-function mappings for aggregation. See + :func:`get_combiners`. + + Returns + ------- + DataFrame + A new DataFrame with consecutive adjacent rows combined. + """ + if table.empty or len(table) == 1: + return table + + cmb = get_combiners(table, stranded=False, combine=combine) + if by is not None: + # Values are identical within each group, so just take the first + cmb[by] = first_of + + # Vectorised adjacency detection + chroms = table["chromosome"].to_numpy() + starts = table["start"].to_numpy() + ends = table["end"].to_numpy() + is_adjacent = (chroms[:-1] == chroms[1:]) & (ends[:-1] == starts[1:]) + if by is not None: + by_vals = table[by].to_numpy() + is_adjacent &= by_vals[:-1] == by_vals[1:] + + # Nothing to squash -- short-circuit + if not is_adjacent.any(): + return table + + # Assign group IDs: increment when rows are *not* adjacent + group_ids = np.concatenate([[0], np.cumsum(~is_adjacent)]) + + result_rows: list[pd.Series] = [] + for _gid, group in table.groupby(group_ids, sort=False): + if len(group) == 1: + result_rows.append(group.iloc[0]) + else: + vals: dict = {} + for col in table.columns: + if col == "start": + vals[col] = group["start"].iloc[0] + elif col == "end": + vals[col] = group["end"].iloc[-1] + elif col in cmb: + vals[col] = cmb[col](group[col].values) + else: + vals[col] = group[col].iloc[0] + result_rows.append(pd.Series(vals)) + return pd.DataFrame(result_rows, columns=table.columns).reset_index(drop=True) diff --git a/test/test_genome.py b/test/test_genome.py index 8ccf86e8..7ac2ae96 100755 --- a/test/test_genome.py +++ b/test/test_genome.py @@ -1,5 +1,6 @@ #!/usr/bin/env python """Unit tests for the 'genome' sub-package.""" + import functools import operator import random @@ -58,12 +59,18 @@ def test_autosomes(self): autoy = self.ex_cnr.autosomes(also=["chrY"]) self.assertEqual(len(autoy), len_all - len_x) autoxy = self.ex_cnr.autosomes(also=["chrX", "chrY"]) - self.assertEqual(len(autoxy), len_all, "It's possible to provide chromosome names.") - some_x = (self.ex_cnr.chromosome == 'chrX') & (self.ex_cnr.end <= 434918) + self.assertEqual( + len(autoxy), len_all, "It's possible to provide chromosome names." 
+        )
+        some_x = (self.ex_cnr.chromosome == "chrX") & (self.ex_cnr.end <= 434918)
         some_x_len = some_x.sum()
         self.assertEqual(some_x_len, 3)
         auto_and_some_x = self.ex_cnr.autosomes(also=some_x)
-        self.assertEqual(len(auto_and_some_x), len(auto) + some_x_len, "It's possible to provide a Pandas filter.")
+        self.assertEqual(
+            len(auto_and_some_x),
+            len(auto) + some_x_len,
+            "It's possible to provide a Pandas filter.",
+        )

     def test_by_chromosome(self):
         for fname in ("formats/amplicon.cnr", "formats/cl_seq.cns"):
@@ -512,7 +519,9 @@ def test_intersect(self):
             self._compare_regions(result, self._from_intervals(expect))
         # Single-object intersect
         result = regions.intersection(selections, mode=mode)
-        expect = self._from_intervals(functools.reduce(operator.iadd, expectations[mode], []))
+        expect = self._from_intervals(
+            functools.reduce(operator.iadd, expectations[mode], [])
+        )
         self._compare_regions(result, expect)

     def test_subtract(self):
@@ -566,6 +575,129 @@ def test_subtract(self):
         iresult = target.subtract(access)
         self._compare_regions(iresult, invert)

+    def test_cut(self):
+        # Simple: one interval split by two boundary regions
+        regions = self._from_intervals([(1, 20, "A")])
+        cuts = self._from_intervals([(5, 10, "X"), (15, 25, "Y")])
+        # Breakpoints {5, 10, 15, 25}; 5, 10, 15 are inside (1, 20)
+        result = regions.cut(cuts)
+        expect = self._from_intervals(
+            [
+                (1, 5, "A"),
+                (5, 10, "A"),
+                (10, 15, "A"),
+                (15, 20, "A"),
+            ]
+        )
+        self._compare_regions(result, expect)
+
+        # Breakpoint at exact start/end: only interior breakpoint splits
+        regions2 = self._from_intervals([(10, 20, "X")])
+        cuts2 = self._from_intervals([(10, 15, ""), (20, 25, "")])
+        result2 = regions2.cut(cuts2)
+        expect2 = self._from_intervals([(10, 15, "X"), (15, 20, "X")])
+        self._compare_regions(result2, expect2)
+
+        # No breakpoints inside any interval: unchanged
+        regions3 = self._from_intervals([(10, 20, "X")])
+        cuts3 = self._from_intervals([(1, 5, ""), (25, 30, "")])
+        result3 = regions3.cut(cuts3)
+        self._compare_regions(result3, regions3)
+
+        # Empty other: unchanged
+        regions4 = self._from_intervals([(1, 20, "A")])
+        empty = self._from_intervals([])
+        result4 = regions4.cut(empty)
+        self._compare_regions(result4, regions4)
+
+        # Multiple intervals, some cut, some not
+        regions5 = self._from_intervals(
+            [
+                (1, 10, "A"),
+                (20, 30, "B"),
+                (40, 50, "C"),
+            ]
+        )
+        cuts5 = self._from_intervals([(5, 25, "")])
+        # Breakpoints {5, 25}: 5 inside A, 25 inside B, neither inside C
+        result5 = regions5.cut(cuts5)
+        expect5 = self._from_intervals(
+            [
+                (1, 5, "A"),
+                (5, 10, "A"),
+                (20, 25, "B"),
+                (25, 30, "B"),
+                (40, 50, "C"),
+            ]
+        )
+        self._compare_regions(result5, expect5)
+
+    def test_squash(self):
+        # All adjacent: combine into one row
+        regions = self._from_intervals(
+            [
+                (1, 5, "A"),
+                (5, 10, "B"),
+                (10, 15, "C"),
+            ]
+        )
+        result = regions.squash(combine=self.combiner)
+        expect = self._from_intervals([(1, 15, "ABC")])
+        self._compare_regions(result, expect)
+
+        # Gap prevents combining
+        regions2 = self._from_intervals(
+            [
+                (1, 5, "A"),
+                (5, 10, "B"),
+                (12, 15, "C"),
+                (15, 20, "D"),
+            ]
+        )
+        result2 = regions2.squash(combine=self.combiner)
+        expect2 = self._from_intervals(
+            [
+                (1, 10, "AB"),
+                (12, 20, "CD"),
+            ]
+        )
+        self._compare_regions(result2, expect2)
+
+        # Squash by gene: only combine same-gene consecutive runs
+        regions3 = self._from_intervals(
+            [
+                (1, 5, "X"),
+                (5, 10, "X"),
+                (10, 15, "Y"),
+                (15, 20, "Y"),
+                (20, 25, "X"),
+            ]
+        )
+        result3 = regions3.squash(by="gene", combine=self.combiner)
+        expect3 = self._from_intervals(
+            [
+                (1, 10, "X"),
+                (10, 20, "Y"),
+                (20, 25, "X"),
+            ]
+        )
+        self._compare_regions(result3, expect3)
+
+        # Single row: unchanged
+        regions4 = self._from_intervals([(1, 100, "X")])
+        result4 = regions4.squash(combine=self.combiner)
+        self._compare_regions(result4, regions4)
+
+        # Non-adjacent: unchanged
+        regions5 = self._from_intervals(
+            [
+                (1, 5, "A"),
+                (10, 15, "B"),
+            ]
+        )
+        result5 = regions5.squash(combine=self.combiner)
+        self._compare_regions(result5, regions5)
+

 class OtherTests(unittest.TestCase):
     """Other small modules & functions in this sub-package."""

From 32180ac23e08afe5d2b2470a66522e64dd22b3dd Mon Sep 17 00:00:00 2001
From: Eric Talevich <52723+etal@users.noreply.github.com>
Date: Sat, 7 Feb 2026 14:56:05 -0800
Subject: [PATCH 3/3] Suppress bioframe's Pandas4Warning in pytest

bioframe <=0.8.0 uses df.groupby(['col']).groups with a single-element
list, which pandas 3.0 deprecates. Our filterwarnings = ["error"] turns
this into a test failure on Python 3.12+ CI (pandas 3.0).

Add a message-based filter that works across all pandas versions.

Co-Authored-By: Claude Opus 4.6
---
 pyproject.toml | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index c7f9e9db..4c7842b2 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -63,7 +63,12 @@ optional-dependencies = {test = {file = "requirements/tests.txt"}}

 [tool.pytest.ini_options]
 testpaths = ["test"]
-filterwarnings = ["error"]
+filterwarnings = [
+    "error",
+    # bioframe <=0.8.0 uses df.groupby(['col']).groups with a single-element
+    # list, which pandas 3.0 deprecates via Pandas4Warning.
+    "ignore:In a future version, the keys of:Warning",
+]

 [tool.coverage.run]
 branch = true
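
Note on the bioframe primitives behind patch 1: cluster, subtract, and
overlap are the library calls backing merge()/flatten(), subtract(), and
intersection() respectively. A minimal sketch on toy data (the `a` and `b`
frames are invented for illustration; only the column triple matches what
skgenome passes via _BF_COLS):

    import bioframe
    import pandas as pd

    cols = ("chromosome", "start", "end")
    a = pd.DataFrame(
        {"chromosome": ["chr1", "chr1", "chr1"],
         "start": [1, 8, 25], "end": [10, 20, 30]}
    )
    b = pd.DataFrame({"chromosome": ["chr1"], "start": [5], "end": [12]})

    # merge()/flatten(): rows that overlap or abut share a "cluster" id
    clustered = bioframe.cluster(a, min_dist=0, cols=cols, return_cluster_ids=True)

    # subtract(): one-way difference -> chr1:1-5, chr1:12-20, chr1:25-30
    diff = bioframe.subtract(a, b, cols1=cols, cols2=cols)

    # intersection(): one row per overlapping (a, b) pair; return_index=True
    # adds "index"/"index_" columns pointing back at the source rows
    pairs = bioframe.overlap(
        a, b, how="inner", cols1=cols, cols2=cols,
        return_index=True, return_input=True,
    )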
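
Note on cut() in patch 2: the core rule is "keep only the breakpoints
strictly inside an interval, then pair up the bounds." Shown in isolation
with the numbers from the first test_cut case:

    import itertools
    import numpy as np

    start, end = 1, 20
    breakpoints = np.array([5, 10, 15, 25])  # starts/ends from the other set
    inner = breakpoints[(breakpoints > start) & (breakpoints < end)]  # [5 10 15]
    bounds = np.concatenate([[start], inner, [end]])  # [1 5 10 15 20]
    pieces = list(itertools.pairwise(bounds))
    # -> [(1, 5), (5, 10), (10, 15), (15, 20)], matching the expected rows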
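
Note on squash() in patch 2: the grouping is a cumulative sum over breaks in
adjacency. A minimal sketch with invented coordinates (one gap; the
chromosome and `by` comparisons are omitted here):

    import numpy as np

    starts = np.array([1, 5, 12])
    ends = np.array([5, 10, 15])
    # Row i+1 is adjacent when it starts exactly where row i ends
    is_adjacent = ends[:-1] == starts[1:]  # [True, False]
    # New group ID wherever adjacency breaks
    group_ids = np.concatenate([[0], np.cumsum(~is_adjacent)])  # [0, 0, 1]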
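
Note on the filter in patch 3: pytest filterwarnings entries use the
action:message:category syntax, where the message part is a regex matched
against the start of the warning text. A rough stdlib equivalent (assuming
the pandas deprecation message begins with that phrase):

    import warnings

    warnings.filterwarnings(
        "ignore",
        message="In a future version, the keys of",  # regex, anchored at start
        category=Warning,  # broad base class, whatever pandas names the warning
    )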