From b0874d148316b5163f002ad4267263c2e37c8a1d Mon Sep 17 00:00:00 2001
From: Eric Talevich <52723+etal@users.noreply.github.com>
Date: Sat, 7 Feb 2026 13:06:10 -0800
Subject: [PATCH 1/3] skgenome: Replace custom interval arithmetic with
 bioframe (closes #982)

Use bioframe.subtract, bioframe.cluster, and bioframe.overlap to replace
custom Python implementations of genomic interval merging, subtraction,
and intersection. This reduces maintenance surface (~155 lines net) and
improves performance for WGS cases with many bins.

- subtract.py: replace _subtraction() generator with bioframe.subtract()
- merge.py: replace _nonoverlapping_groups + _squash_tuples with
  bioframe.cluster() for both merge() and flatten()
- gary.py: replace intersection() non-trim path with bioframe.overlap()
- intersect.py: delete unimplemented venn() stub
- Add bioframe>=0.7.2 as a core dependency

Public GenomicArray API (method names and signatures) is unchanged.
---
 environment-dev.yml   |   1 +
 requirements/core.txt |   1 +
 requirements/min.txt  |   1 +
 skgenome/gary.py      |  31 ++++-
 skgenome/intersect.py |  12 ---
 skgenome/merge.py     | 246 ++++++++++++------------------------
 skgenome/subtract.py  |  73 ++-----------
 7 files changed, 105 insertions(+), 260 deletions(-)

diff --git a/environment-dev.yml b/environment-dev.yml
index 7893bdf6..567bc4fb 100644
--- a/environment-dev.yml
+++ b/environment-dev.yml
@@ -5,6 +5,7 @@ channels:
 dependencies:
   - python=3.11
   # Core dependencies (from requirements/core.txt)
+  - bioframe>=0.7.2
   - biopython>=1.85
   - matplotlib>=3.9.0
   - numpy>=2.2.3
diff --git a/requirements/core.txt b/requirements/core.txt
index 235994aa..b39e641f 100644
--- a/requirements/core.txt
+++ b/requirements/core.txt
@@ -1,3 +1,4 @@
+bioframe >= 0.7.2
 biopython >= 1.85
 matplotlib >= 3.9.0
 numpy >= 2.2.3
diff --git a/requirements/min.txt b/requirements/min.txt
index b0637683..cbe4f8a2 100644
--- a/requirements/min.txt
+++ b/requirements/min.txt
@@ -1,3 +1,4 @@
+bioframe == 0.7.2
 biopython == 1.85
 matplotlib == 3.9.0
 numpy == 2.2.3
diff --git a/skgenome/gary.py b/skgenome/gary.py
index 7a17d7aa..be503008 100644
--- a/skgenome/gary.py
+++ b/skgenome/gary.py
@@ -6,11 +6,14 @@
 from collections import OrderedDict
 from typing import Any, TypeVar, TYPE_CHECKING

+import bioframe
 import numpy as np
 import pandas as pd

 from .chromsort import sorter_chrom
 from .intersect import by_ranges, into_ranges, iter_ranges, iter_slices, Numeric
+
+_BF_COLS = ("chromosome", "start", "end")
 from .merge import flatten, merge
 from .rangelabel import to_label
 from .subtract import subtract
@@ -703,19 +706,33 @@ def intersection(self, other: GenomicArray, mode: str = "outer") -> GenomicArray

         The extra fields of `self`, but not `other`, are retained in the output.
""" - # TODO options for which extra fields to keep - # by default, keep just the fields in 'table' if mode == "trim": - # Slower chunks = [ chunk.data for _, chunk in self.by_ranges(other, mode=mode, keep_empty=False) ] return self.as_dataframe(pd.concat(chunks)) - # Faster - slices = iter_slices(self.data, other.data, mode, False) - indices = np.concatenate(list(slices)) - return self.as_dataframe(self.data.loc[indices]) + if not len(self) or not len(other): + return self.as_dataframe(self.data.iloc[:0]) + pairs = bioframe.overlap( + self.data, + other.data, + how="inner", + cols1=_BF_COLS, + cols2=_BF_COLS, + return_index=True, + return_input=True, + ) + if mode == "inner": + # Keep only self bins fully contained within an other bin + pairs = pairs[ + (pairs["start"] >= pairs["start_"]) & (pairs["end"] <= pairs["end_"]) + ] + # Sort by other-index then self-index to match by_ranges ordering + pairs = pairs.sort_values(["index_", "index"]) + # Return self rows (with duplicates, one per overlap pair) + self_indices = pairs["index"].to_numpy() + return self.as_dataframe(self.data.loc[self_indices]) def merge( self, diff --git a/skgenome/intersect.py b/skgenome/intersect.py index a3d4b714..4f105ac2 100644 --- a/skgenome/intersect.py +++ b/skgenome/intersect.py @@ -241,15 +241,3 @@ def _irange_nested( region_mask[int(end_idx) :] = 0 yield region_mask, start_val, end_val - - -def venn(table, other, mode: str): - # TODO -- implement 'venn' via fjoin algorithm - # 'cut' table at all 'other' boundaries - # -> extra column '_venn_':int (0, 1, 2) - # 0=self only, 1=both, 2=other only - # -> 'cut' just drops the '_venn_' column - # -> 'subtract' drops 1 and 2? - # (is that faster? probably not) - # -> 'jaccard' does math with it... - return table diff --git a/skgenome/merge.py b/skgenome/merge.py index 3f1c300d..33fda176 100644 --- a/skgenome/merge.py +++ b/skgenome/merge.py @@ -12,22 +12,23 @@ import itertools from typing import TYPE_CHECKING -import numpy as np +import bioframe import pandas as pd from .chromsort import sorter_chrom -from .combiners import get_combiners, first_of +from .combiners import get_combiners if TYPE_CHECKING: - from collections.abc import Callable - from collections.abc import Iterable + from collections.abc import Callable, Iterable + +_BF_COLS = ("chromosome", "start", "end") def flatten( - table, + table: pd.DataFrame, combine: dict[str, Callable] | None = None, split_columns: Iterable[str] | None = None, -): +) -> pd.DataFrame: """Combine overlapping regions into single rows, similar to bedtools merge.""" if table.empty: return table @@ -36,122 +37,53 @@ def flatten( # NB: Input rows and columns should already be sorted like this table = table.sort_values(["chromosome", "start", "end"]) cmb = get_combiners(table, False, combine) - out = ( - table.groupby(by="chromosome", as_index=False, group_keys=False, sort=False)[ - table.columns - ] - .apply(_flatten_overlapping, cmb, split_columns) - .reset_index(drop=True) + clustered = bioframe.cluster( + table, min_dist=0, cols=_BF_COLS, return_cluster_ids=True ) + all_rows: list = [] + for _cluster_id, group in clustered.groupby("cluster", sort=False): + group_rows = group.drop( + columns=["cluster", "cluster_start", "cluster_end"], errors="ignore" + ) + all_rows.extend(_flatten_tuples(group_rows, cmb)) + out = pd.DataFrame.from_records(all_rows, columns=table.columns) return out.reindex( out.chromosome.apply(sorter_chrom).sort_values(kind="mergesort").index ) -def _flatten_overlapping( - table, combine: dict[str, Callable], 
-):
-    """Merge overlapping regions within a chromosome/strand.
-
-    Assume chromosome and (if relevant) strand are already identical, so only
-    start and end coordinates are considered.
-    """
-    if split_columns:
-        row_groups = (
-            tuple(_flatten_tuples_split(row_group, combine, split_columns))
-            for row_group in _nonoverlapping_groups(table, 0)
-        )
-    else:
-        row_groups = (
-            tuple(_flatten_tuples(row_group, combine))
-            for row_group in _nonoverlapping_groups(table, 0)
-        )
-    all_row_groups = itertools.chain(*row_groups)
-    return pd.DataFrame.from_records(list(all_row_groups), columns=table.columns)
-
-
-def _flatten_tuples(keyed_rows: Iterable, combine: dict[str, Callable]):
+def _flatten_tuples(
+    group_df: pd.DataFrame, combine: dict[str, Callable]
+) -> list[tuple]:
     """Divide multiple rows where they overlap.

-    Parameters
-    ----------
-    keyed_rows : iterable
-        pairs of (non-overlapping-group index, overlapping rows)
-    combine : dict
-        Mapping of field names to functions applied to combine overlapping
-        regions.
-
-    Returns
-    -------
-    DataFrame
+    Split at all coordinate breakpoints and combine extra fields.
     """
-    rows = [kr[1] for kr in keyed_rows]
-    first_row = rows[0]
+    rows = list(group_df.itertuples(index=False))
     if len(rows) == 1:
-        yield first_row
-    else:
-        # TODO speed this up! Bottleneck is in dictcomp
-        extra_cols = [x for x in first_row._fields[3:] if x in combine]
-        breaks = sorted(set(itertools.chain(*[(r.start, r.end) for r in rows])))
-        for bp_start, bp_end in itertools.pairwise(breaks):
-            # Find the row(s) overlapping this region
-            # i.e. any not already seen and not already passed
-            rows_in_play = [
-                row for row in rows if row.start <= bp_start and row.end >= bp_end
-            ]
-            # Combine the extra fields of the overlapping regions
-            extra_fields = {
-                key: combine[key]([getattr(r, key) for r in rows_in_play])
-                for key in extra_cols
-            }
-            yield first_row._replace(start=bp_start, end=bp_end, **extra_fields)
-
-
-def _flatten_tuples_split(keyed_rows, combine: dict, split_columns: Iterable | None):
-    """Divide multiple rows where they overlap.
-
-    Parameters
-    ----------
-    keyed_rows : iterable
-        pairs of (non-overlapping-group index, overlapping rows)
-    combine : dict
-        Mapping of field names to functions applied to combine overlapping
-        regions.
-    split_columns : list or tuple
-        Field names where numeric values should be subdivided a region.
-
-    Returns
-    -------
-    DataFrame
-    """
-    rows = [kr[1] for kr in keyed_rows]
+        return rows
     first_row = rows[0]
-    if len(rows) == 1:
-        yield first_row
-    else:
-        # TODO - use split_columns
-        extra_cols = [x for x in first_row._fields[3:] if x in combine]
-        breaks = sorted(set(itertools.chain(*[(r.start, r.end) for r in rows])))
-        for bp_start, bp_end in itertools.pairwise(breaks):
-            # Find the row(s) overlapping this region
-            # i.e. any not already seen and not already passed
-            rows_in_play = [
-                row for row in rows if row.start <= bp_start and row.end >= bp_end
-            ]
-            # Combine the extra fields of the overlapping regions
-            extra_fields = {
-                key: combine[key]([getattr(r, key) for r in rows_in_play])
-                for key in extra_cols
-            }
-            yield first_row._replace(start=bp_start, end=bp_end, **extra_fields)
+    extra_cols = [x for x in first_row._fields[3:] if x in combine]
+    breaks = sorted(set(itertools.chain(*[(r.start, r.end) for r in rows])))
+    result = []
+    for bp_start, bp_end in itertools.pairwise(breaks):
+        rows_in_play = [
+            row for row in rows if row.start <= bp_start and row.end >= bp_end
+        ]
+        extra_fields = {
+            key: combine[key]([getattr(r, key) for r in rows_in_play])
+            for key in extra_cols
+        }
+        result.append(first_row._replace(start=bp_start, end=bp_end, **extra_fields))
+    return result


 def merge(
-    table,
+    table: pd.DataFrame,
     bp: int = 0,
     stranded: bool = False,
     combine: dict[str, Callable] | None = None,
-):
+) -> pd.DataFrame:
     """Merge overlapping rows in a DataFrame."""
     if table.empty:
         return table
@@ -161,84 +93,42 @@ def merge(
     if stranded:
         groupkey = ["chromosome", "strand"]
     else:
-        # NB: same gene name can appear on alt. contigs
        groupkey = ["chromosome"]
     table = table.sort_values([*groupkey, "start", "end"])
     cmb = get_combiners(table, stranded, combine)
-    out = (
-        table.groupby(by=groupkey, as_index=False, group_keys=False, sort=False)[
-            table.columns
-        ]
-        .apply(_merge_overlapping, bp, cmb)
-        .reset_index(drop=True)
+    min_dist = max(0, -bp)
+    on = ["strand"] if stranded else None
+    clustered = bioframe.cluster(
+        table,
+        min_dist=min_dist,
+        cols=_BF_COLS,
+        on=on,
+        return_cluster_ids=True,
+        return_cluster_intervals=True,
     )
+    data_cols = [
+        c
+        for c in clustered.columns
+        if c not in ("cluster", "cluster_start", "cluster_end")
+    ]
+    result_rows = []
+    for _cluster_id, group in clustered.groupby("cluster", sort=False):
+        if len(group) == 1:
+            result_rows.append(group[data_cols].iloc[0])
+        else:
+            vals = {}
+            for col in data_cols:
+                if col == "start":
+                    vals[col] = group["cluster_start"].iloc[0]
+                elif col == "end":
+                    vals[col] = group["cluster_end"].iloc[0]
+                elif col in cmb:
+                    vals[col] = cmb[col](group[col].values)
+                else:
+                    vals[col] = group[col].iloc[0]
+            result_rows.append(pd.Series(vals))
+    out = pd.DataFrame(result_rows, columns=data_cols).reset_index(drop=True)
     # Re-sort chromosomes cleverly instead of lexicographically
     return out.reindex(
         out.chromosome.apply(sorter_chrom).sort_values(kind="mergesort").index
     )
-
-
-def _merge_overlapping(table, bp: int, combine: dict[str, Callable]):
-    """Merge overlapping regions within a chromosome/strand.
-
-    Assume chromosome and (if relevant) strand are already identical, so only
-    start and end coordinates are considered.
-    """
-    merged_rows = [
-        _squash_tuples(row_group, combine)
-        for row_group in _nonoverlapping_groups(table, bp)
-    ]
-    return pd.DataFrame.from_records(merged_rows, columns=merged_rows[0]._fields)
-
-
-def _nonoverlapping_groups(table, bp: int):
-    """Identify and enumerate groups of overlapping rows.
-
-    That is, increment the group ID after each non-negative gap between
-    intervals. Intervals (rows) will be merged if any bases overlap by at least
-    `bp`.
-    """
-    # Examples:
-    #
-    #  gap?   F  T  F  F  T  T  T  F
-    #  group  0  1  1  2  3  3  3  3  4
-    #
-    #  gap?   T  F  T  T  F  F  F  T
-    #  group  0  0  1  1  1  2  3  4  4
-    gap_sizes = table.start.to_numpy()[1:] - table.end.cummax().to_numpy()[:-1]
-    group_keys = np.r_[False, gap_sizes > (-bp)].cumsum()
-    # NB: pandas groupby seems like the obvious choice over itertools, but it is
-    # very slow -- probably because it traverses the whole table (i.e.
-    # chromosome) again to select groups, redoing the various inferences that
-    # would be worthwhile outside this inner loop. With itertools, we take
-    # advantage of the grouping and sorting already done, and don't repeat
-    # pandas' traversal and inferences.
-    # ENH: Find & use a lower-level, 1-pass pandas function
-    keyed_groups = zip(group_keys, table.itertuples(index=False), strict=True)
-    return (row_group for _key, row_group in itertools.groupby(keyed_groups, first_of))
-
-
-# Squash rows according to a given grouping criterion
-# XXX see also segfilter.py
-def _squash_tuples(keyed_rows, combine: dict[str, Callable]):
-    """Combine multiple rows into one NamedTuple.
-
-    Parameters
-    ----------
-    keyed_rows : iterable
-        pairs of (non-overlapping-group index, overlapping rows)
-    combine : dict
-
-    Returns
-    -------
-    namedtuple
-    """
-    rows = [kr[1] for kr in keyed_rows]  # list(rows)
-    firsttup = rows[0]
-    if len(rows) == 1:
-        return firsttup
-    newfields = {
-        key: combiner(pd.Series([getattr(r, key) for r in rows]))
-        for key, combiner in combine.items()
-    }
-    return firsttup._replace(**newfields)
diff --git a/skgenome/subtract.py b/skgenome/subtract.py
index 7e621d89..2a855832 100644
--- a/skgenome/subtract.py
+++ b/skgenome/subtract.py
@@ -7,73 +7,20 @@
 """

-import logging
+from __future__ import annotations

-import numpy as np
-import pandas as pd
+from typing import TYPE_CHECKING

-from .intersect import by_ranges
+import bioframe

+if TYPE_CHECKING:
+    import pandas as pd

-def subtract(table, other) -> pd.DataFrame:
+_BF_COLS = ("chromosome", "start", "end")
+
+
+def subtract(table: pd.DataFrame, other: pd.DataFrame) -> pd.DataFrame:
     """Subtract one set of regions from another, returning the one-way difference."""
     if not len(other):
         return table
-    return pd.DataFrame.from_records(_subtraction(table, other), columns=table.columns)
-
-
-def _subtraction(table, other):
-    for keeper, rows_to_exclude in by_ranges(other, table, "outer", True):
-        if len(rows_to_exclude):
-            logging.debug(
-                "  %s:%d-%d : Subtracting %d excluded regions",
-                keeper.chromosome,
-                keeper.start,
-                keeper.end,
-                len(rows_to_exclude),
-            )
-
-            keep_left = keeper.start < rows_to_exclude.start.iat[0]
-            keep_right = keeper.end > rows_to_exclude.end.iat[-1]
-            if keep_left and keep_right:
-                # Keep both original edges of the source region
-                # =========
-                #   --  --
-                starts = np.r_[keeper.start, rows_to_exclude.end.to_numpy()]
-                ends = np.r_[rows_to_exclude.start.to_numpy(), keeper.end]
-            elif keep_left:
-                # Exclusion overlaps only the right side
-                # =======
-                #    --  ---
-                starts = np.r_[keeper.start, rows_to_exclude.end.to_numpy()[:-1]]
-                ends = rows_to_exclude.start.to_numpy()
-            elif keep_right:
-                # Exclusion overlaps only the left side
-                #   ========
-                # ---   --
-                starts = rows_to_exclude.end.to_numpy()
-                ends = np.r_[rows_to_exclude.start.to_numpy()[1:], keeper.end]
-            elif len(rows_to_exclude) > 1:
-                # Exclusions overlap both edges
-                #    ======
-                # --   --   ---
-                starts = rows_to_exclude.end.to_numpy()[:-1]
-                ends = rows_to_exclude.start.to_numpy()[1:]
-            else:
-                # Exclusion covers the whole region
-                continue
-
-            for start, end in zip(starts, ends, strict=True):
-                if end > start:
-                    yield keeper._replace(start=start, end=end)
-                else:
-                    logging.debug("Discarding pair: (%d, %d)", start, end)
-
-        else:
-            logging.debug(
-                "  %s:%d-%d : No excluded regions",
-                keeper.chromosome,
-                keeper.start,
-                keeper.end,
-            )
-            yield keeper
+    return bioframe.subtract(table, other, cols1=_BF_COLS, cols2=_BF_COLS)

From a97718bee201074927ee43da2da886feb9d79a1e Mon Sep 17 00:00:00 2001
From: Eric Talevich <52723+etal@users.noreply.github.com>
Date: Sat, 7 Feb 2026 14:47:23 -0800
Subject: [PATCH 2/3] skgenome.gary: Implement .cut() and .squash() (#226,
 #227)

- cut(other): splits intervals at the boundary coordinates of another
  array. Each piece inherits all data from its source row.
- squash(by, combine): combines consecutive adjacent rows, optionally
  only when they share the same value in a given column (e.g. gene).

Both follow the established pattern of DataFrame-level functions
(cut.py, merge.py) wrapped by GenomicArray methods in gary.py.
---
 skgenome/cut.py     |  72 +++++++++++++++++++++++
 skgenome/gary.py    |  30 +++++++---
 skgenome/merge.py   |  71 +++++++++++++++++++++-
 test/test_genome.py | 140 ++++++++++++++++++++++++++++++++++++++++++--
 4 files changed, 299 insertions(+), 14 deletions(-)
 create mode 100644 skgenome/cut.py

diff --git a/skgenome/cut.py b/skgenome/cut.py
new file mode 100644
index 00000000..b7c23aa0
--- /dev/null
+++ b/skgenome/cut.py
@@ -0,0 +1,72 @@
+"""DataFrame-level cutting operations.
+
+Split genomic regions at the boundary coordinates defined by another set
+of regions, in the spirit of ``bedops --partition``.
+
+The functions here operate on pandas DataFrame and Series instances, not
+GenomicArray types.
+
+"""
+
+from __future__ import annotations
+
+import itertools
+import numpy as np
+import pandas as pd
+
+
+def cut(table: pd.DataFrame, other: pd.DataFrame) -> pd.DataFrame:
+    """Split intervals in ``table`` at the boundary coordinates of ``other``.
+
+    Each interval in ``table`` is split wherever a start or end coordinate
+    from ``other`` falls strictly within it. Every resulting piece inherits
+    all data columns from its source row.
+
+    Parameters
+    ----------
+    table : DataFrame
+        Genomic intervals with at least chromosome, start, end columns.
+    other : DataFrame
+        Intervals whose start/end positions define the cut points.
+
+    Returns
+    -------
+    DataFrame
+        A copy of ``table`` with rows split at the breakpoints from ``other``.
+ """ + if table.empty or other.empty: + return table + + # Collect breakpoints per chromosome from other + other_bp: dict[str, np.ndarray] = {} + for chrom, grp in other.groupby("chromosome", sort=False): + other_bp[chrom] = np.unique( + np.concatenate([grp["start"].to_numpy(), grp["end"].to_numpy()]) + ) + + result_chunks: list[pd.DataFrame] = [] + for chrom, chrom_table in table.groupby("chromosome", sort=False): + if chrom not in other_bp: + result_chunks.append(chrom_table) + continue + + breakpoints = other_bp[chrom] + cut_rows: list[tuple] = [] + for row in chrom_table.itertuples(index=False): + # Breakpoints strictly inside (row.start, row.end) + inner = breakpoints[(breakpoints > row.start) & (breakpoints < row.end)] + if len(inner) == 0: + cut_rows.append(row) + else: + bounds = np.concatenate([[row.start], inner, [row.end]]) + for seg_start, seg_end in itertools.pairwise(bounds): + cut_rows.append( + row._replace(start=int(seg_start), end=int(seg_end)) + ) + result_chunks.append( + pd.DataFrame.from_records(cut_rows, columns=chrom_table.columns) + ) + + if not result_chunks: + return table.iloc[:0] + return pd.concat(result_chunks, ignore_index=True) diff --git a/skgenome/gary.py b/skgenome/gary.py index be503008..ac280c56 100644 --- a/skgenome/gary.py +++ b/skgenome/gary.py @@ -11,10 +11,11 @@ import pandas as pd from .chromsort import sorter_chrom +from .cut import cut from .intersect import by_ranges, into_ranges, iter_ranges, iter_slices, Numeric _BF_COLS = ("chromosome", "start", "end") -from .merge import flatten, merge +from .merge import flatten, merge, squash from .rangelabel import to_label from .subtract import subtract from .subdivide import subdivide @@ -686,10 +687,9 @@ def sort_columns(self) -> None: # Genome arithmetic - def cut(self, other, combine=None): - """Split this array's regions at the boundaries in `other`.""" - # TODO - return NotImplemented + def cut(self: _GA, other: GenomicArray) -> _GA: + """Split this array's regions at the boundaries in ``other``.""" + return self.as_dataframe(cut(self.data, other.data)) def flatten( self, @@ -782,10 +782,22 @@ def resize_ranges( # Don't modify the original return self.as_dataframe(table.copy()) - def squash(self, combine=None): - """Combine some groups of rows, by some criteria, into single rows.""" - # TODO - return NotImplemented + def squash( + self: _GA, + by: str | None = None, + combine: dict[str, Callable] | None = None, + ) -> _GA: + """Combine consecutive adjacent rows into single rows. + + Parameters + ---------- + by : str or None + If given, only combine consecutive rows with the same value + in this column (e.g. ``"gene"``). + combine : dict or None + Column-to-function mappings for aggregation. 
+ """ + return self.as_dataframe(squash(self.data, by=by, combine=combine)) def subdivide( self, avg_size: int, min_size: int = 0, verbose: bool = False diff --git a/skgenome/merge.py b/skgenome/merge.py index 33fda176..d4ec351f 100644 --- a/skgenome/merge.py +++ b/skgenome/merge.py @@ -13,10 +13,11 @@ from typing import TYPE_CHECKING import bioframe +import numpy as np import pandas as pd from .chromsort import sorter_chrom -from .combiners import get_combiners +from .combiners import first_of, get_combiners if TYPE_CHECKING: from collections.abc import Callable, Iterable @@ -132,3 +133,71 @@ def merge( return out.reindex( out.chromosome.apply(sorter_chrom).sort_values(kind="mergesort").index ) + + +def squash( + table: pd.DataFrame, + by: str | None = None, + combine: dict[str, Callable] | None = None, +) -> pd.DataFrame: + """Combine consecutive adjacent rows into single rows. + + Parameters + ---------- + table : DataFrame + Genomic intervals with at least chromosome, start, end columns. + Must be sorted by chromosome, start, end. + by : str or None + If given, only combine consecutive rows that have the same value + in this column (e.g. ``"gene"``). If None, combine all adjacent + rows on the same chromosome. + combine : dict or None + Column-to-function mappings for aggregation. See + :func:`get_combiners`. + + Returns + ------- + DataFrame + A new DataFrame with consecutive adjacent rows combined. + """ + if table.empty or len(table) == 1: + return table + + cmb = get_combiners(table, stranded=False, combine=combine) + if by is not None: + # Values are identical within each group, so just take the first + cmb[by] = first_of + + # Vectorised adjacency detection + chroms = table["chromosome"].to_numpy() + starts = table["start"].to_numpy() + ends = table["end"].to_numpy() + is_adjacent = (chroms[:-1] == chroms[1:]) & (ends[:-1] == starts[1:]) + if by is not None: + by_vals = table[by].to_numpy() + is_adjacent &= by_vals[:-1] == by_vals[1:] + + # Nothing to squash -- short-circuit + if not is_adjacent.any(): + return table + + # Assign group IDs: increment when rows are *not* adjacent + group_ids = np.concatenate([[0], np.cumsum(~is_adjacent)]) + + result_rows: list[pd.Series] = [] + for _gid, group in table.groupby(group_ids, sort=False): + if len(group) == 1: + result_rows.append(group.iloc[0]) + else: + vals: dict = {} + for col in table.columns: + if col == "start": + vals[col] = group["start"].iloc[0] + elif col == "end": + vals[col] = group["end"].iloc[-1] + elif col in cmb: + vals[col] = cmb[col](group[col].values) + else: + vals[col] = group[col].iloc[0] + result_rows.append(pd.Series(vals)) + return pd.DataFrame(result_rows, columns=table.columns).reset_index(drop=True) diff --git a/test/test_genome.py b/test/test_genome.py index 8ccf86e8..7ac2ae96 100755 --- a/test/test_genome.py +++ b/test/test_genome.py @@ -1,5 +1,6 @@ #!/usr/bin/env python """Unit tests for the 'genome' sub-package.""" + import functools import operator import random @@ -58,12 +59,18 @@ def test_autosomes(self): autoy = self.ex_cnr.autosomes(also=["chrY"]) self.assertEqual(len(autoy), len_all - len_x) autoxy = self.ex_cnr.autosomes(also=["chrX", "chrY"]) - self.assertEqual(len(autoxy), len_all, "It's possible to provide chromosome names.") - some_x = (self.ex_cnr.chromosome == 'chrX') & (self.ex_cnr.end <= 434918) + self.assertEqual( + len(autoxy), len_all, "It's possible to provide chromosome names." 
+        )
+        some_x = (self.ex_cnr.chromosome == "chrX") & (self.ex_cnr.end <= 434918)
         some_x_len = some_x.sum()
         self.assertEqual(some_x_len, 3)
         auto_and_some_x = self.ex_cnr.autosomes(also=some_x)
-        self.assertEqual(len(auto_and_some_x), len(auto) + some_x_len, "It's possible to provide a Pandas filter.")
+        self.assertEqual(
+            len(auto_and_some_x),
+            len(auto) + some_x_len,
+            "It's possible to provide a Pandas filter.",
+        )

     def test_by_chromosome(self):
         for fname in ("formats/amplicon.cnr", "formats/cl_seq.cns"):
@@ -512,7 +519,9 @@ def test_intersect(self):
             self._compare_regions(result, self._from_intervals(expect))
         # Single-object intersect
         result = regions.intersection(selections, mode=mode)
-        expect = self._from_intervals(functools.reduce(operator.iadd, expectations[mode], []))
+        expect = self._from_intervals(
+            functools.reduce(operator.iadd, expectations[mode], [])
+        )
         self._compare_regions(result, expect)

     def test_subtract(self):
@@ -566,6 +575,129 @@ def test_subtract(self):
         iresult = target.subtract(access)
         self._compare_regions(iresult, invert)

+    def test_cut(self):
+        # Simple: one interval split by two boundary regions
+        regions = self._from_intervals([(1, 20, "A")])
+        cuts = self._from_intervals([(5, 10, "X"), (15, 25, "Y")])
+        # Breakpoints {5, 10, 15, 25}; 5, 10, 15 are inside (1, 20)
+        result = regions.cut(cuts)
+        expect = self._from_intervals(
+            [
+                (1, 5, "A"),
+                (5, 10, "A"),
+                (10, 15, "A"),
+                (15, 20, "A"),
+            ]
+        )
+        self._compare_regions(result, expect)
+
+        # Breakpoint at exact start/end: only interior breakpoint splits
+        regions2 = self._from_intervals([(10, 20, "X")])
+        cuts2 = self._from_intervals([(10, 15, ""), (20, 25, "")])
+        result2 = regions2.cut(cuts2)
+        expect2 = self._from_intervals([(10, 15, "X"), (15, 20, "X")])
+        self._compare_regions(result2, expect2)
+
+        # No breakpoints inside any interval: unchanged
+        regions3 = self._from_intervals([(10, 20, "X")])
+        cuts3 = self._from_intervals([(1, 5, ""), (25, 30, "")])
+        result3 = regions3.cut(cuts3)
+        self._compare_regions(result3, regions3)
+
+        # Empty other: unchanged
+        regions4 = self._from_intervals([(1, 20, "A")])
+        empty = self._from_intervals([])
+        result4 = regions4.cut(empty)
+        self._compare_regions(result4, regions4)
+
+        # Multiple intervals, some cut, some not
+        regions5 = self._from_intervals(
+            [
+                (1, 10, "A"),
+                (20, 30, "B"),
+                (40, 50, "C"),
+            ]
+        )
+        cuts5 = self._from_intervals([(5, 25, "")])
+        # Breakpoints {5, 25}: 5 inside A, 25 inside B, neither inside C
+        result5 = regions5.cut(cuts5)
+        expect5 = self._from_intervals(
+            [
+                (1, 5, "A"),
+                (5, 10, "A"),
+                (20, 25, "B"),
+                (25, 30, "B"),
+                (40, 50, "C"),
+            ]
+        )
+        self._compare_regions(result5, expect5)
+
+    def test_squash(self):
+        # All adjacent: combine into one row
+        regions = self._from_intervals(
+            [
+                (1, 5, "A"),
+                (5, 10, "B"),
+                (10, 15, "C"),
+            ]
+        )
+        result = regions.squash(combine=self.combiner)
+        expect = self._from_intervals([(1, 15, "ABC")])
+        self._compare_regions(result, expect)
+
+        # Gap prevents combining
+        regions2 = self._from_intervals(
+            [
+                (1, 5, "A"),
+                (5, 10, "B"),
+                (12, 15, "C"),
+                (15, 20, "D"),
+            ]
+        )
+        result2 = regions2.squash(combine=self.combiner)
+        expect2 = self._from_intervals(
+            [
+                (1, 10, "AB"),
+                (12, 20, "CD"),
+            ]
+        )
+        self._compare_regions(result2, expect2)
+
+        # Squash by gene: only combine same-gene consecutive runs
+        regions3 = self._from_intervals(
+            [
+                (1, 5, "X"),
+                (5, 10, "X"),
+                (10, 15, "Y"),
+                (15, 20, "Y"),
+                (20, 25, "X"),
+            ]
+        )
+        result3 = regions3.squash(by="gene", combine=self.combiner)
+        expect3 = self._from_intervals(
+            [
+                (1, 10, "X"),
+                (10, 20, "Y"),
+                (20, 25, "X"),
+            ]
+        )
+        self._compare_regions(result3, expect3)
+
+        # Single row: unchanged
+        regions4 = self._from_intervals([(1, 100, "X")])
+        result4 = regions4.squash(combine=self.combiner)
+        self._compare_regions(result4, regions4)
+
+        # Non-adjacent: unchanged
+        regions5 = self._from_intervals(
+            [
+                (1, 5, "A"),
+                (10, 15, "B"),
+            ]
+        )
+        result5 = regions5.squash(combine=self.combiner)
+        self._compare_regions(result5, regions5)
+

 class OtherTests(unittest.TestCase):
     """Other small modules & functions in this sub-package."""

From 32180ac23e08afe5d2b2470a66522e64dd22b3dd Mon Sep 17 00:00:00 2001
From: Eric Talevich <52723+etal@users.noreply.github.com>
Date: Sat, 7 Feb 2026 14:56:05 -0800
Subject: [PATCH 3/3] Suppress bioframe's Pandas4Warning in pytest

bioframe <=0.8.0 uses df.groupby(['col']).groups with a single-element
list, which pandas 3.0 deprecates. Our filterwarnings = ["error"] turns
this into a test failure on Python 3.12+ CI (pandas 3.0).

Add a message-based filter that works across all pandas versions.

Co-Authored-By: Claude Opus 4.6
---
 pyproject.toml | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index c7f9e9db..4c7842b2 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -63,7 +63,12 @@ optional-dependencies = {test = {file = "requirements/tests.txt"}}

 [tool.pytest.ini_options]
 testpaths = ["test"]
-filterwarnings = ["error"]
+filterwarnings = [
+    "error",
+    # bioframe <=0.8.0 uses df.groupby(['col']).groups with a single-element
+    # list, which pandas 3.0 deprecates via Pandas4Warning.
+    "ignore:In a future version, the keys of:Warning",
+]

 [tool.coverage.run]
 branch = true
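
Note on the bioframe primitives behind patch 1: cluster, subtract, and
overlap are the library calls backing merge()/flatten(), subtract(), and
intersection() respectively. A minimal sketch on toy data (the `a` and `b`
frames are invented for illustration; only the column triple matches what
skgenome passes via _BF_COLS):

    import bioframe
    import pandas as pd

    cols = ("chromosome", "start", "end")
    a = pd.DataFrame(
        {"chromosome": ["chr1", "chr1", "chr1"],
         "start": [1, 8, 25], "end": [10, 20, 30]}
    )
    b = pd.DataFrame({"chromosome": ["chr1"], "start": [5], "end": [12]})

    # merge()/flatten(): rows that overlap or abut share a "cluster" id
    clustered = bioframe.cluster(a, min_dist=0, cols=cols, return_cluster_ids=True)

    # subtract(): one-way difference -> chr1:1-5, chr1:12-20, chr1:25-30
    diff = bioframe.subtract(a, b, cols1=cols, cols2=cols)

    # intersection(): one row per overlapping (a, b) pair; return_index=True
    # adds "index"/"index_" columns pointing back at the source rows
    pairs = bioframe.overlap(
        a, b, how="inner", cols1=cols, cols2=cols,
        return_index=True, return_input=True,
    )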
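
Note on cut() in patch 2: the core rule is "keep only the breakpoints
strictly inside an interval, then pair up the bounds." Shown in isolation
with the numbers from the first test_cut case:

    import itertools
    import numpy as np

    start, end = 1, 20
    breakpoints = np.array([5, 10, 15, 25])  # starts/ends from the other set
    inner = breakpoints[(breakpoints > start) & (breakpoints < end)]  # [5 10 15]
    bounds = np.concatenate([[start], inner, [end]])  # [1 5 10 15 20]
    pieces = list(itertools.pairwise(bounds))
    # -> [(1, 5), (5, 10), (10, 15), (15, 20)], matching the expected rows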
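
Note on squash() in patch 2: the grouping is a cumulative sum over breaks in
adjacency. A minimal sketch with invented coordinates (one gap; the
chromosome and `by` comparisons are omitted here):

    import numpy as np

    starts = np.array([1, 5, 12])
    ends = np.array([5, 10, 15])
    # Row i+1 is adjacent when it starts exactly where row i ends
    is_adjacent = ends[:-1] == starts[1:]  # [True, False]
    # New group ID wherever adjacency breaks
    group_ids = np.concatenate([[0], np.cumsum(~is_adjacent)])  # [0, 0, 1]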
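
Note on the filter in patch 3: pytest filterwarnings entries use the
action:message:category syntax, where the message part is a regex matched
against the start of the warning text. A rough stdlib equivalent (assuming
the pandas deprecation message begins with that phrase):

    import warnings

    warnings.filterwarnings(
        "ignore",
        message="In a future version, the keys of",  # regex, anchored at start
        category=Warning,  # broad base class, whatever pandas names the warning
    )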