1 change: 1 addition & 0 deletions environment-dev.yml
@@ -5,6 +5,7 @@ channels:
dependencies:
- python=3.11
# Core dependencies (from requirements/core.txt)
- bioframe>=0.7.2
- biopython>=1.85
- matplotlib>=3.9.0
- numpy>=2.2.3
7 changes: 6 additions & 1 deletion pyproject.toml
@@ -63,7 +63,12 @@ optional-dependencies = {test = {file = "requirements/tests.txt"}}

[tool.pytest.ini_options]
testpaths = ["test"]
filterwarnings = ["error"]
filterwarnings = [
"error",
# bioframe <=0.8.0 uses df.groupby(['col']).groups with a single-element
# list, which pandas 3.0 deprecates via Pandas4Warning.
"ignore:In a future version, the keys of:Warning",
]

[tool.coverage.run]
branch = true
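For context on the new `filterwarnings` entry above: the pattern being suppressed is bioframe's internal use of `groupby` with a single-element list, which newer pandas flags because the keys of `.groups` will change in a future version. A minimal sketch of that pattern, with a toy frame that is not part of the PR (the exact warning class and wording depend on the installed pandas version):

import pandas as pd

df = pd.DataFrame({"chromosome": ["chr1", "chr1", "chr2"], "start": [0, 10, 0]})
# Grouping by a one-element list is the idiom bioframe <=0.8.0 uses internally;
# recent pandas warns that the keys of .groups will change in a future version.
groups = df.groupby(["chromosome"]).groups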
1 change: 1 addition & 0 deletions requirements/core.txt
@@ -1,3 +1,4 @@
bioframe >= 0.7.2
biopython >= 1.85
matplotlib >= 3.9.0
numpy >= 2.2.3
1 change: 1 addition & 0 deletions requirements/min.txt
@@ -1,3 +1,4 @@
bioframe == 0.7.2
biopython == 1.85
matplotlib == 3.9.0
numpy == 2.2.3
72 changes: 72 additions & 0 deletions skgenome/cut.py
@@ -0,0 +1,72 @@
"""DataFrame-level cutting operations.

Split genomic regions at boundaries defined by another set of regions,
so that no output interval straddles a start or end coordinate from the
other set.

The functions here operate on plain pandas DataFrame instances, not
GenomicArray types.

"""

from __future__ import annotations

import itertools

import numpy as np
import pandas as pd


def cut(table: pd.DataFrame, other: pd.DataFrame) -> pd.DataFrame:
"""Split intervals in ``table`` at the boundary coordinates of ``other``.

Each interval in ``table`` is split wherever a start or end coordinate
from ``other`` falls strictly within it. Every resulting piece inherits
all data columns from its source row.

Parameters
----------
table : DataFrame
Genomic intervals with at least chromosome, start, end columns.
other : DataFrame
Intervals whose start/end positions define the cut points.

Returns
-------
DataFrame
A copy of ``table`` with rows split at the breakpoints from ``other``.
"""
if table.empty or other.empty:
return table.copy()

# Collect breakpoints per chromosome from other
other_bp: dict[str, np.ndarray] = {}
for chrom, grp in other.groupby("chromosome", sort=False):
other_bp[chrom] = np.unique(
np.concatenate([grp["start"].to_numpy(), grp["end"].to_numpy()])
)

result_chunks: list[pd.DataFrame] = []
for chrom, chrom_table in table.groupby("chromosome", sort=False):
if chrom not in other_bp:
result_chunks.append(chrom_table)
continue

breakpoints = other_bp[chrom]
cut_rows: list[tuple] = []
for row in chrom_table.itertuples(index=False):
# Breakpoints strictly inside (row.start, row.end)
inner = breakpoints[(breakpoints > row.start) & (breakpoints < row.end)]
if len(inner) == 0:
cut_rows.append(row)
else:
bounds = np.concatenate([[row.start], inner, [row.end]])
for seg_start, seg_end in itertools.pairwise(bounds):
cut_rows.append(
row._replace(start=int(seg_start), end=int(seg_end))
)
result_chunks.append(
pd.DataFrame.from_records(cut_rows, columns=chrom_table.columns)
)

if not result_chunks:
return table.iloc[:0]
return pd.concat(result_chunks, ignore_index=True)
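A quick usage sketch for the new `cut` helper, with toy frames that are not part of the PR, tracing the code above: the single 0-100 bin is split at the 40 and 60 boundaries contributed by `other`, and each piece inherits the `gene` value.

import pandas as pd
from skgenome.cut import cut

table = pd.DataFrame(
    {"chromosome": ["chr1"], "start": [0], "end": [100], "gene": ["A"]}
)
other = pd.DataFrame({"chromosome": ["chr1"], "start": [40], "end": [60]})

# Expected pieces: chr1 0-40, 40-60, 60-100, all carrying gene "A".
print(cut(table, other))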
61 changes: 45 additions & 16 deletions skgenome/gary.py
@@ -6,12 +6,16 @@
from collections import OrderedDict
from typing import Any, Callable, TypeVar, TYPE_CHECKING

import bioframe
import numpy as np
import pandas as pd

from .chromsort import sorter_chrom
from .cut import cut
from .intersect import by_ranges, into_ranges, iter_ranges, iter_slices, Numeric
from .merge import flatten, merge

_BF_COLS = ("chromosome", "start", "end")
from .merge import flatten, merge, squash
from .rangelabel import to_label
from .subtract import subtract
from .subdivide import subdivide
@@ -683,10 +687,9 @@ def sort_columns(self) -> None:

# Genome arithmetic

def cut(self, other, combine=None):
"""Split this array's regions at the boundaries in `other`."""
# TODO
return NotImplemented
def cut(self: _GA, other: GenomicArray) -> _GA:
"""Split this array's regions at the boundaries in ``other``."""
return self.as_dataframe(cut(self.data, other.data))

def flatten(
self,
@@ -703,19 +706,33 @@ def intersection(self, other: GenomicArray, mode: str = "outer") -> GenomicArray

The extra fields of `self`, but not `other`, are retained in the output.
"""
# TODO options for which extra fields to keep
# by default, keep just the fields in 'table'
if mode == "trim":
# Slower
chunks = [
chunk.data
for _, chunk in self.by_ranges(other, mode=mode, keep_empty=False)
]
return self.as_dataframe(pd.concat(chunks))
# Faster
slices = iter_slices(self.data, other.data, mode, False)
indices = np.concatenate(list(slices))
return self.as_dataframe(self.data.loc[indices])
if not len(self) or not len(other):
return self.as_dataframe(self.data.iloc[:0])
pairs = bioframe.overlap(
self.data,
other.data,
how="inner",
cols1=_BF_COLS,
cols2=_BF_COLS,
return_index=True,
return_input=True,
)
if mode == "inner":
# Keep only self bins fully contained within an other bin
pairs = pairs[
(pairs["start"] >= pairs["start_"]) & (pairs["end"] <= pairs["end_"])
]
# Sort by other-index then self-index to match by_ranges ordering
pairs = pairs.sort_values(["index_", "index"])
# Return self rows (with duplicates, one per overlap pair)
self_indices = pairs["index"].to_numpy()
return self.as_dataframe(self.data.loc[self_indices])

def merge(
self,
@@ -765,10 +782,22 @@ def resize_ranges(
# Don't modify the original
return self.as_dataframe(table.copy())

def squash(self, combine=None):
"""Combine some groups of rows, by some criteria, into single rows."""
# TODO
return NotImplemented
def squash(
self: _GA,
by: str | None = None,
combine: dict[str, Callable] | None = None,
) -> _GA:
"""Combine consecutive adjacent rows into single rows.

Parameters
----------
by : str or None
If given, only combine consecutive rows with the same value
in this column (e.g. ``"gene"``).
combine : dict or None
Column-to-function mappings for aggregation.
"""
return self.as_dataframe(squash(self.data, by=by, combine=combine))

def subdivide(
self, avg_size: int, min_size: int = 0, verbose: bool = False
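A standalone sketch of the `bioframe.overlap` call shape that the rewritten `intersection` relies on, using toy frames that are not part of the PR. With `return_index=True` and `return_input=True` the result carries `index`/`index_` for the two inputs' row indices and suffixes the second frame's coordinates as `start_`/`end_`, which is what the `mode == "inner"` containment filter compares against.

import bioframe
import pandas as pd

cols = ("chromosome", "start", "end")
bins = pd.DataFrame(
    {"chromosome": ["chr1", "chr1"], "start": [0, 50], "end": [50, 100], "gene": ["A", "B"]}
)
regions = pd.DataFrame({"chromosome": ["chr1"], "start": [25], "end": [75]})

pairs = bioframe.overlap(
    bins, regions, how="inner", cols1=cols, cols2=cols,
    return_index=True, return_input=True,
)
# Keep only bins fully contained within a region, mirroring mode="inner".
contained = pairs[(pairs["start"] >= pairs["start_"]) & (pairs["end"] <= pairs["end_"])]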
12 changes: 0 additions & 12 deletions skgenome/intersect.py
@@ -241,15 +241,3 @@ def _irange_nested(
region_mask[int(end_idx) :] = 0

yield region_mask, start_val, end_val


def venn(table, other, mode: str):
# TODO -- implement 'venn' via fjoin algorithm
# 'cut' table at all 'other' boundaries
# -> extra column '_venn_':int (0, 1, 2)
# 0=self only, 1=both, 2=other only
# -> 'cut' just drops the '_venn_' column
# -> 'subtract' drops 1 and 2?
# (is that faster? probably not)
# -> 'jaccard' does math with it...
return table