diff --git a/build-hooks/_version.py b/build-hooks/_version.py index 642364b..e972497 100644 --- a/build-hooks/_version.py +++ b/build-hooks/_version.py @@ -3,16 +3,14 @@ import enum import functools import re -from typing import ( - TYPE_CHECKING, - Callable, - Generic, - MutableSequence, - Optional, - Type, - TypeVar, - overload, -) +from typing import TYPE_CHECKING +from typing import Callable +from typing import Generic +from typing import MutableSequence +from typing import Optional +from typing import Type +from typing import TypeVar +from typing import overload try: from typing import Self diff --git a/demo.ipynb b/demo.ipynb index 5a6705e..71729f3 100644 --- a/demo.ipynb +++ b/demo.ipynb @@ -196,8 +196,16 @@ " SELECT * FROM read_csv('{file_path}', delim='\\t', header=false, {bed_columns})\n", " \"\"\")\n", "\n", - "print(\"Features A:\", conn.execute(\"SELECT COUNT(*) FROM features_a\").fetchone()[0], \"intervals\")\n", - "print(\"Features B:\", conn.execute(\"SELECT COUNT(*) FROM features_b\").fetchone()[0], \"intervals\")\n", + "print(\n", + " \"Features A:\",\n", + " conn.execute(\"SELECT COUNT(*) FROM features_a\").fetchone()[0],\n", + " \"intervals\",\n", + ")\n", + "print(\n", + " \"Features B:\",\n", + " conn.execute(\"SELECT COUNT(*) FROM features_b\").fetchone()[0],\n", + " \"intervals\",\n", + ")\n", "conn.execute(\"SELECT * FROM features_a LIMIT 5\").fetchdf()" ] }, diff --git a/docs/conf.py b/docs/conf.py index 5e28805..907e9ba 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -79,4 +79,4 @@ # }, "repository_url": "https://github.com/abdenlab/giql", "use_repository_button": True, -} \ No newline at end of file +} diff --git a/src/giql/expressions.py b/src/giql/expressions.py index 940dbc0..857a223 100644 --- a/src/giql/expressions.py +++ b/src/giql/expressions.py @@ -79,9 +79,7 @@ def _split_named_and_positional(args): positional_args = [] for arg in args: if isinstance(arg, (exp.PropertyEQ, exp.Kwarg)): - param_name = ( - arg.this.name if hasattr(arg.this, "name") else str(arg.this) - ) + param_name = arg.this.name if hasattr(arg.this, "name") else str(arg.this) kwargs[param_name.lower()] = arg.expression else: positional_args.append(arg) diff --git a/src/giql/mcp/server.py b/src/giql/mcp/server.py index 0a06164..8f74ae6 100644 --- a/src/giql/mcp/server.py +++ b/src/giql/mcp/server.py @@ -74,12 +74,21 @@ "description": "Find the k-nearest genomic features to a reference position", "syntax": "CROSS JOIN LATERAL NEAREST(table, reference=interval, k=5)", "parameters": [ - {"name": "target_table", "description": "Table to search for nearest features"}, + { + "name": "target_table", + "description": "Table to search for nearest features", + }, {"name": "reference", "description": "Reference position or column"}, {"name": "k", "description": "Number of nearest neighbors (default: 1)"}, - {"name": "max_distance", "description": "Maximum distance threshold (optional)"}, + { + "name": "max_distance", + "description": "Maximum distance threshold (optional)", + }, {"name": "stranded", "description": "Same-strand only (default: false)"}, - {"name": "signed", "description": "Return signed distances (default: false)"}, + { + "name": "signed", + "description": "Return signed distances (default: false)", + }, ], "returns": "Rows from target table with distance column", "example": "SELECT * FROM peaks CROSS JOIN LATERAL NEAREST(genes, reference=peaks.interval, k=3) AS nearest", @@ -91,7 +100,10 @@ "syntax": "CLUSTER(interval) AS cluster_id", "parameters": [ {"name": "interval", "description": "Genomic column to cluster"}, - {"name": "distance", "description": "Max gap to consider same cluster (default: 0)"}, + { + "name": "distance", + "description": "Max gap to consider same cluster (default: 0)", + }, {"name": "stranded", "description": "Cluster by strand (default: false)"}, ], "returns": "Integer cluster ID", @@ -333,7 +345,10 @@ def explain_operator(name: str) -> dict[str, Any]: name_upper = name.upper().strip() if name_upper not in OPERATORS: - return {"error": f"Unknown operator: {name}", "available": list(OPERATORS.keys())} + return { + "error": f"Unknown operator: {name}", + "available": list(OPERATORS.keys()), + } op = OPERATORS[name_upper] @@ -346,7 +361,9 @@ def explain_operator(name: str) -> dict[str, Any]: pattern = rf"^{name_upper}\n[~=\-]+\n(.*?)(?=\n[A-Z]+\n[~=\-]+|\Z)" match = re.search(pattern, content, re.MULTILINE | re.DOTALL) if match: - full_docs = f"{name_upper}\n{'=' * len(name_upper)}\n{match.group(1).strip()}" + full_docs = ( + f"{name_upper}\n{'=' * len(name_upper)}\n{match.group(1).strip()}" + ) return { "name": name_upper, diff --git a/src/giql/table.py b/src/giql/table.py index f23adaf..563343c 100644 --- a/src/giql/table.py +++ b/src/giql/table.py @@ -66,7 +66,7 @@ class Table: coordinate_system="1based", interval_type="closed", ), - ] + ], ) """ diff --git a/src/giql/transpile.py b/src/giql/transpile.py index f846834..2b29c3d 100644 --- a/src/giql/transpile.py +++ b/src/giql/transpile.py @@ -77,7 +77,7 @@ def transpile( sql = transpile( "SELECT * FROM peaks WHERE interval INTERSECTS 'chr1:1000-2000'", - tables=["peaks"] + tables=["peaks"], ) Custom table configuration:: @@ -92,7 +92,7 @@ def transpile( start_col="start", end_col="end", ) - ] + ], ) """ # Build tables container diff --git a/tests/generators/test_base.py b/tests/generators/test_base.py index 45d42bd..42e04f0 100644 --- a/tests/generators/test_base.py +++ b/tests/generators/test_base.py @@ -875,7 +875,9 @@ def test_giqlnearest_sql_closed_intervals(self): tables = Tables() tables.register("genes_closed", Table("genes_closed", interval_type="closed")) - sql = "SELECT * FROM NEAREST(genes_closed, reference := 'chr1:1000-2000', k := 3)" + sql = ( + "SELECT * FROM NEAREST(genes_closed, reference := 'chr1:1000-2000', k := 3)" + ) ast = parse_one(sql, dialect=GIQLDialect) generator = BaseGIQLGenerator(tables=tables) @@ -988,7 +990,9 @@ def test_giqlnearest_sql_target_not_in_tables(self, tables_with_peaks_and_genes) WHEN giqlnearest_sql is called THEN ValueError is raised listing available tables. """ - sql = "SELECT * FROM NEAREST(unknown_table, reference := 'chr1:1000-2000', k := 3)" + sql = ( + "SELECT * FROM NEAREST(unknown_table, reference := 'chr1:1000-2000', k := 3)" + ) ast = parse_one(sql, dialect=GIQLDialect) generator = BaseGIQLGenerator(tables=tables_with_peaks_and_genes) diff --git a/tests/integration/__init__.py b/tests/integration/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/integration/bedtools/__init__.py b/tests/integration/bedtools/__init__.py new file mode 100644 index 0000000..a152056 --- /dev/null +++ b/tests/integration/bedtools/__init__.py @@ -0,0 +1 @@ +"""Bedtools integration tests for GIQL operator correctness.""" diff --git a/tests/integration/bedtools/conftest.py b/tests/integration/bedtools/conftest.py new file mode 100644 index 0000000..79994a1 --- /dev/null +++ b/tests/integration/bedtools/conftest.py @@ -0,0 +1,57 @@ +"""Pytest fixtures for bedtools integration tests.""" + +import shutil + +import pytest + +from giql import transpile + +duckdb = pytest.importorskip("duckdb") +pytest.importorskip("pybedtools") + +if not shutil.which("bedtools"): + pytest.skip( + "bedtools binary not found in PATH", + allow_module_level=True, + ) + +pytestmark = pytest.mark.integration + +from .utils.duckdb_loader import load_intervals # noqa: E402 + + +@pytest.fixture(scope="function") +def duckdb_connection(): + """Provide clean DuckDB connection for each test. + + Each test gets a fresh in-memory database with no shared state. + """ + conn = duckdb.connect(":memory:") + yield conn + conn.close() + + +@pytest.fixture(scope="function") +def giql_query(duckdb_connection): + """Provide a helper that loads data, transpiles GIQL, and executes. + + Usage:: + + result = giql_query( + "SELECT * FROM t WHERE interval INTERSECTS 'chr1:1-100'", + tables=["t"], + t=[GenomicInterval("chr1", 50, 150, "x", 0, "+")], + ) + """ + + def _run(query: str, *, tables: list[str], **table_data): + for name, intervals in table_data.items(): + load_intervals( + duckdb_connection, + name, + [i.to_tuple() for i in intervals], + ) + sql = transpile(query, tables=tables) + return duckdb_connection.execute(sql).fetchall() + + return _run diff --git a/tests/integration/bedtools/test_cluster.py b/tests/integration/bedtools/test_cluster.py new file mode 100644 index 0000000..c492f0d --- /dev/null +++ b/tests/integration/bedtools/test_cluster.py @@ -0,0 +1,206 @@ +"""Integration tests for GIQL CLUSTER operator. + +These tests validate that GIQL's CLUSTER operator correctly assigns +cluster IDs to overlapping intervals. Since bedtools has no direct +CLUSTER equivalent, we cross-validate against bedtools merge: the +number of distinct clusters should equal the number of merged intervals. +""" + +from giql import transpile + +from .utils.bedtools_wrapper import merge +from .utils.data_models import GenomicInterval +from .utils.duckdb_loader import load_intervals + + +def test_cluster_basic(duckdb_connection): + """ + GIVEN a set of intervals with two overlapping groups + WHEN CLUSTER operator is applied via GIQL + THEN overlapping intervals share cluster IDs and distinct + cluster count matches bedtools merge + """ + intervals = [ + GenomicInterval("chr1", 100, 200, "i1", 100, "+"), + GenomicInterval("chr1", 150, 250, "i2", 150, "+"), + GenomicInterval("chr1", 400, 500, "i3", 200, "+"), + ] + + load_intervals( + duckdb_connection, + "intervals", + [i.to_tuple() for i in intervals], + ) + + bedtools_merged = merge([i.to_tuple() for i in intervals]) + + sql = transpile( + """ + SELECT *, CLUSTER(interval) AS cluster_id + FROM intervals + """, + tables=["intervals"], + ) + giql_result = duckdb_connection.execute(sql).fetchall() + + assert len(giql_result) == len(intervals) + + cluster_ids = {row[-1] for row in giql_result} + assert len(cluster_ids) == len(bedtools_merged), ( + f"Expected {len(bedtools_merged)} clusters (matching merge " + f"count), got {len(cluster_ids)}" + ) + + # i1 and i2 overlap, so they should share a cluster_id + i1_cluster = giql_result[0][-1] + i2_cluster = giql_result[1][-1] + i3_cluster = giql_result[2][-1] + assert i1_cluster == i2_cluster, ( + "Overlapping intervals i1 and i2 should share cluster_id" + ) + assert i3_cluster != i1_cluster, ( + "Separated interval i3 should have a different cluster_id" + ) + + +def test_cluster_separated(duckdb_connection): + """ + GIVEN non-overlapping intervals with gaps + WHEN CLUSTER operator is applied + THEN each interval gets its own cluster_id, matching bedtools merge count + """ + intervals = [ + GenomicInterval("chr1", 100, 200, "i1", 100, "+"), + GenomicInterval("chr1", 300, 400, "i2", 150, "+"), + GenomicInterval("chr1", 500, 600, "i3", 200, "+"), + ] + + load_intervals( + duckdb_connection, + "intervals", + [i.to_tuple() for i in intervals], + ) + + bedtools_merged = merge([i.to_tuple() for i in intervals]) + + sql = transpile( + """ + SELECT *, CLUSTER(interval) AS cluster_id + FROM intervals + """, + tables=["intervals"], + ) + giql_result = duckdb_connection.execute(sql).fetchall() + + cluster_ids = {row[-1] for row in giql_result} + assert len(cluster_ids) == len(bedtools_merged) == 3 + + +def test_cluster_multiple_chromosomes(duckdb_connection): + """ + GIVEN overlapping intervals on different chromosomes + WHEN CLUSTER operator is applied + THEN clustering occurs per chromosome independently, matching bedtools merge count + """ + intervals = [ + GenomicInterval("chr1", 100, 200, "i1", 100, "+"), + GenomicInterval("chr1", 150, 250, "i2", 150, "+"), + GenomicInterval("chr2", 100, 200, "i3", 100, "+"), + GenomicInterval("chr2", 150, 250, "i4", 150, "+"), + ] + + load_intervals( + duckdb_connection, + "intervals", + [i.to_tuple() for i in intervals], + ) + + bedtools_merged = merge([i.to_tuple() for i in intervals]) + + sql = transpile( + """ + SELECT *, CLUSTER(interval) AS cluster_id + FROM intervals + """, + tables=["intervals"], + ) + giql_result = duckdb_connection.execute(sql).fetchall() + + # Cluster IDs are per-partition (per-chrom), so we count + # distinct (chrom, cluster_id) pairs + chrom_clusters = {(row[0], row[-1]) for row in giql_result} + assert len(chrom_clusters) == len(bedtools_merged) == 2 + + +def test_cluster_stranded(duckdb_connection): + """ + GIVEN overlapping intervals on different strands + WHEN CLUSTER with stranded := true is applied + THEN clustering occurs per strand independently, matching + bedtools strand-aware merge count + """ + intervals = [ + GenomicInterval("chr1", 100, 200, "i1", 100, "+"), + GenomicInterval("chr1", 150, 250, "i2", 150, "+"), + GenomicInterval("chr1", 120, 220, "i3", 200, "-"), + GenomicInterval("chr1", 180, 280, "i4", 100, "-"), + ] + + load_intervals( + duckdb_connection, + "intervals", + [i.to_tuple() for i in intervals], + ) + + bedtools_strand_merged = merge( + [i.to_tuple() for i in intervals], + strand_mode="same", + ) + + sql = transpile( + """ + SELECT *, CLUSTER(interval, stranded := true) AS cluster_id + FROM intervals + """, + tables=["intervals"], + ) + giql_result = duckdb_connection.execute(sql).fetchall() + + # Cluster IDs are per-partition (per-chrom-strand), so count + # distinct (chrom, strand, cluster_id) pairs + strand_clusters = {(row[0], row[5], row[-1]) for row in giql_result} + assert len(strand_clusters) == len(bedtools_strand_merged) + + +def test_cluster_with_distance(giql_query): + """ + GIVEN intervals with gaps of 50bp and 150bp + WHEN CLUSTER with distance=100 is applied + THEN gaps <= 100bp are in the same cluster and gaps > 100bp are separate + """ + result = giql_query( + """ + SELECT *, CLUSTER(interval, 100) AS cluster_id + FROM intervals + """, + tables=["intervals"], + intervals=[ + GenomicInterval("chr1", 100, 200, "i1", 100, "+"), + GenomicInterval("chr1", 250, 350, "i2", 150, "+"), # 50bp gap + GenomicInterval("chr1", 500, 600, "i3", 200, "+"), # 150bp gap + ], + ) + + assert len(result) == 3 + + # i1 and i2 should share a cluster (50bp gap <= 100) + i1_cluster = result[0][-1] + i2_cluster = result[1][-1] + i3_cluster = result[2][-1] + + assert i1_cluster == i2_cluster, ( + "i1 and i2 (50bp gap) should be in the same cluster with distance=100" + ) + assert i3_cluster != i1_cluster, ( + "i3 (150bp gap) should be in a different cluster with distance=100" + ) diff --git a/tests/integration/bedtools/test_contains.py b/tests/integration/bedtools/test_contains.py new file mode 100644 index 0000000..6325e43 --- /dev/null +++ b/tests/integration/bedtools/test_contains.py @@ -0,0 +1,128 @@ +"""Integration tests for GIQL CONTAINS operator. + +These tests validate that GIQL's CONTAINS predicate correctly identifies +intervals that fully contain a point or range. No direct bedtools +equivalent exists, so tests validate against known expected results. +""" + +from .utils.data_models import GenomicInterval + + +def test_contains_point(giql_query): + """ + GIVEN a table with intervals of varying sizes on chr1 + WHEN CONTAINS is used with a point literal + THEN only intervals that contain the point are returned + """ + result = giql_query( + "SELECT * FROM intervals WHERE interval CONTAINS 'chr1:150'", + tables=["intervals"], + intervals=[ + GenomicInterval("chr1", 100, 300, "wide", 100, "+"), + GenomicInterval("chr1", 140, 160, "narrow", 100, "+"), + GenomicInterval("chr1", 200, 400, "partial", 100, "+"), + GenomicInterval("chr1", 500, 600, "far", 100, "+"), + ], + ) + + names = {row[3] for row in result} + # wide [100,300) contains 150, narrow [140,160) contains 150 + # partial [200,400) does not contain 150, far [500,600) does not + assert names == {"wide", "narrow"}, f"Expected wide+narrow, got {names}" + + +def test_contains_range(giql_query): + """ + GIVEN a table with intervals of varying sizes + WHEN CONTAINS is used with a range literal + THEN only intervals that fully contain the range are returned + """ + result = giql_query( + "SELECT * FROM intervals WHERE interval CONTAINS 'chr1:150-250'", + tables=["intervals"], + intervals=[ + GenomicInterval("chr1", 100, 400, "large", 100, "+"), + GenomicInterval("chr1", 150, 250, "medium", 100, "+"), + GenomicInterval("chr1", 180, 220, "small", 100, "+"), + GenomicInterval("chr1", 500, 600, "far", 100, "+"), + ], + ) + + names = {row[3] for row in result} + # large [100,400) contains [150,250), medium [150,250) contains [150,250) + # small [180,220) does not fully contain [150,250) + assert names == {"large", "medium"}, f"Expected large+medium, got {names}" + + +def test_contains_column_to_column(giql_query): + """ + GIVEN two tables where some intervals in A fully contain intervals in B + WHEN a.interval CONTAINS b.interval is used in WHERE clause + THEN only pairs where A fully contains B are returned + """ + result = giql_query( + """ + SELECT a.name, b.name + FROM intervals_a a, intervals_b b + WHERE a.interval CONTAINS b.interval + """, + tables=["intervals_a", "intervals_b"], + intervals_a=[ + GenomicInterval("chr1", 100, 400, "a_large", 100, "+"), + GenomicInterval("chr1", 200, 250, "a_small", 100, "+"), + ], + intervals_b=[ + GenomicInterval("chr1", 150, 300, "b1", 100, "+"), + GenomicInterval("chr1", 210, 240, "b2", 100, "+"), + ], + ) + + pairs = {(row[0], row[1]) for row in result} + # a_large [100,400) contains b1 [150,300) and b2 [210,240) + # a_small [200,250) contains b2 [210,240) but not b1 [150,300) + assert pairs == {("a_large", "b1"), ("a_large", "b2"), ("a_small", "b2")}, ( + f"Expected 3 containment pairs, got {pairs}" + ) + + +def test_contains_cross_chromosome(giql_query): + """ + GIVEN a table with intervals on multiple chromosomes + WHEN CONTAINS is used with a chr1 point literal + THEN only chr1 intervals are considered + """ + result = giql_query( + "SELECT * FROM intervals WHERE interval CONTAINS 'chr1:150'", + tables=["intervals"], + intervals=[ + GenomicInterval("chr1", 100, 300, "chr1_hit", 100, "+"), + GenomicInterval("chr2", 100, 300, "chr2_miss", 100, "+"), + ], + ) + + names = {row[3] for row in result} + assert names == {"chr1_hit"} + + +def test_contains_all_set_predicate(giql_query): + """ + GIVEN a table with intervals of varying sizes + WHEN CONTAINS ALL is used with multiple points + THEN only intervals containing all points are returned + """ + result = giql_query( + """ + SELECT * FROM intervals + WHERE interval CONTAINS ALL('chr1:150', 'chr1:300') + """, + tables=["intervals"], + intervals=[ + GenomicInterval("chr1", 100, 400, "large", 100, "+"), + GenomicInterval("chr1", 100, 200, "left", 100, "+"), + GenomicInterval("chr1", 250, 400, "right", 100, "+"), + ], + ) + + names = {row[3] for row in result} + # Only large [100,400) contains both 150 and 300 + assert names == {"large"}, f"Expected only large, got {names}" diff --git a/tests/integration/bedtools/test_distance.py b/tests/integration/bedtools/test_distance.py new file mode 100644 index 0000000..4fc53e7 --- /dev/null +++ b/tests/integration/bedtools/test_distance.py @@ -0,0 +1,204 @@ +"""Integration tests for GIQL DISTANCE operator. + +These tests validate that GIQL's DISTANCE function computes the correct +genomic distance between intervals, cross-validated against bedtools +closest -d output. +""" + +from .utils.data_models import GenomicInterval + + +def test_distance_non_overlapping(giql_query): + """ + GIVEN two non-overlapping intervals with a known gap + WHEN DISTANCE is computed via GIQL + THEN the distance equals b.start - a.end (half-open arithmetic) + """ + result = giql_query( + """ + SELECT DISTANCE(a.interval, b.interval) AS dist + FROM intervals_a a, intervals_b b + """, + tables=["intervals_a", "intervals_b"], + intervals_a=[GenomicInterval("chr1", 100, 200, "a1", 100, "+")], + intervals_b=[GenomicInterval("chr1", 300, 400, "b1", 100, "+")], + ) + + assert len(result) == 1 + # Half-open distance: b.start - a.end = 300 - 200 = 100 + assert result[0][0] == 100 + + +def test_distance_overlapping(giql_query): + """ + GIVEN two overlapping intervals + WHEN DISTANCE is computed via GIQL + THEN the distance is 0 + """ + result = giql_query( + """ + SELECT DISTANCE(a.interval, b.interval) AS dist + FROM intervals_a a, intervals_b b + """, + tables=["intervals_a", "intervals_b"], + intervals_a=[GenomicInterval("chr1", 100, 300, "a1", 100, "+")], + intervals_b=[GenomicInterval("chr1", 200, 400, "b1", 100, "+")], + ) + + assert len(result) == 1 + assert result[0][0] == 0 + + +def test_distance_adjacent(giql_query): + """ + GIVEN two adjacent intervals (touching, half-open coordinates) + WHEN DISTANCE is computed via GIQL + THEN the distance is 0 (half-open: end of A == start of B means no gap) + """ + result = giql_query( + """ + SELECT DISTANCE(a.interval, b.interval) AS dist + FROM intervals_a a, intervals_b b + """, + tables=["intervals_a", "intervals_b"], + intervals_a=[GenomicInterval("chr1", 100, 200, "a1", 100, "+")], + intervals_b=[GenomicInterval("chr1", 200, 300, "b1", 100, "+")], + ) + + assert len(result) == 1 + # Half-open adjacent: b.start - a.end = 200 - 200 = 0 + assert result[0][0] == 0 + + +def test_distance_cross_chromosome(giql_query): + """ + GIVEN two intervals on different chromosomes + WHEN DISTANCE is computed via GIQL + THEN the distance is NULL (cross-chromosome distance is undefined) + """ + result = giql_query( + """ + SELECT DISTANCE(a.interval, b.interval) AS dist + FROM intervals_a a, intervals_b b + """, + tables=["intervals_a", "intervals_b"], + intervals_a=[GenomicInterval("chr1", 100, 200, "a1", 100, "+")], + intervals_b=[GenomicInterval("chr2", 100, 200, "b1", 100, "+")], + ) + + assert len(result) == 1 + assert result[0][0] is None, ( + f"Cross-chromosome distance should be NULL, got {result[0][0]}" + ) + + +def test_distance_signed_downstream(giql_query): + """ + GIVEN B is downstream of A (B starts after A ends) on + strand + WHEN DISTANCE with signed := true is computed + THEN the distance is positive (downstream) + """ + result = giql_query( + """ + SELECT DISTANCE(a.interval, b.interval, signed := true) AS dist + FROM intervals_a a, intervals_b b + """, + tables=["intervals_a", "intervals_b"], + intervals_a=[GenomicInterval("chr1", 100, 200, "a1", 100, "+")], + intervals_b=[GenomicInterval("chr1", 300, 400, "b1", 100, "+")], + ) + + assert len(result) == 1 + assert result[0][0] > 0, f"Expected positive (downstream), got {result[0][0]}" + + +def test_distance_signed_upstream(giql_query): + """ + GIVEN B is upstream of A (B ends before A starts) on + strand + WHEN DISTANCE with signed := true is computed + THEN the distance is negative (upstream) + """ + result = giql_query( + """ + SELECT DISTANCE(a.interval, b.interval, signed := true) AS dist + FROM intervals_a a, intervals_b b + """, + tables=["intervals_a", "intervals_b"], + intervals_a=[GenomicInterval("chr1", 300, 400, "a1", 100, "+")], + intervals_b=[GenomicInterval("chr1", 100, 200, "b1", 100, "+")], + ) + + assert len(result) == 1 + assert result[0][0] < 0, f"Expected negative (upstream), got {result[0][0]}" + + +def test_distance_stranded_unstranded_input(giql_query): + """ + GIVEN one interval with strand "." (unstranded) + WHEN DISTANCE with stranded := true is computed + THEN the distance is NULL (stranded mode requires valid strand) + """ + result = giql_query( + """ + SELECT DISTANCE(a.interval, b.interval, stranded := true) AS dist + FROM intervals_a a, intervals_b b + """, + tables=["intervals_a", "intervals_b"], + intervals_a=[GenomicInterval("chr1", 100, 200, "a1", 100, ".")], + intervals_b=[GenomicInterval("chr1", 300, 400, "b1", 100, "+")], + ) + + assert len(result) == 1 + assert result[0][0] is None, ( + f"Stranded distance with '.' strand should be NULL, got {result[0][0]}" + ) + + +def test_distance_stranded_same_strand(giql_query): + """ + GIVEN two non-overlapping intervals both on + strand + WHEN DISTANCE with stranded := true is computed + THEN the distance is computed normally (same strand is valid) + """ + result = giql_query( + """ + SELECT DISTANCE(a.interval, b.interval, stranded := true) AS dist + FROM intervals_a a, intervals_b b + """, + tables=["intervals_a", "intervals_b"], + intervals_a=[GenomicInterval("chr1", 100, 200, "a1", 100, "+")], + intervals_b=[GenomicInterval("chr1", 300, 400, "b1", 100, "+")], + ) + + assert len(result) == 1 + # Both on same + strand, distance should be computed normally + assert result[0][0] == 100, f"Expected 100, got {result[0][0]}" + + +def test_distance_signed_stranded_minus_strand(giql_query): + """ + GIVEN two non-overlapping intervals on - strand, B downstream genomically + WHEN DISTANCE with signed := true, stranded := true is computed + THEN the sign is inverted due to - strand (downstream + genomically is upstream in transcript orientation) + """ + result = giql_query( + """ + SELECT DISTANCE( + a.interval, b.interval, + signed := true, + stranded := true + ) AS dist + FROM intervals_a a, intervals_b b + """, + tables=["intervals_a", "intervals_b"], + intervals_a=[GenomicInterval("chr1", 100, 200, "a1", 100, "-")], + intervals_b=[GenomicInterval("chr1", 300, 400, "b1", 100, "-")], + ) + + assert len(result) == 1 + # On - strand with signed+stranded, genomic downstream becomes + # transcript upstream, so sign should be negative + assert result[0][0] < 0, ( + f"Expected negative distance on - strand, got {result[0][0]}" + ) diff --git a/tests/integration/bedtools/test_intersect.py b/tests/integration/bedtools/test_intersect.py new file mode 100644 index 0000000..f4bfd43 --- /dev/null +++ b/tests/integration/bedtools/test_intersect.py @@ -0,0 +1,327 @@ +"""Integration tests for GIQL INTERSECTS operator. + +These tests validate that GIQL's INTERSECTS operator produces identical +results to bedtools intersect command. +""" + +from giql import transpile + +from .utils.bedtools_wrapper import intersect +from .utils.comparison import compare_results +from .utils.data_models import GenomicInterval +from .utils.duckdb_loader import load_intervals + + +def test_intersect_basic_overlap(duckdb_connection): + """ + GIVEN two tables with genomic intervals where some intervals overlap + WHEN a GIQL query uses INTERSECTS predicate in WHERE clause + THEN results match bedtools intersect output exactly + """ + intervals_a = [ + GenomicInterval("chr1", 100, 200, "a1", 100, "+"), + GenomicInterval("chr1", 150, 250, "a2", 200, "+"), + GenomicInterval("chr1", 300, 400, "a3", 150, "-"), + ] + intervals_b = [ + GenomicInterval("chr1", 180, 220, "b1", 100, "+"), + GenomicInterval("chr1", 350, 450, "b2", 200, "-"), + ] + + load_intervals( + duckdb_connection, + "intervals_a", + [i.to_tuple() for i in intervals_a], + ) + load_intervals( + duckdb_connection, + "intervals_b", + [i.to_tuple() for i in intervals_b], + ) + + bedtools_result = intersect( + [i.to_tuple() for i in intervals_a], + [i.to_tuple() for i in intervals_b], + ) + + sql = transpile( + """ + SELECT DISTINCT a.* + FROM intervals_a a, intervals_b b + WHERE a.interval INTERSECTS b.interval + """, + tables=["intervals_a", "intervals_b"], + ) + giql_result = duckdb_connection.execute(sql).fetchall() + + comparison = compare_results(giql_result, bedtools_result) + assert comparison.match, comparison.failure_message() + + +def test_intersect_partial_overlap(duckdb_connection): + """ + GIVEN intervals with partial overlaps + WHEN INTERSECTS query is executed + THEN results match bedtools partial overlap behavior + """ + intervals_a = [ + GenomicInterval("chr1", 100, 250, "a1", 100, "+"), + GenomicInterval("chr1", 300, 400, "a2", 200, "+"), + ] + intervals_b = [ + GenomicInterval("chr1", 200, 350, "b1", 150, "+"), + ] + + load_intervals( + duckdb_connection, + "intervals_a", + [i.to_tuple() for i in intervals_a], + ) + load_intervals( + duckdb_connection, + "intervals_b", + [i.to_tuple() for i in intervals_b], + ) + + bedtools_result = intersect( + [i.to_tuple() for i in intervals_a], + [i.to_tuple() for i in intervals_b], + ) + + sql = transpile( + """ + SELECT DISTINCT a.* + FROM intervals_a a, intervals_b b + WHERE a.interval INTERSECTS b.interval + """, + tables=["intervals_a", "intervals_b"], + ) + giql_result = duckdb_connection.execute(sql).fetchall() + + comparison = compare_results(giql_result, bedtools_result) + assert comparison.match, comparison.failure_message() + + +def test_intersect_no_overlap(duckdb_connection): + """ + GIVEN two sets of intervals with no overlaps + WHEN INTERSECTS query is executed + THEN no results returned matching bedtools empty output + """ + intervals_a = [ + GenomicInterval("chr1", 100, 200, "a1", 100, "+"), + ] + intervals_b = [ + GenomicInterval("chr1", 300, 400, "b1", 150, "+"), + ] + + load_intervals( + duckdb_connection, + "intervals_a", + [i.to_tuple() for i in intervals_a], + ) + load_intervals( + duckdb_connection, + "intervals_b", + [i.to_tuple() for i in intervals_b], + ) + + bedtools_result = intersect( + [i.to_tuple() for i in intervals_a], + [i.to_tuple() for i in intervals_b], + ) + + sql = transpile( + """ + SELECT DISTINCT a.* + FROM intervals_a a, intervals_b b + WHERE a.interval INTERSECTS b.interval + """, + tables=["intervals_a", "intervals_b"], + ) + giql_result = duckdb_connection.execute(sql).fetchall() + + comparison = compare_results(giql_result, bedtools_result) + assert comparison.match, comparison.failure_message() + + +def test_intersect_adjacent_intervals(duckdb_connection): + """ + GIVEN intervals that touch but don't overlap using half-open coordinates + WHEN INTERSECTS query is executed + THEN no results returned because adjacent does not mean overlapping + """ + intervals_a = [ + GenomicInterval("chr1", 100, 200, "a1", 100, "+"), + ] + intervals_b = [ + GenomicInterval("chr1", 200, 300, "b1", 150, "+"), + ] + + load_intervals( + duckdb_connection, + "intervals_a", + [i.to_tuple() for i in intervals_a], + ) + load_intervals( + duckdb_connection, + "intervals_b", + [i.to_tuple() for i in intervals_b], + ) + + bedtools_result = intersect( + [i.to_tuple() for i in intervals_a], + [i.to_tuple() for i in intervals_b], + ) + + sql = transpile( + """ + SELECT DISTINCT a.* + FROM intervals_a a, intervals_b b + WHERE a.interval INTERSECTS b.interval + """, + tables=["intervals_a", "intervals_b"], + ) + giql_result = duckdb_connection.execute(sql).fetchall() + + comparison = compare_results(giql_result, bedtools_result) + assert comparison.match, comparison.failure_message() + + +def test_intersect_multiple_chromosomes(duckdb_connection): + """ + GIVEN intervals on different chromosomes + WHEN INTERSECTS query is executed + THEN only same-chromosome overlaps are returned + """ + intervals_a = [ + GenomicInterval("chr1", 100, 200, "a1", 100, "+"), + GenomicInterval("chr2", 150, 250, "a2", 200, "+"), + ] + intervals_b = [ + GenomicInterval("chr1", 150, 250, "b1", 150, "+"), + GenomicInterval("chr2", 200, 300, "b2", 100, "+"), + ] + + load_intervals( + duckdb_connection, + "intervals_a", + [i.to_tuple() for i in intervals_a], + ) + load_intervals( + duckdb_connection, + "intervals_b", + [i.to_tuple() for i in intervals_b], + ) + + bedtools_result = intersect( + [i.to_tuple() for i in intervals_a], + [i.to_tuple() for i in intervals_b], + ) + + sql = transpile( + """ + SELECT DISTINCT a.* + FROM intervals_a a, intervals_b b + WHERE a.interval INTERSECTS b.interval + """, + tables=["intervals_a", "intervals_b"], + ) + giql_result = duckdb_connection.execute(sql).fetchall() + + comparison = compare_results(giql_result, bedtools_result) + assert comparison.match, comparison.failure_message() + + +def test_intersect_literal_range(giql_query): + """ + GIVEN a table with intervals on chr1 + WHEN INTERSECTS is used with a literal range string + THEN only intervals overlapping the literal range are returned + """ + intervals = [ + GenomicInterval("chr1", 100, 200, "hit1", 100, "+"), + GenomicInterval("chr1", 180, 250, "hit2", 100, "+"), + GenomicInterval("chr1", 300, 400, "miss", 100, "+"), + ] + + result = giql_query( + "SELECT * FROM intervals WHERE interval INTERSECTS 'chr1:150-220'", + tables=["intervals"], + intervals=intervals, + ) + + names = {row[3] for row in result} + assert names == {"hit1", "hit2"}, f"Expected hit1+hit2, got {names}" + + +def test_intersect_literal_cross_chromosome(giql_query): + """ + GIVEN a table with intervals on chr1 and chr2 + WHEN INTERSECTS is used with a chr2 literal range + THEN only chr2 intervals overlapping the range are returned + """ + intervals = [ + GenomicInterval("chr1", 100, 200, "chr1_int", 100, "+"), + GenomicInterval("chr2", 100, 200, "chr2_hit", 100, "+"), + GenomicInterval("chr2", 300, 400, "chr2_miss", 100, "+"), + ] + + result = giql_query( + "SELECT * FROM intervals WHERE interval INTERSECTS 'chr2:150-250'", + tables=["intervals"], + intervals=intervals, + ) + + names = {row[3] for row in result} + assert names == {"chr2_hit"}, f"Expected only chr2_hit, got {names}" + + +def test_intersect_any_set_predicate(giql_query): + """ + GIVEN a table with intervals across chromosomes + WHEN INTERSECTS ANY is used with multiple ranges + THEN intervals overlapping any of the ranges are returned + """ + intervals = [ + GenomicInterval("chr1", 100, 200, "chr1_hit", 100, "+"), + GenomicInterval("chr2", 300, 400, "chr2_hit", 100, "+"), + GenomicInterval("chr3", 100, 200, "chr3_miss", 100, "+"), + ] + + result = giql_query( + """ + SELECT * FROM intervals + WHERE interval INTERSECTS ANY('chr1:150-250', 'chr2:350-450') + """, + tables=["intervals"], + intervals=intervals, + ) + + names = {row[3] for row in result} + assert names == {"chr1_hit", "chr2_hit"}, f"Expected chr1_hit+chr2_hit, got {names}" + + +def test_intersect_all_set_predicate(giql_query): + """ + GIVEN a table with intervals of varying sizes + WHEN INTERSECTS ALL is used with two ranges + THEN only intervals overlapping both ranges are returned + """ + intervals = [ + GenomicInterval("chr1", 100, 400, "large", 100, "+"), + GenomicInterval("chr1", 100, 200, "left_only", 100, "+"), + GenomicInterval("chr1", 250, 400, "right_only", 100, "+"), + ] + + result = giql_query( + """ + SELECT * FROM intervals + WHERE interval INTERSECTS ALL('chr1:120-180', 'chr1:280-350') + """, + tables=["intervals"], + intervals=intervals, + ) + + names = {row[3] for row in result} + assert names == {"large"}, f"Expected only large, got {names}" diff --git a/tests/integration/bedtools/test_merge.py b/tests/integration/bedtools/test_merge.py new file mode 100644 index 0000000..b9724c6 --- /dev/null +++ b/tests/integration/bedtools/test_merge.py @@ -0,0 +1,191 @@ +"""Integration tests for GIQL MERGE operator. + +These tests validate that GIQL's MERGE operator produces identical +results to bedtools merge command. +""" + +from giql import transpile + +from .utils.bedtools_wrapper import merge +from .utils.comparison import compare_results +from .utils.data_models import GenomicInterval +from .utils.duckdb_loader import load_intervals + + +def test_merge_adjacent_intervals(duckdb_connection): + """ + GIVEN a set of adjacent intervals (bookended, half-open) + WHEN MERGE operator is applied + THEN adjacent intervals are merged into single intervals + """ + intervals = [ + GenomicInterval("chr1", 100, 200, "i1", 100, "+"), + GenomicInterval("chr1", 200, 300, "i2", 150, "+"), + GenomicInterval("chr1", 300, 400, "i3", 200, "+"), + ] + + load_intervals( + duckdb_connection, + "intervals", + [i.to_tuple() for i in intervals], + ) + + bedtools_result = merge([i.to_tuple() for i in intervals]) + + sql = transpile( + "SELECT MERGE(interval) FROM intervals", + tables=["intervals"], + ) + giql_result = duckdb_connection.execute(sql).fetchall() + + comparison = compare_results(giql_result, bedtools_result) + assert comparison.match, comparison.failure_message() + + +def test_merge_overlapping_intervals(duckdb_connection): + """ + GIVEN a set of overlapping intervals + WHEN MERGE operator is applied + THEN overlapping intervals are merged + """ + intervals = [ + GenomicInterval("chr1", 100, 250, "i1", 100, "+"), + GenomicInterval("chr1", 200, 350, "i2", 150, "+"), + GenomicInterval("chr1", 300, 400, "i3", 200, "+"), + ] + + load_intervals( + duckdb_connection, + "intervals", + [i.to_tuple() for i in intervals], + ) + + bedtools_result = merge([i.to_tuple() for i in intervals]) + + sql = transpile( + "SELECT MERGE(interval) FROM intervals", + tables=["intervals"], + ) + giql_result = duckdb_connection.execute(sql).fetchall() + + comparison = compare_results(giql_result, bedtools_result) + assert comparison.match, comparison.failure_message() + + +def test_merge_separated_intervals(duckdb_connection): + """ + GIVEN intervals with gaps between them + WHEN MERGE operator is applied + THEN separated intervals remain separate + """ + intervals = [ + GenomicInterval("chr1", 100, 200, "i1", 100, "+"), + GenomicInterval("chr1", 300, 400, "i2", 150, "+"), + GenomicInterval("chr1", 500, 600, "i3", 200, "+"), + ] + + load_intervals( + duckdb_connection, + "intervals", + [i.to_tuple() for i in intervals], + ) + + bedtools_result = merge([i.to_tuple() for i in intervals]) + + sql = transpile( + "SELECT MERGE(interval) FROM intervals", + tables=["intervals"], + ) + giql_result = duckdb_connection.execute(sql).fetchall() + + comparison = compare_results(giql_result, bedtools_result) + assert comparison.match, comparison.failure_message() + + +def test_merge_multiple_chromosomes(duckdb_connection): + """ + GIVEN intervals on different chromosomes with overlaps within each + WHEN MERGE operator is applied + THEN merging occurs per chromosome independently + """ + intervals = [ + GenomicInterval("chr1", 100, 200, "i1", 100, "+"), + GenomicInterval("chr1", 180, 300, "i2", 150, "+"), + GenomicInterval("chr2", 100, 200, "i3", 100, "+"), + GenomicInterval("chr2", 180, 300, "i4", 150, "+"), + ] + + load_intervals( + duckdb_connection, + "intervals", + [i.to_tuple() for i in intervals], + ) + + bedtools_result = merge([i.to_tuple() for i in intervals]) + + sql = transpile( + "SELECT MERGE(interval) FROM intervals", + tables=["intervals"], + ) + giql_result = duckdb_connection.execute(sql).fetchall() + + comparison = compare_results(giql_result, bedtools_result) + assert comparison.match, comparison.failure_message() + + +def test_merge_with_distance(giql_query): + """ + GIVEN intervals with gaps of 50bp and 150bp + WHEN MERGE with distance=100 is applied + THEN gaps <= 100bp are bridged, gaps > 100bp remain separate + """ + intervals = [ + GenomicInterval("chr1", 100, 200, "i1", 100, "+"), + GenomicInterval("chr1", 250, 350, "i2", 150, "+"), # 50bp gap + GenomicInterval("chr1", 500, 600, "i3", 200, "+"), # 150bp gap + ] + + giql_result = giql_query( + "SELECT MERGE(interval, 100) FROM intervals", + tables=["intervals"], + intervals=intervals, + ) + + # i1 and i2 bridge (50bp gap <= 100), i3 stays separate (150bp gap) + assert len(giql_result) == 2, f"Expected 2 merged groups, got {len(giql_result)}" + + sorted_result = sorted(giql_result, key=lambda r: r[1]) + assert sorted_result[0][1] == 100 # merged start + assert sorted_result[0][2] == 350 # merged end + assert sorted_result[1][1] == 500 + assert sorted_result[1][2] == 600 + + +def test_merge_stranded_giql(giql_query): + """ + GIVEN overlapping intervals on different strands + WHEN MERGE with stranded := true is applied via GIQL + THEN intervals are merged per-strand, matching bedtools merge -s count + """ + intervals = [ + GenomicInterval("chr1", 100, 200, "i1", 100, "+"), + GenomicInterval("chr1", 150, 250, "i2", 150, "+"), + GenomicInterval("chr1", 120, 220, "i3", 200, "-"), + GenomicInterval("chr1", 180, 280, "i4", 100, "-"), + ] + + bedtools_result = merge( + [i.to_tuple() for i in intervals], + strand_mode="same", + ) + + giql_result = giql_query( + "SELECT MERGE(interval, stranded := true) FROM intervals", + tables=["intervals"], + intervals=intervals, + ) + + # Should have 2 merged intervals: one for + strand, one for - strand + assert len(giql_result) == len(bedtools_result), ( + f"Expected {len(bedtools_result)} merged groups, got {len(giql_result)}" + ) diff --git a/tests/integration/bedtools/test_nearest.py b/tests/integration/bedtools/test_nearest.py new file mode 100644 index 0000000..3a91641 --- /dev/null +++ b/tests/integration/bedtools/test_nearest.py @@ -0,0 +1,314 @@ +"""Integration tests for GIQL NEAREST operator. + +These tests validate that GIQL's NEAREST operator produces results +consistent with bedtools closest command. +""" + +from giql import transpile + +from .utils.bedtools_wrapper import closest +from .utils.data_models import GenomicInterval +from .utils.duckdb_loader import load_intervals + + +def test_nearest_non_overlapping(duckdb_connection): + """ + GIVEN two sets of non-overlapping intervals + WHEN NEAREST operator is applied + THEN each interval in A finds its closest neighbor in B, matching bedtools closest + """ + intervals_a = [ + GenomicInterval("chr1", 100, 200, "a1", 100, "+"), + GenomicInterval("chr1", 500, 600, "a2", 150, "+"), + ] + intervals_b = [ + GenomicInterval("chr1", 250, 300, "b1", 100, "+"), + GenomicInterval("chr1", 350, 400, "b2", 150, "+"), + GenomicInterval("chr1", 700, 800, "b3", 200, "+"), + ] + + load_intervals( + duckdb_connection, + "intervals_a", + [i.to_tuple() for i in intervals_a], + ) + load_intervals( + duckdb_connection, + "intervals_b", + [i.to_tuple() for i in intervals_b], + ) + + bedtools_result = closest( + [i.to_tuple() for i in intervals_a], + [i.to_tuple() for i in intervals_b], + ) + + sql = transpile( + """ + SELECT a.*, b.* + FROM intervals_a a + CROSS JOIN LATERAL NEAREST( + intervals_b, + reference := a.interval, + k := 1 + ) b + ORDER BY a.chrom, a.start + """, + tables=["intervals_a", "intervals_b"], + ) + giql_result = duckdb_connection.execute(sql).fetchall() + + # Verify row counts match + assert len(giql_result) == len(bedtools_result), ( + f"Row count mismatch: GIQL={len(giql_result)}, bedtools={len(bedtools_result)}" + ) + + # Verify each A interval found the correct B neighbor + for giql_row, bt_row in zip( + sorted(giql_result, key=lambda r: (r[0], r[1])), + sorted(bedtools_result, key=lambda r: (r[0], r[1])), + ): + # Compare A interval name + assert giql_row[3] == bt_row[3], ( + f"A name mismatch: GIQL={giql_row[3]}, bedtools={bt_row[3]}" + ) + # Compare matched B interval name + assert giql_row[9] == bt_row[9], ( + f"B name mismatch: GIQL={giql_row[9]}, bedtools={bt_row[9]}" + ) + + +def test_nearest_multiple_candidates(giql_query): + """ + GIVEN an interval in A with multiple equidistant intervals in B + WHEN NEAREST operator is applied + THEN one of the equidistant intervals is returned + """ + result = giql_query( + """ + SELECT a.*, b.* + FROM intervals_a a + CROSS JOIN LATERAL NEAREST( + intervals_b, + reference := a.interval, + k := 1 + ) b + """, + tables=["intervals_a", "intervals_b"], + intervals_a=[GenomicInterval("chr1", 300, 400, "a1", 100, "+")], + intervals_b=[ + GenomicInterval("chr1", 100, 200, "b1", 100, "+"), + GenomicInterval("chr1", 500, 600, "b2", 150, "+"), + ], + ) + + assert len(result) == 1 + assert result[0][3] == "a1" + # Nearest could be either b1 or b2 (both equidistant at 100bp) + assert result[0][9] in ("b1", "b2") + + +def test_nearest_cross_chromosome(duckdb_connection): + """ + GIVEN intervals on different chromosomes + WHEN NEAREST operator is applied + THEN each interval finds nearest only on same chromosome, + matching bedtools closest + """ + intervals_a = [ + GenomicInterval("chr1", 100, 200, "a1", 100, "+"), + GenomicInterval("chr2", 100, 200, "a2", 150, "+"), + ] + intervals_b = [ + GenomicInterval("chr1", 300, 400, "b1", 100, "+"), + GenomicInterval("chr2", 300, 400, "b2", 150, "+"), + ] + + load_intervals( + duckdb_connection, + "intervals_a", + [i.to_tuple() for i in intervals_a], + ) + load_intervals( + duckdb_connection, + "intervals_b", + [i.to_tuple() for i in intervals_b], + ) + + bedtools_result = closest( + [i.to_tuple() for i in intervals_a], + [i.to_tuple() for i in intervals_b], + ) + + sql = transpile( + """ + SELECT a.*, b.* + FROM intervals_a a + CROSS JOIN LATERAL NEAREST( + intervals_b, + reference := a.interval, + k := 1 + ) b + ORDER BY a.chrom, a.start + """, + tables=["intervals_a", "intervals_b"], + ) + giql_result = duckdb_connection.execute(sql).fetchall() + + assert len(giql_result) == len(bedtools_result) + + for giql_row, bt_row in zip( + sorted(giql_result, key=lambda r: (r[0], r[1])), + sorted(bedtools_result, key=lambda r: (r[0], r[1])), + ): + # A and matched B should be on the same chromosome + assert giql_row[0] == giql_row[6], ( + f"Cross-chromosome match: A on {giql_row[0]}, B on {giql_row[6]}" + ) + assert giql_row[3] == bt_row[3] + assert giql_row[9] == bt_row[9] + + +def test_nearest_boundary_cases(giql_query): + """ + GIVEN adjacent intervals (touching but not overlapping) + WHEN NEAREST operator is applied + THEN adjacent interval is reported as nearest (distance = 0) + """ + result = giql_query( + """ + SELECT a.*, b.* + FROM intervals_a a + CROSS JOIN LATERAL NEAREST( + intervals_b, + reference := a.interval, + k := 1 + ) b + """, + tables=["intervals_a", "intervals_b"], + intervals_a=[GenomicInterval("chr1", 100, 200, "a1", 100, "+")], + intervals_b=[ + GenomicInterval("chr1", 200, 300, "b1", 150, "+"), + GenomicInterval("chr1", 500, 600, "b2", 200, "+"), + ], + ) + + assert len(result) == 1 + # b1 is adjacent (distance 0), b2 is far (distance 300) + assert result[0][9] == "b1" + + +def test_nearest_k_greater_than_one(giql_query): + """ + GIVEN one query interval and three database intervals at different distances + WHEN NEAREST with k := 3 is applied + THEN all 3 neighbors are returned, ordered by distance + """ + result = giql_query( + """ + SELECT a.name, b.name + FROM intervals_a a + CROSS JOIN LATERAL NEAREST( + intervals_b, + reference := a.interval, + k := 3 + ) b + """, + tables=["intervals_a", "intervals_b"], + intervals_a=[GenomicInterval("chr1", 200, 300, "a1", 100, "+")], + intervals_b=[ + GenomicInterval("chr1", 100, 150, "b_far", 100, "+"), + GenomicInterval("chr1", 310, 350, "b_near", 100, "+"), + GenomicInterval("chr1", 500, 600, "b_farther", 100, "+"), + ], + ) + + b_names = [row[1] for row in result] + assert len(b_names) == 3, f"Expected 3 results for k=3, got {len(b_names)}" + assert set(b_names) == {"b_far", "b_near", "b_farther"} + + +def test_nearest_k_exceeds_available(giql_query): + """ + GIVEN one query interval and only two database intervals + WHEN NEAREST with k := 5 is applied + THEN only 2 rows are returned (fewer than k available) + """ + result = giql_query( + """ + SELECT a.name, b.name + FROM intervals_a a + CROSS JOIN LATERAL NEAREST( + intervals_b, + reference := a.interval, + k := 5 + ) b + """, + tables=["intervals_a", "intervals_b"], + intervals_a=[GenomicInterval("chr1", 200, 300, "a1", 100, "+")], + intervals_b=[ + GenomicInterval("chr1", 100, 150, "b1", 100, "+"), + GenomicInterval("chr1", 400, 500, "b2", 100, "+"), + ], + ) + + assert len(result) == 2, f"Expected 2 results (fewer than k=5), got {len(result)}" + + +def test_nearest_max_distance(giql_query): + """ + GIVEN one query interval, one near and one far database interval + WHEN NEAREST with max_distance := 50 is applied + THEN only the near interval (within 50bp) is returned + """ + result = giql_query( + """ + SELECT a.name, b.name + FROM intervals_a a + CROSS JOIN LATERAL NEAREST( + intervals_b, + reference := a.interval, + k := 5, + max_distance := 50 + ) b + """, + tables=["intervals_a", "intervals_b"], + intervals_a=[GenomicInterval("chr1", 200, 300, "a1", 100, "+")], + intervals_b=[ + GenomicInterval("chr1", 310, 350, "b_near", 100, "+"), + GenomicInterval("chr1", 500, 600, "b_far", 100, "+"), + ], + ) + + b_names = [row[1] for row in result] + # b_near is 10bp away (310 - 300), b_far is 200bp away (500 - 300) + assert b_names == ["b_near"], f"Expected only b_near within 50bp, got {b_names}" + + +def test_nearest_standalone_literal_reference(giql_query): + """ + GIVEN a table with intervals + WHEN NEAREST is used in standalone mode with a literal reference + THEN the nearest intervals to the literal position are returned + """ + result = giql_query( + """ + SELECT * + FROM NEAREST( + intervals, + reference := 'chr1:350-360', + k := 2 + ) + """, + tables=["intervals"], + intervals=[ + GenomicInterval("chr1", 100, 200, "near", 100, "+"), + GenomicInterval("chr1", 400, 500, "mid", 100, "+"), + GenomicInterval("chr1", 800, 900, "far", 100, "+"), + ], + ) + + names = [row[3] for row in result] + assert len(names) == 2, f"Expected 2 results for k=2, got {len(names)}" + # near is 150bp away, mid is 40bp away, far is 440bp away + assert set(names) == {"near", "mid"} diff --git a/tests/integration/bedtools/test_strand_aware.py b/tests/integration/bedtools/test_strand_aware.py new file mode 100644 index 0000000..f9c8eb9 --- /dev/null +++ b/tests/integration/bedtools/test_strand_aware.py @@ -0,0 +1,369 @@ +"""Integration tests for GIQL strand-aware operations. + +These tests validate that GIQL correctly handles strand-specific interval +operations, matching bedtools behavior with -s and -S flags. +""" + +from giql import transpile + +from .utils.bedtools_wrapper import closest +from .utils.bedtools_wrapper import intersect +from .utils.bedtools_wrapper import merge +from .utils.comparison import compare_results +from .utils.data_models import GenomicInterval +from .utils.duckdb_loader import load_intervals + + +def test_intersect_same_strand(duckdb_connection): + """ + GIVEN intervals on both same and opposite strands + WHEN INTERSECTS with same-strand filter is applied + THEN only same-strand overlaps are reported + """ + intervals_a = [ + GenomicInterval("chr1", 100, 200, "a1", 100, "+"), + GenomicInterval("chr1", 300, 400, "a2", 150, "-"), + ] + intervals_b = [ + GenomicInterval("chr1", 150, 250, "b1", 100, "+"), + GenomicInterval("chr1", 350, 450, "b2", 150, "-"), + GenomicInterval("chr1", 150, 250, "b3", 200, "-"), + ] + + load_intervals( + duckdb_connection, + "intervals_a", + [i.to_tuple() for i in intervals_a], + ) + load_intervals( + duckdb_connection, + "intervals_b", + [i.to_tuple() for i in intervals_b], + ) + + bedtools_result = intersect( + [i.to_tuple() for i in intervals_a], + [i.to_tuple() for i in intervals_b], + strand_mode="same", + ) + + sql = transpile( + """ + SELECT DISTINCT a.* + FROM intervals_a a, intervals_b b + WHERE a.interval INTERSECTS b.interval + AND a.strand = b.strand + """, + tables=["intervals_a", "intervals_b"], + ) + giql_result = duckdb_connection.execute(sql).fetchall() + + comparison = compare_results(giql_result, bedtools_result) + assert comparison.match, comparison.failure_message() + + +def test_intersect_opposite_strand(duckdb_connection): + """ + GIVEN intervals on both same and opposite strands + WHEN INTERSECTS with opposite-strand filter is applied + THEN only opposite-strand overlaps are reported + """ + intervals_a = [ + GenomicInterval("chr1", 100, 200, "a1", 100, "+"), + GenomicInterval("chr1", 300, 400, "a2", 150, "-"), + ] + intervals_b = [ + GenomicInterval("chr1", 150, 250, "b1", 100, "-"), + GenomicInterval("chr1", 350, 450, "b2", 150, "+"), + ] + + load_intervals( + duckdb_connection, + "intervals_a", + [i.to_tuple() for i in intervals_a], + ) + load_intervals( + duckdb_connection, + "intervals_b", + [i.to_tuple() for i in intervals_b], + ) + + bedtools_result = intersect( + [i.to_tuple() for i in intervals_a], + [i.to_tuple() for i in intervals_b], + strand_mode="opposite", + ) + + sql = transpile( + """ + SELECT DISTINCT a.* + FROM intervals_a a, intervals_b b + WHERE a.interval INTERSECTS b.interval + AND a.strand != b.strand + """, + tables=["intervals_a", "intervals_b"], + ) + giql_result = duckdb_connection.execute(sql).fetchall() + + comparison = compare_results(giql_result, bedtools_result) + assert comparison.match, comparison.failure_message() + + +def test_intersect_ignore_strand(duckdb_connection): + """ + GIVEN intervals with various strand combinations + WHEN INTERSECTS without strand requirements is applied + THEN all overlaps are reported regardless of strand + """ + intervals_a = [ + GenomicInterval("chr1", 100, 200, "a1", 100, "+"), + ] + intervals_b = [ + GenomicInterval("chr1", 150, 250, "b1", 100, "+"), + GenomicInterval("chr1", 150, 250, "b2", 150, "-"), + ] + + load_intervals( + duckdb_connection, + "intervals_a", + [i.to_tuple() for i in intervals_a], + ) + load_intervals( + duckdb_connection, + "intervals_b", + [i.to_tuple() for i in intervals_b], + ) + + bedtools_result = intersect( + [i.to_tuple() for i in intervals_a], + [i.to_tuple() for i in intervals_b], + ) + + sql = transpile( + """ + SELECT DISTINCT a.* + FROM intervals_a a, intervals_b b + WHERE a.interval INTERSECTS b.interval + """, + tables=["intervals_a", "intervals_b"], + ) + giql_result = duckdb_connection.execute(sql).fetchall() + + comparison = compare_results(giql_result, bedtools_result) + assert comparison.match, comparison.failure_message() + + +def test_intersect_mixed_strands(duckdb_connection): + """ + GIVEN a complex scenario with +, -, and unstranded intervals + WHEN INTERSECTS with same-strand requirement is applied + THEN results correctly handle strand matching logic + """ + intervals_a = [ + GenomicInterval("chr1", 100, 200, "a1", 100, "+"), + GenomicInterval("chr1", 300, 400, "a2", 150, "-"), + GenomicInterval("chr1", 500, 600, "a3", 200, "."), + ] + intervals_b = [ + GenomicInterval("chr1", 150, 250, "b1", 100, "+"), + GenomicInterval("chr1", 350, 450, "b2", 150, "-"), + GenomicInterval("chr1", 550, 650, "b3", 200, "."), + ] + + load_intervals( + duckdb_connection, + "intervals_a", + [i.to_tuple() for i in intervals_a], + ) + load_intervals( + duckdb_connection, + "intervals_b", + [i.to_tuple() for i in intervals_b], + ) + + bedtools_result = intersect( + [i.to_tuple() for i in intervals_a], + [i.to_tuple() for i in intervals_b], + strand_mode="same", + ) + + sql = transpile( + """ + SELECT DISTINCT a.* + FROM intervals_a a, intervals_b b + WHERE a.interval INTERSECTS b.interval + AND a.strand = b.strand + AND a.strand != '.' + AND b.strand != '.' + """, + tables=["intervals_a", "intervals_b"], + ) + giql_result = duckdb_connection.execute(sql).fetchall() + + comparison = compare_results(giql_result, bedtools_result) + assert comparison.match, comparison.failure_message() + + +def test_nearest_same_strand(duckdb_connection): + """ + GIVEN intervals with candidates on same and opposite strands + WHEN NEAREST with stranded := true is applied + THEN only same-strand nearest intervals are reported + """ + intervals_a = [ + GenomicInterval("chr1", 100, 200, "a1", 100, "+"), + ] + intervals_b = [ + GenomicInterval("chr1", 250, 300, "b1", 100, "+"), + GenomicInterval("chr1", 220, 240, "b2", 150, "-"), + ] + + load_intervals( + duckdb_connection, + "intervals_a", + [i.to_tuple() for i in intervals_a], + ) + load_intervals( + duckdb_connection, + "intervals_b", + [i.to_tuple() for i in intervals_b], + ) + + bedtools_result = closest( + [i.to_tuple() for i in intervals_a], + [i.to_tuple() for i in intervals_b], + strand_mode="same", + ) + + sql = transpile( + """ + SELECT a.*, b.* + FROM intervals_a a + CROSS JOIN LATERAL NEAREST( + intervals_b, + reference := a.interval, + k := 1, + stranded := true + ) b + ORDER BY a.chrom, a.start + """, + tables=["intervals_a", "intervals_b"], + ) + giql_result = duckdb_connection.execute(sql).fetchall() + + assert len(giql_result) == len(bedtools_result) + # Should find b1 (same strand +), not b2 (opposite strand -) + assert giql_result[0][9] == "b1" + + +def test_nearest_opposite_strand(): + """ + GIVEN intervals with candidates on same and opposite strands + WHEN bedtools closest with opposite-strand (-S) requirement is applied + THEN only opposite-strand nearest intervals are reported + + Note: This validates bedtools reference behavior for opposite-strand + nearest. GIQL does not yet support a direct opposite-strand NEAREST + mode; achieving the equivalent would require a post-filter on NEAREST + results. + """ + intervals_a = [ + GenomicInterval("chr1", 100, 200, "a1", 100, "+"), + ] + intervals_b = [ + GenomicInterval("chr1", 250, 300, "b1", 100, "-"), + GenomicInterval("chr1", 220, 240, "b2", 150, "+"), + ] + + bedtools_result = closest( + [i.to_tuple() for i in intervals_a], + [i.to_tuple() for i in intervals_b], + strand_mode="opposite", + ) + + # Verify bedtools returns the opposite-strand interval + assert len(bedtools_result) == 1 + assert bedtools_result[0][3] == "a1" + assert bedtools_result[0][9] == "b1" + + +def test_nearest_ignore_strand(duckdb_connection): + """ + GIVEN intervals on different strands + WHEN NEAREST without strand requirements is applied + THEN closest interval is found regardless of strand + """ + intervals_a = [ + GenomicInterval("chr1", 100, 200, "a1", 100, "+"), + ] + intervals_b = [ + GenomicInterval("chr1", 250, 300, "b1", 100, "+"), + GenomicInterval("chr1", 220, 240, "b2", 150, "-"), + ] + + load_intervals( + duckdb_connection, + "intervals_a", + [i.to_tuple() for i in intervals_a], + ) + load_intervals( + duckdb_connection, + "intervals_b", + [i.to_tuple() for i in intervals_b], + ) + + bedtools_result = closest( + [i.to_tuple() for i in intervals_a], + [i.to_tuple() for i in intervals_b], + ) + + sql = transpile( + """ + SELECT a.*, b.* + FROM intervals_a a + CROSS JOIN LATERAL NEAREST( + intervals_b, + reference := a.interval, + k := 1 + ) b + ORDER BY a.chrom, a.start + """, + tables=["intervals_a", "intervals_b"], + ) + giql_result = duckdb_connection.execute(sql).fetchall() + + assert len(giql_result) == len(bedtools_result) + # b2 is closer (gap of 20bp) regardless of strand + assert giql_result[0][9] == "b2" + + +def test_merge_strand_specific(duckdb_connection, giql_query): + """ + GIVEN overlapping intervals on different strands + WHEN MERGE with stranded := true is applied + THEN intervals are merged per-strand + """ + intervals = [ + GenomicInterval("chr1", 100, 200, "i1", 100, "+"), + GenomicInterval("chr1", 150, 250, "i2", 150, "+"), + GenomicInterval("chr1", 120, 180, "i3", 200, "-"), + GenomicInterval("chr1", 160, 240, "i4", 100, "-"), + ] + + bedtools_result = merge( + [i.to_tuple() for i in intervals], + strand_mode="same", + ) + + giql_result = giql_query( + """ + SELECT MERGE(interval, stranded := true) AS merged + FROM intervals + """, + tables=["intervals"], + intervals=intervals, + ) + + # Should produce exactly 2 merged intervals (one per strand) + assert len(bedtools_result) == 2 + assert len(giql_result) == 2 diff --git a/tests/integration/bedtools/test_within.py b/tests/integration/bedtools/test_within.py new file mode 100644 index 0000000..f2935b5 --- /dev/null +++ b/tests/integration/bedtools/test_within.py @@ -0,0 +1,103 @@ +"""Integration tests for GIQL WITHIN operator. + +These tests validate that GIQL's WITHIN predicate correctly identifies +intervals that fall entirely within a given range. No direct bedtools +equivalent exists, so tests validate against known expected results. +""" + +from .utils.data_models import GenomicInterval + + +def test_within_basic(giql_query): + """ + GIVEN a table with intervals of varying sizes + WHEN WITHIN is used with a range literal + THEN only intervals fully within the range are returned + """ + result = giql_query( + "SELECT * FROM intervals WHERE interval WITHIN 'chr1:100-300'", + tables=["intervals"], + intervals=[ + GenomicInterval("chr1", 150, 250, "inside", 100, "+"), + GenomicInterval("chr1", 50, 150, "partial_left", 100, "+"), + GenomicInterval("chr1", 250, 350, "partial_right", 100, "+"), + GenomicInterval("chr1", 500, 600, "outside", 100, "+"), + ], + ) + + names = {row[3] for row in result} + # inside [150,250) is within [100,300) + # partial_left [50,150) starts before 100 + # partial_right [250,350) ends after 300 + assert names == {"inside"}, f"Expected only inside, got {names}" + + +def test_within_narrow_range(giql_query): + """ + GIVEN a table with intervals of varying sizes + WHEN WITHIN is used with a narrow range + THEN only intervals small enough to fit are returned + """ + result = giql_query( + "SELECT * FROM intervals WHERE interval WITHIN 'chr1:150-160'", + tables=["intervals"], + intervals=[ + GenomicInterval("chr1", 152, 158, "tiny", 100, "+"), + GenomicInterval("chr1", 140, 160, "medium", 100, "+"), + GenomicInterval("chr1", 100, 200, "large", 100, "+"), + ], + ) + + names = {row[3] for row in result} + # tiny [152,158) is within [150,160) + # medium [140,160) starts before 150 + # large [100,200) starts before 150 + assert names == {"tiny"}, f"Expected only tiny, got {names}" + + +def test_within_column_to_column(giql_query): + """ + GIVEN two tables where some intervals in A are within intervals in B + WHEN a.interval WITHIN b.interval is used in WHERE clause + THEN only pairs where A is fully within B are returned + """ + result = giql_query( + """ + SELECT a.name, b.name + FROM intervals_a a, intervals_b b + WHERE a.interval WITHIN b.interval + """, + tables=["intervals_a", "intervals_b"], + intervals_a=[ + GenomicInterval("chr1", 150, 250, "a_inner", 100, "+"), + GenomicInterval("chr1", 50, 400, "a_outer", 100, "+"), + ], + intervals_b=[ + GenomicInterval("chr1", 100, 300, "b1", 100, "+"), + ], + ) + + pairs = {(row[0], row[1]) for row in result} + # a_inner [150,250) is within b1 [100,300) + # a_outer [50,400) is NOT within b1 [100,300) + assert pairs == {("a_inner", "b1")}, f"Expected one pair, got {pairs}" + + +def test_within_exact_boundary(giql_query): + """ + GIVEN an interval whose boundaries exactly match the query range + WHEN WITHIN is used with that exact range + THEN the interval is returned (exact match counts as within) + """ + result = giql_query( + "SELECT * FROM intervals WHERE interval WITHIN 'chr1:100-200'", + tables=["intervals"], + intervals=[ + GenomicInterval("chr1", 100, 200, "exact", 100, "+"), + GenomicInterval("chr1", 99, 200, "start_outside", 100, "+"), + GenomicInterval("chr1", 100, 201, "end_outside", 100, "+"), + ], + ) + + names = {row[3] for row in result} + assert names == {"exact"}, f"Expected only exact, got {names}" diff --git a/tests/integration/bedtools/utils/__init__.py b/tests/integration/bedtools/utils/__init__.py new file mode 100644 index 0000000..3da3c83 --- /dev/null +++ b/tests/integration/bedtools/utils/__init__.py @@ -0,0 +1 @@ +"""Utility modules for bedtools integration tests.""" diff --git a/tests/integration/bedtools/utils/bedtools_wrapper.py b/tests/integration/bedtools/utils/bedtools_wrapper.py new file mode 100644 index 0000000..c61be44 --- /dev/null +++ b/tests/integration/bedtools/utils/bedtools_wrapper.py @@ -0,0 +1,160 @@ +"""Pybedtools wrapper for genomic interval operations.""" + +import pybedtools + + +class BedtoolsError(Exception): + """Raised when bedtools operation fails.""" + + +def create_bedtool(intervals: list[tuple]) -> pybedtools.BedTool: + """Create BedTool object from interval tuples.""" + bed_strings = [] + for interval in intervals: + if len(interval) == 3: + bed_strings.append(f"{interval[0]}\t{interval[1]}\t{interval[2]}") + elif len(interval) >= 6: + chrom, start, end, name, score, strand = interval[:6] + name = name if name is not None else "." + score = score if score is not None else 0 + strand = strand if strand is not None else "." + bed_strings.append(f"{chrom}\t{start}\t{end}\t{name}\t{score}\t{strand}") + else: + raise ValueError(f"Invalid interval format: {interval}") + + bed_string = "\n".join(bed_strings) + return pybedtools.BedTool(bed_string, from_string=True) + + +def intersect( + intervals_a: list[tuple], + intervals_b: list[tuple], + strand_mode: str | None = None, +) -> list[tuple]: + """Find overlapping intervals using bedtools intersect.""" + try: + bt_a = create_bedtool(intervals_a) + bt_b = create_bedtool(intervals_b) + + kwargs = {"u": True} + if strand_mode == "same": + kwargs["s"] = True + elif strand_mode == "opposite": + kwargs["S"] = True + + result = bt_a.intersect(bt_b, **kwargs) + return bedtool_to_tuples(result) + + except Exception as e: + raise BedtoolsError(f"Intersect operation failed: {e}") + + +def merge(intervals: list[tuple], strand_mode: str | None = None) -> list[tuple]: + """Merge overlapping intervals using bedtools merge.""" + try: + bt = create_bedtool(intervals) + bt_sorted = bt.sort() + + kwargs = {} + if strand_mode == "same": + kwargs["s"] = True + + result = bt_sorted.merge(**kwargs) + return bedtool_to_tuples(result, bed_format="bed3") + + except Exception as e: + raise BedtoolsError(f"Merge operation failed: {e}") + + +def closest( + intervals_a: list[tuple], + intervals_b: list[tuple], + strand_mode: str | None = None, + k: int = 1, +) -> list[tuple]: + """Find closest intervals using bedtools closest.""" + try: + bt_a = create_bedtool(intervals_a) + bt_b = create_bedtool(intervals_b) + + bt_a = bt_a.sort() + bt_b = bt_b.sort() + + kwargs = {"d": True, "t": "first"} + if k > 1: + kwargs["k"] = k + if strand_mode == "same": + kwargs["s"] = True + elif strand_mode == "opposite": + kwargs["S"] = True + + result = bt_a.closest(bt_b, **kwargs) + return bedtool_to_tuples(result, bed_format="closest") + + except Exception as e: + raise BedtoolsError(f"Closest operation failed: {e}") + + +def bedtool_to_tuples( + bedtool: pybedtools.BedTool, bed_format: str = "bed6" +) -> list[tuple]: + """Convert BedTool object to list of tuples. + + Args: + bedtool: pybedtools.BedTool object + bed_format: Expected format ('bed3', 'bed6', or 'closest') + + Closest format assumes BED6+BED6+distance (13 fields): + Fields 0-5: A interval (chrom, start, end, name, score, strand) + Fields 6-11: B interval (chrom, start, end, name, score, strand) + Field 12: distance (integer) + """ + rows = [] + + for interval in bedtool: + fields = interval.fields + + if bed_format == "bed3": + rows.append((fields[0], int(fields[1]), int(fields[2]))) + + elif bed_format == "bed6": + while len(fields) < 6: + if len(fields) == 3: + fields.append(".") + elif len(fields) == 4: + fields.append("0") + elif len(fields) == 5: + fields.append(".") + + rows.append( + ( + fields[0], + int(fields[1]), + int(fields[2]), + fields[3] if fields[3] != "." else None, + int(fields[4]) if fields[4] != "." else None, + fields[5] if fields[5] != "." else None, + ) + ) + + elif bed_format == "closest": + if len(fields) < 13: + raise ValueError( + f"Unexpected number of fields for closest: {len(fields)}" + ) + row = [] + for i, field_val in enumerate(fields): + if i in (1, 2, 7, 8, 12): + row.append(int(field_val)) + elif i in (4, 10): + row.append(int(field_val) if field_val != "." else None) + elif i in (3, 5, 9, 11): + row.append(field_val if field_val != "." else None) + else: + row.append(field_val) + rows.append(tuple(row)) + + else: + raise ValueError(f"Unsupported format: {bed_format}") + + return rows diff --git a/tests/integration/bedtools/utils/comparison.py b/tests/integration/bedtools/utils/comparison.py new file mode 100644 index 0000000..562b55c --- /dev/null +++ b/tests/integration/bedtools/utils/comparison.py @@ -0,0 +1,95 @@ +"""Result comparison logic for GIQL vs bedtools outputs.""" + +from typing import Any + +from .data_models import ComparisonResult + +# Sentinel for sorting None values deterministically below all real values. +_NONE_SORT_KEY = (0, "") + + +def _sort_key(row: tuple) -> tuple: + """Generate sort key for order-independent comparison.""" + return tuple(_NONE_SORT_KEY if v is None else (1, v) for v in row) + + +def _values_match(val1: Any, val2: Any, epsilon: float = 1e-9) -> bool: + """Compare two values with appropriate tolerance.""" + if val1 is None and val2 is None: + return True + if val1 is None or val2 is None: + return False + + if isinstance(val1, float) or isinstance(val2, float): + try: + return abs(float(val1) - float(val2)) <= epsilon + except (ValueError, TypeError): + return False + + return val1 == val2 + + +def compare_results( + giql_rows: list[tuple], + bedtools_rows: list[tuple], + epsilon: float = 1e-9, +) -> ComparisonResult: + """Compare GIQL and bedtools results with appropriate tolerance. + + Comparison rules: + - Integer positions/counts: exact match required + - Floating-point values: epsilon tolerance + - Row ordering: order-independent (sorts both result sets) + """ + giql_count = len(giql_rows) + bedtools_count = len(bedtools_rows) + + giql_sorted = sorted(giql_rows, key=_sort_key) + bedtools_sorted = sorted(bedtools_rows, key=_sort_key) + + differences = [] + + if giql_count != bedtools_count: + differences.append( + f"Row count mismatch: GIQL has {giql_count} rows, " + f"bedtools has {bedtools_count} rows" + ) + + max_rows = max(giql_count, bedtools_count) + for i in range(max_rows): + if i >= giql_count: + differences.append( + f"Row {i}: Missing in GIQL, present in bedtools: {bedtools_sorted[i]}" + ) + continue + if i >= bedtools_count: + differences.append( + f"Row {i}: Present in GIQL, missing in bedtools: {giql_sorted[i]}" + ) + continue + + giql_row = giql_sorted[i] + bedtools_row = bedtools_sorted[i] + + if len(giql_row) != len(bedtools_row): + differences.append( + f"Row {i}: Column count mismatch " + f"(GIQL: {len(giql_row)} cols, " + f"bedtools: {len(bedtools_row)} cols)" + ) + continue + + for col_idx, (giql_val, bedtools_val) in enumerate(zip(giql_row, bedtools_row)): + if not _values_match(giql_val, bedtools_val, epsilon): + differences.append( + f"Row {i}, col {col_idx}: " + f"GIQL={giql_val!r} != bedtools={bedtools_val!r}" + ) + + return ComparisonResult( + match=len(differences) == 0, + giql_row_count=giql_count, + bedtools_row_count=bedtools_count, + differences=differences, + comparison_metadata={"epsilon": epsilon, "sorted": True}, + ) diff --git a/tests/integration/bedtools/utils/data_models.py b/tests/integration/bedtools/utils/data_models.py new file mode 100644 index 0000000..defcb82 --- /dev/null +++ b/tests/integration/bedtools/utils/data_models.py @@ -0,0 +1,75 @@ +"""Data models for bedtools integration testing. + +- GenomicInterval: Represents a single genomic interval +- ComparisonResult: Result of comparing GIQL vs bedtools outputs +""" + +from dataclasses import dataclass +from dataclasses import field + + +@dataclass +class GenomicInterval: + """Represents a single genomic interval with all BED file fields.""" + + chrom: str + start: int + end: int + name: str | None = None + score: int | None = None + strand: str | None = None + + def __post_init__(self): + if self.start >= self.end: + raise ValueError( + f"Invalid interval: start ({self.start}) >= end ({self.end})" + ) + if self.start < 0: + raise ValueError(f"Invalid interval: start ({self.start}) < 0") + if self.strand and self.strand not in ["+", "-", "."]: + raise ValueError(f"Invalid strand: {self.strand}") + if self.score is not None and not (0 <= self.score <= 1000): + raise ValueError(f"Invalid score: {self.score}") + + def to_tuple(self) -> tuple: + return ( + self.chrom, + self.start, + self.end, + self.name, + self.score, + self.strand, + ) + + +@dataclass +class ComparisonResult: + """Result of comparing GIQL and bedtools outputs.""" + + match: bool + giql_row_count: int + bedtools_row_count: int + differences: list[str] = field(default_factory=list) + comparison_metadata: dict = field(default_factory=dict) + + def __bool__(self) -> bool: + return self.match + + def failure_message(self) -> str: + if self.match: + return "Results match" + + msg = [ + "Results do not match", + f" GIQL rows: {self.giql_row_count}", + f" Bedtools rows: {self.bedtools_row_count}", + ] + + if self.differences: + msg.append(" Differences:") + for diff in self.differences[:10]: + msg.append(f" - {diff}") + if len(self.differences) > 10: + msg.append(f" ... and {len(self.differences) - 10} more") + + return "\n".join(msg) diff --git a/tests/integration/bedtools/utils/duckdb_loader.py b/tests/integration/bedtools/utils/duckdb_loader.py new file mode 100644 index 0000000..286b543 --- /dev/null +++ b/tests/integration/bedtools/utils/duckdb_loader.py @@ -0,0 +1,27 @@ +"""Utilities for loading interval data into DuckDB tables.""" + + +def load_intervals(conn, table_name: str, intervals: list[tuple]) -> None: + """Load interval tuples into a DuckDB table. + + Creates a table with GIQL default column names (chrom, start, end, + name, score, strand) and inserts the provided intervals. + + Args: + conn: DuckDB connection + table_name: Name of the table to create. Must be a simple + identifier -- this is test-only code with controlled inputs. + intervals: List of 6-element tuples + (chrom, start, end, name, score, strand) + """ + conn.execute(f""" + CREATE TABLE {table_name} ( + chrom VARCHAR, + "start" INTEGER, + "end" INTEGER, + name VARCHAR, + score INTEGER, + strand VARCHAR + ) + """) + conn.executemany(f"INSERT INTO {table_name} VALUES (?,?,?,?,?,?)", intervals) diff --git a/tests/test_cluster_parsing.py b/tests/test_cluster_parsing.py index 9be8cd6..f0f9039 100644 --- a/tests/test_cluster_parsing.py +++ b/tests/test_cluster_parsing.py @@ -32,7 +32,9 @@ def test_from_arg_list_with_property_eq_syntax(self): assert isinstance(cluster_expr, GIQLCluster), ( f"Expected GIQLCluster node, got {type(cluster_expr)}" ) - assert cluster_expr.args.get("stranded") is not None, "Missing stranded parameter" + assert cluster_expr.args.get("stranded") is not None, ( + "Missing stranded parameter" + ) def test_from_arg_list_with_eq_as_positional(self): """ diff --git a/tests/test_nearest_parsing.py b/tests/test_nearest_parsing.py index 9fd5fc5..9e07035 100644 --- a/tests/test_nearest_parsing.py +++ b/tests/test_nearest_parsing.py @@ -213,7 +213,9 @@ def test_from_arg_list_with_eq_as_positional(self): joins = ast.args.get("joins") join = joins[0] lateral_expr = join.this - nearest_expr = lateral_expr.this if hasattr(lateral_expr, "this") else lateral_expr + nearest_expr = ( + lateral_expr.this if hasattr(lateral_expr, "this") else lateral_expr + ) assert isinstance(nearest_expr, GIQLNearest) assert nearest_expr.args.get("k") is None, ( "= should not be treated as named parameter assignment" @@ -235,6 +237,10 @@ def test_from_arg_list_with_kwarg_syntax(self): joins = ast.args.get("joins") join = joins[0] lateral_expr = join.this - nearest_expr = lateral_expr.this if hasattr(lateral_expr, "this") else lateral_expr + nearest_expr = ( + lateral_expr.this if hasattr(lateral_expr, "this") else lateral_expr + ) assert isinstance(nearest_expr, GIQLNearest) - assert nearest_expr.args.get("k") is not None, "Missing k parameter with => syntax" + assert nearest_expr.args.get("k") is not None, ( + "Missing k parameter with => syntax" + )