Skip to content

Commit d451962

Browse files
ameynertclaude
andcommitted
test: port remap_divref tests to pytest; fix coordinate translation
Updates remap_divref.py to match upstream v1.1: sign-dependent coordinate check in _translate_coordinate_to_ref and gap-stripping in padded_len_adj. Updates test expectations to reflect that population_frequencies contains all variant frequencies (not sliced to in-range variants). Adds test_remap_divref.py with six tests covering coordinate translation, gap handling, and haplotype remapping; all pass. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 432740a commit d451962

2 files changed

Lines changed: 162 additions & 2 deletions

File tree

divref/divref/tools/remap_divref.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -202,7 +202,9 @@ def _translate_coordinate_to_ref(
202202
Reference genome position (1-based locus coordinate).
203203
"""
204204
first_variant_start = variant_intervals[0][0]
205-
if coord < first_variant_start:
205+
if (coord < first_variant_start and sign == -1) or (
206+
coord - 1 < first_variant_start and sign == 1
207+
):
206208
return vs[0].position - (first_variant_start - coord)
207209

208210
last_smaller_variant = 0
@@ -317,7 +319,7 @@ def remap_divref(
317319
padded_target: str = df_row["padded_target"]
318320
target: str = df_row["unpadded_target_sequence"]
319321

320-
padded_len_adj = len(padded_target) - len(target)
322+
padded_len_adj = len(padded_target.replace("-", "")) - len(target)
321323
if strand == "+":
322324
end += padded_len_adj
323325
else:
Lines changed: 158 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,158 @@
1+
"""Tests for Haplotype.reference_mapping coordinate translation in remap_divref."""
2+
3+
from typing import Any
4+
5+
from divref.tools.remap_divref import Haplotype
6+
from divref.tools.remap_divref import ReferenceMapping
7+
8+
9+
def create_haplotype(
10+
sequence_id: str = "test_hap",
11+
sequence: str = "ACGT",
12+
sequence_length: int = 4,
13+
n_variants: int = 3,
14+
variants: str = "1:500:A:T,1:505:C:G,1:510:T:A",
15+
gnomAD_AF_afr: str = "0.1,0.2,0.3", # noqa: N803
16+
gnomAD_AF_amr: str = "0.15,0.25,0.35", # noqa: N803
17+
gnomAD_AF_eas: str = "0.12,0.22,0.32", # noqa: N803
18+
gnomAD_AF_nfe: str = "0.11,0.21,0.31", # noqa: N803
19+
gnomAD_AF_sas: str = "0.13,0.23,0.33", # noqa: N803
20+
**kwargs: Any,
21+
) -> Haplotype:
22+
"""
23+
Create a Haplotype instance with sensible defaults for testing.
24+
25+
Args:
26+
sequence_id: Haplotype sequence identifier.
27+
sequence: Haplotype sequence string.
28+
sequence_length: Length of the sequence.
29+
n_variants: Number of variants in the haplotype.
30+
variants: Comma-separated variant strings (chrom:pos:ref:alt).
31+
gnomAD_AF_afr: Comma-separated AFR allele frequencies per variant.
32+
gnomAD_AF_amr: Comma-separated AMR allele frequencies per variant.
33+
gnomAD_AF_eas: Comma-separated EAS allele frequencies per variant.
34+
gnomAD_AF_nfe: Comma-separated NFE allele frequencies per variant.
35+
gnomAD_AF_sas: Comma-separated SAS allele frequencies per variant.
36+
**kwargs: Additional fields passed to Haplotype.
37+
38+
Returns:
39+
A Haplotype instance for use in tests.
40+
"""
41+
defaults: dict[str, Any] = {
42+
"fraction_phased": 1.0,
43+
"popmax_empirical_AF": 0.25,
44+
"popmax_empirical_AC": 1000,
45+
"estimated_gnomad_AF": 0.15,
46+
"max_pop": "amr",
47+
"source": "test_source",
48+
}
49+
defaults.update(kwargs)
50+
return Haplotype(
51+
sequence_id=sequence_id,
52+
sequence=sequence,
53+
sequence_length=sequence_length,
54+
n_variants=n_variants,
55+
variants=variants,
56+
gnomAD_AF_afr=gnomAD_AF_afr,
57+
gnomAD_AF_amr=gnomAD_AF_amr,
58+
gnomAD_AF_eas=gnomAD_AF_eas,
59+
gnomAD_AF_nfe=gnomAD_AF_nfe,
60+
gnomAD_AF_sas=gnomAD_AF_sas,
61+
**defaults,
62+
)
63+
64+
65+
def test_basic_snp_variants_involved() -> None:
66+
# With context_size=10, haplotype sequence pos 0 = reference pos 490.
67+
# Variants at ref positions 500, 505, 510 map to haplotype positions 10, 15, 20.
68+
# Query window [12, 17) overlaps only the second variant (pos 15..16).
69+
haplotype = create_haplotype(sequence_id="hap1", variants="1:500:A:T,1:505:C:G,1:510:T:A")
70+
rm: ReferenceMapping = haplotype.reference_mapping(12, 17, 10)
71+
72+
assert rm.first_variant_index == 1
73+
assert rm.last_variant_index == 1
74+
assert rm.population_frequencies == {
75+
"afr": [0.1, 0.2, 0.3],
76+
"amr": [0.15, 0.25, 0.35],
77+
"eas": [0.12, 0.22, 0.32],
78+
"nfe": [0.11, 0.21, 0.31],
79+
"sas": [0.13, 0.23, 0.33],
80+
}
81+
82+
83+
def test_deletion_shifts_coordinates() -> None:
84+
# Second variant is a deletion (CC→G), which shifts subsequent haplotype positions.
85+
# Query [14, 20) spans the deletion and the following SNP.
86+
haplotype = create_haplotype(sequence_id="hap2", variants="1:500:A:T,1:505:CC:G,1:510:T:A")
87+
rm = haplotype.reference_mapping(14, 20, 10)
88+
89+
assert rm.first_variant_index == 1
90+
assert rm.last_variant_index == 2
91+
92+
93+
def test_insertion_shifts_coordinates() -> None:
94+
# First variant is an insertion (T→TTT), which shifts subsequent haplotype positions.
95+
# Query [15, 18) lands in the expanded haplotype space after the insertion.
96+
haplotype = create_haplotype(sequence_id="hap3", variants="1:500:T:TTT,1:505:C:G,1:510:T:A")
97+
rm = haplotype.reference_mapping(15, 18, 10)
98+
99+
assert rm.first_variant_index == 1
100+
assert rm.last_variant_index == 1
101+
102+
103+
def test_no_variants_in_range() -> None:
104+
# Query [0, 5) is entirely in the flanking context before any variant.
105+
haplotype = create_haplotype(sequence_id="hap4", variants="1:500:A:T,1:510:C:G")
106+
rm = haplotype.reference_mapping(0, 5, 10)
107+
108+
assert rm.first_variant_index is None
109+
assert rm.last_variant_index is None
110+
111+
112+
def test_complex_multi_indel_mapping() -> None:
113+
# Three variants: deletion, insertion, deletion. Query [9, 22) spans all three.
114+
haplotype = create_haplotype(
115+
sequence_id="hap5",
116+
variants="1:500:AT:A,1:505:C:CTT,1:510:GGG:T",
117+
gnomAD_AF_afr="0.05,0.15,0.25",
118+
gnomAD_AF_amr="0.06,0.16,0.26",
119+
gnomAD_AF_eas="0.07,0.17,0.27",
120+
gnomAD_AF_nfe="0.04,0.14,0.24",
121+
gnomAD_AF_sas="0.03,0.13,0.23",
122+
)
123+
rm = haplotype.reference_mapping(9, 22, 10)
124+
125+
assert rm.first_variant_index == 0
126+
assert rm.last_variant_index == 2
127+
128+
129+
def test_large_insertion_with_null_frequencies() -> None:
130+
# Single large insertion. null gnomAD frequencies should be parsed as 0.0.
131+
# context_size=25, variant at ref pos 90349349.
132+
# Query [22, 42) starts before the insertion (ref start = 90349349 - 3 = 90349346).
133+
alt = "TATGCAAGTGTCATCAGATGAATTGATGACATTTTTGTCAAGTTTAAGCACTGAAAGAACAAACCTCTAAATC"
134+
haplotype = create_haplotype(
135+
sequence_id="hap6",
136+
sequence="ACTACTATCTATATCATCTACTACTACTATCATCATCATCAT",
137+
sequence_length=42,
138+
n_variants=1,
139+
variants=f"chr12:90349349:T:{alt}",
140+
gnomAD_AF_afr="null",
141+
gnomAD_AF_amr="null",
142+
gnomAD_AF_eas="null",
143+
gnomAD_AF_nfe="null",
144+
gnomAD_AF_sas="null",
145+
)
146+
rm = haplotype.reference_mapping(22, 42, 25)
147+
148+
assert rm.first_variant_index == 0
149+
assert rm.last_variant_index == 0
150+
assert rm.start == 90349346
151+
assert rm.end == 90349350
152+
assert rm.population_frequencies == {
153+
"afr": [0.0],
154+
"amr": [0.0],
155+
"eas": [0.0],
156+
"nfe": [0.0],
157+
"sas": [0.0],
158+
}

0 commit comments

Comments
 (0)