|
| 1 | +"""Tests for Haplotype.reference_mapping coordinate translation in remap_divref.""" |
| 2 | + |
| 3 | +from typing import Any |
| 4 | + |
| 5 | +from divref.tools.remap_divref import Haplotype |
| 6 | +from divref.tools.remap_divref import ReferenceMapping |
| 7 | + |
| 8 | + |
| 9 | +def create_haplotype( |
| 10 | + sequence_id: str = "test_hap", |
| 11 | + sequence: str = "ACGT", |
| 12 | + sequence_length: int = 4, |
| 13 | + n_variants: int = 3, |
| 14 | + variants: str = "1:500:A:T,1:505:C:G,1:510:T:A", |
| 15 | + gnomAD_AF_afr: str = "0.1,0.2,0.3", # noqa: N803 |
| 16 | + gnomAD_AF_amr: str = "0.15,0.25,0.35", # noqa: N803 |
| 17 | + gnomAD_AF_eas: str = "0.12,0.22,0.32", # noqa: N803 |
| 18 | + gnomAD_AF_nfe: str = "0.11,0.21,0.31", # noqa: N803 |
| 19 | + gnomAD_AF_sas: str = "0.13,0.23,0.33", # noqa: N803 |
| 20 | + **kwargs: Any, |
| 21 | +) -> Haplotype: |
| 22 | + """ |
| 23 | + Create a Haplotype instance with sensible defaults for testing. |
| 24 | +
|
| 25 | + Args: |
| 26 | + sequence_id: Haplotype sequence identifier. |
| 27 | + sequence: Haplotype sequence string. |
| 28 | + sequence_length: Length of the sequence. |
| 29 | + n_variants: Number of variants in the haplotype. |
| 30 | + variants: Comma-separated variant strings (chrom:pos:ref:alt). |
| 31 | + gnomAD_AF_afr: Comma-separated AFR allele frequencies per variant. |
| 32 | + gnomAD_AF_amr: Comma-separated AMR allele frequencies per variant. |
| 33 | + gnomAD_AF_eas: Comma-separated EAS allele frequencies per variant. |
| 34 | + gnomAD_AF_nfe: Comma-separated NFE allele frequencies per variant. |
| 35 | + gnomAD_AF_sas: Comma-separated SAS allele frequencies per variant. |
| 36 | + **kwargs: Additional fields passed to Haplotype. |
| 37 | +
|
| 38 | + Returns: |
| 39 | + A Haplotype instance for use in tests. |
| 40 | + """ |
| 41 | + defaults: dict[str, Any] = { |
| 42 | + "fraction_phased": 1.0, |
| 43 | + "popmax_empirical_AF": 0.25, |
| 44 | + "popmax_empirical_AC": 1000, |
| 45 | + "estimated_gnomad_AF": 0.15, |
| 46 | + "max_pop": "amr", |
| 47 | + "source": "test_source", |
| 48 | + } |
| 49 | + defaults.update(kwargs) |
| 50 | + return Haplotype( |
| 51 | + sequence_id=sequence_id, |
| 52 | + sequence=sequence, |
| 53 | + sequence_length=sequence_length, |
| 54 | + n_variants=n_variants, |
| 55 | + variants=variants, |
| 56 | + gnomAD_AF_afr=gnomAD_AF_afr, |
| 57 | + gnomAD_AF_amr=gnomAD_AF_amr, |
| 58 | + gnomAD_AF_eas=gnomAD_AF_eas, |
| 59 | + gnomAD_AF_nfe=gnomAD_AF_nfe, |
| 60 | + gnomAD_AF_sas=gnomAD_AF_sas, |
| 61 | + **defaults, |
| 62 | + ) |
| 63 | + |
| 64 | + |
| 65 | +def test_basic_snp_variants_involved() -> None: |
| 66 | + # With context_size=10, haplotype sequence pos 0 = reference pos 490. |
| 67 | + # Variants at ref positions 500, 505, 510 map to haplotype positions 10, 15, 20. |
| 68 | + # Query window [12, 17) overlaps only the second variant (pos 15..16). |
| 69 | + haplotype = create_haplotype(sequence_id="hap1", variants="1:500:A:T,1:505:C:G,1:510:T:A") |
| 70 | + rm: ReferenceMapping = haplotype.reference_mapping(12, 17, 10) |
| 71 | + |
| 72 | + assert rm.first_variant_index == 1 |
| 73 | + assert rm.last_variant_index == 1 |
| 74 | + assert rm.population_frequencies == { |
| 75 | + "afr": [0.1, 0.2, 0.3], |
| 76 | + "amr": [0.15, 0.25, 0.35], |
| 77 | + "eas": [0.12, 0.22, 0.32], |
| 78 | + "nfe": [0.11, 0.21, 0.31], |
| 79 | + "sas": [0.13, 0.23, 0.33], |
| 80 | + } |
| 81 | + |
| 82 | + |
| 83 | +def test_deletion_shifts_coordinates() -> None: |
| 84 | + # Second variant is a deletion (CC→G), which shifts subsequent haplotype positions. |
| 85 | + # Query [14, 20) spans the deletion and the following SNP. |
| 86 | + haplotype = create_haplotype(sequence_id="hap2", variants="1:500:A:T,1:505:CC:G,1:510:T:A") |
| 87 | + rm = haplotype.reference_mapping(14, 20, 10) |
| 88 | + |
| 89 | + assert rm.first_variant_index == 1 |
| 90 | + assert rm.last_variant_index == 2 |
| 91 | + |
| 92 | + |
| 93 | +def test_insertion_shifts_coordinates() -> None: |
| 94 | + # First variant is an insertion (T→TTT), which shifts subsequent haplotype positions. |
| 95 | + # Query [15, 18) lands in the expanded haplotype space after the insertion. |
| 96 | + haplotype = create_haplotype(sequence_id="hap3", variants="1:500:T:TTT,1:505:C:G,1:510:T:A") |
| 97 | + rm = haplotype.reference_mapping(15, 18, 10) |
| 98 | + |
| 99 | + assert rm.first_variant_index == 1 |
| 100 | + assert rm.last_variant_index == 1 |
| 101 | + |
| 102 | + |
| 103 | +def test_no_variants_in_range() -> None: |
| 104 | + # Query [0, 5) is entirely in the flanking context before any variant. |
| 105 | + haplotype = create_haplotype(sequence_id="hap4", variants="1:500:A:T,1:510:C:G") |
| 106 | + rm = haplotype.reference_mapping(0, 5, 10) |
| 107 | + |
| 108 | + assert rm.first_variant_index is None |
| 109 | + assert rm.last_variant_index is None |
| 110 | + |
| 111 | + |
| 112 | +def test_complex_multi_indel_mapping() -> None: |
| 113 | + # Three variants: deletion, insertion, deletion. Query [9, 22) spans all three. |
| 114 | + haplotype = create_haplotype( |
| 115 | + sequence_id="hap5", |
| 116 | + variants="1:500:AT:A,1:505:C:CTT,1:510:GGG:T", |
| 117 | + gnomAD_AF_afr="0.05,0.15,0.25", |
| 118 | + gnomAD_AF_amr="0.06,0.16,0.26", |
| 119 | + gnomAD_AF_eas="0.07,0.17,0.27", |
| 120 | + gnomAD_AF_nfe="0.04,0.14,0.24", |
| 121 | + gnomAD_AF_sas="0.03,0.13,0.23", |
| 122 | + ) |
| 123 | + rm = haplotype.reference_mapping(9, 22, 10) |
| 124 | + |
| 125 | + assert rm.first_variant_index == 0 |
| 126 | + assert rm.last_variant_index == 2 |
| 127 | + |
| 128 | + |
| 129 | +def test_large_insertion_with_null_frequencies() -> None: |
| 130 | + # Single large insertion. null gnomAD frequencies should be parsed as 0.0. |
| 131 | + # context_size=25, variant at ref pos 90349349. |
| 132 | + # Query [22, 42) starts before the insertion (ref start = 90349349 - 3 = 90349346). |
| 133 | + alt = "TATGCAAGTGTCATCAGATGAATTGATGACATTTTTGTCAAGTTTAAGCACTGAAAGAACAAACCTCTAAATC" |
| 134 | + haplotype = create_haplotype( |
| 135 | + sequence_id="hap6", |
| 136 | + sequence="ACTACTATCTATATCATCTACTACTACTATCATCATCATCAT", |
| 137 | + sequence_length=42, |
| 138 | + n_variants=1, |
| 139 | + variants=f"chr12:90349349:T:{alt}", |
| 140 | + gnomAD_AF_afr="null", |
| 141 | + gnomAD_AF_amr="null", |
| 142 | + gnomAD_AF_eas="null", |
| 143 | + gnomAD_AF_nfe="null", |
| 144 | + gnomAD_AF_sas="null", |
| 145 | + ) |
| 146 | + rm = haplotype.reference_mapping(22, 42, 25) |
| 147 | + |
| 148 | + assert rm.first_variant_index == 0 |
| 149 | + assert rm.last_variant_index == 0 |
| 150 | + assert rm.start == 90349346 |
| 151 | + assert rm.end == 90349350 |
| 152 | + assert rm.population_frequencies == { |
| 153 | + "afr": [0.0], |
| 154 | + "amr": [0.0], |
| 155 | + "eas": [0.0], |
| 156 | + "nfe": [0.0], |
| 157 | + "sas": [0.0], |
| 158 | + } |
0 commit comments