Skip to content

Commit 2e3ca4f

Browse files
ameynertclaude
andcommitted
feat: add compute_variation_ratios tool
Port compute_variation_ratios.py from human-diversity-reference/scripts as a defopt-compatible toolkit tool. Counts per-sample non-reference variant sites above each of seven predefined gnomAD population frequency thresholds and writes a sample-level Hail table with population labels and counts. Removes the debug pprint call present in the original. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 7012193 commit 2e3ca4f

File tree

1 file changed

+51
-0
lines changed

1 file changed

+51
-0
lines changed
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
"""Tool to compute per-sample variant site counts across gnomAD frequency thresholds."""
2+
3+
import hail as hl
4+
5+
from divref.haplotype import HailPath
6+
7+
_FREQ_THRESHOLDS = [0, 0.0001, 0.001, 0.005, 0.01, 0.05, 0.1]
8+
9+
10+
def compute_variation_ratios(
11+
*,
12+
vcfs_path: HailPath,
13+
gnomad_va_file: HailPath,
14+
gnomad_sa_file: HailPath,
15+
output_ht: HailPath,
16+
) -> None:
17+
"""
18+
Compute per-sample counts of non-reference variant sites across frequency thresholds.
19+
20+
For each sample and each predefined gnomAD population frequency threshold, counts
21+
the number of sites where the sample carries a non-reference genotype. Writes a
22+
sample-level Hail table with population labels and per-threshold site counts.
23+
24+
Args:
25+
vcfs_path: Path or glob pattern to input VCF files (GRCh38).
26+
gnomad_va_file: Path to the gnomAD variant annotations Hail table
27+
(from extract_gnomad_afs).
28+
gnomad_sa_file: Path to the gnomAD sample metadata Hail table
29+
(from extract_gnomad_afs).
30+
output_ht: Output path for the sample-level Hail table.
31+
"""
32+
hl.init()
33+
34+
gnomad_sa = hl.read_table(gnomad_sa_file)
35+
gnomad_va = hl.read_table(gnomad_va_file)
36+
37+
mt = hl.import_vcf(vcfs_path, reference_genome="GRCh38", min_partitions=64)
38+
mt = mt.select_rows().select_cols()
39+
mt = mt.annotate_rows(freq=gnomad_va[mt.row_key].pop_freqs)
40+
mt = mt.filter_rows(hl.is_defined(mt.freq))
41+
mt = mt.annotate_rows(max_pop_freq=hl.max(mt.freq.map(lambda x: hl.max(x.AF))))
42+
43+
mt = mt.annotate_cols(pop=gnomad_sa[mt.col_key].pop)
44+
mt = mt.annotate_cols(
45+
counts=hl.struct(**{
46+
f"n_sites_above_{x}": hl.agg.count_where(mt.GT.is_non_ref() & (mt.max_pop_freq > x))
47+
for x in _FREQ_THRESHOLDS
48+
})
49+
)
50+
51+
mt.cols().write(output_ht, overwrite=True)

0 commit comments

Comments
 (0)