|
| 1 | +"""Tool to compute per-sample variant site counts across gnomAD frequency thresholds.""" |
| 2 | + |
| 3 | +import hail as hl |
| 4 | + |
| 5 | +from divref.haplotype import HailPath |
| 6 | + |
| 7 | +_FREQ_THRESHOLDS = [0, 0.0001, 0.001, 0.005, 0.01, 0.05, 0.1] |
| 8 | + |
| 9 | + |
| 10 | +def compute_variation_ratios( |
| 11 | + *, |
| 12 | + vcfs_path: HailPath, |
| 13 | + gnomad_va_file: HailPath, |
| 14 | + gnomad_sa_file: HailPath, |
| 15 | + output_ht: HailPath, |
| 16 | +) -> None: |
| 17 | + """ |
| 18 | + Compute per-sample counts of non-reference variant sites across frequency thresholds. |
| 19 | +
|
| 20 | + For each sample and each predefined gnomAD population frequency threshold, counts |
| 21 | + the number of sites where the sample carries a non-reference genotype. Writes a |
| 22 | + sample-level Hail table with population labels and per-threshold site counts. |
| 23 | +
|
| 24 | + Args: |
| 25 | + vcfs_path: Path or glob pattern to input VCF files (GRCh38). |
| 26 | + gnomad_va_file: Path to the gnomAD variant annotations Hail table |
| 27 | + (from extract_gnomad_afs). |
| 28 | + gnomad_sa_file: Path to the gnomAD sample metadata Hail table |
| 29 | + (from extract_gnomad_afs). |
| 30 | + output_ht: Output path for the sample-level Hail table. |
| 31 | + """ |
| 32 | + hl.init() |
| 33 | + |
| 34 | + gnomad_sa = hl.read_table(gnomad_sa_file) |
| 35 | + gnomad_va = hl.read_table(gnomad_va_file) |
| 36 | + |
| 37 | + mt = hl.import_vcf(vcfs_path, reference_genome="GRCh38", min_partitions=64) |
| 38 | + mt = mt.select_rows().select_cols() |
| 39 | + mt = mt.annotate_rows(freq=gnomad_va[mt.row_key].pop_freqs) |
| 40 | + mt = mt.filter_rows(hl.is_defined(mt.freq)) |
| 41 | + mt = mt.annotate_rows(max_pop_freq=hl.max(mt.freq.map(lambda x: hl.max(x.AF)))) |
| 42 | + |
| 43 | + mt = mt.annotate_cols(pop=gnomad_sa[mt.col_key].pop) |
| 44 | + mt = mt.annotate_cols( |
| 45 | + counts=hl.struct(**{ |
| 46 | + f"n_sites_above_{x}": hl.agg.count_where(mt.GT.is_non_ref() & (mt.max_pop_freq > x)) |
| 47 | + for x in _FREQ_THRESHOLDS |
| 48 | + }) |
| 49 | + ) |
| 50 | + |
| 51 | + mt.cols().write(output_ht, overwrite=True) |
0 commit comments