From 21da7376a1bc501d530b2a5c44c5edba267bec6b Mon Sep 17 00:00:00 2001 From: Kristian Ullrich Date: Wed, 26 Mar 2025 10:40:29 +0100 Subject: [PATCH] Create vcfAllSiteParser2.py --- vcfAllSiteParser2.py | 81 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 81 insertions(+) create mode 100644 vcfAllSiteParser2.py diff --git a/vcfAllSiteParser2.py b/vcfAllSiteParser2.py new file mode 100644 index 0000000..d999879 --- /dev/null +++ b/vcfAllSiteParser2.py @@ -0,0 +1,81 @@ +#!/usr/bin/env python + +import sys +import gzip +import re +from collections import defaultdict + +class MaskGenerator: + def __init__(self, filename, chrom): + self.chrom = chrom + self.lastCalledPos = -1 + self.lastStartPos = -1 + self.file = gzip.open(filename, "wt") + def addCalledPosition(self, pos): + if self.lastCalledPos == -1: + self.lastCalledPos = pos + self.lastStartPos = pos + elif pos == self.lastCalledPos + 1: + self.lastCalledPos = pos + else: + self.file.write(f"{self.chrom}\t{self.lastStartPos - 1}\t{self.lastCalledPos}\n") + self.lastStartPos = pos + self.lastCalledPos = pos + def close(self): + if self.lastCalledPos != -1: + self.file.write(f"{self.chrom}\t{self.lastStartPos - 1}\t{self.lastCalledPos}\n") + self.file.close() + +if len(sys.argv) < 3: + print("Usage: ./vcfCaller.py ") + print("Reads a VCF file, processes all chromosomes, and outputs separate mask BED files and filtered VCFs for each chromosome.") + sys.exit(1) + +input_vcf = sys.argv[1] +output_prefix = sys.argv[2] + +prev_chrom = None + +# Dictionaries to store VCF file handles and MaskGenerators for each chromosome +vcf_files = {} +mask_generators = {} +header_lines = [] # Store header lines + +with gzip.open(input_vcf, "rt") if input_vcf.endswith(".gz") else open(input_vcf, "r") as vcf_file: + line_cnt = 0 + for line in vcf_file: + line = line.rstrip() + if line.startswith('#'): + # Write header to all chromosome VCF files when they are created + header_lines.append(line) + continue + fields = line.split('\t') + chrom, pos, refAllele, altAllele, info, genotypes = ( + fields[0], int(fields[1]), fields[3], fields[4], fields[7], fields[9] + ) + if chrom != prev_chrom: + if prev_chrom is not None: + # Close previous chromosome files + vcf_files[prev_chrom].close() + mask_generators[prev_chrom].close() + # Open new files for the new chromosome + vcf_files[chrom] = gzip.open(f"{output_prefix}_{chrom}.vcf.gz", "wt") + mask_generators[chrom] = MaskGenerator(f"{output_prefix}_{chrom}_mask.bed.gz", chrom) + # Write headers for the new chromosome + for header in header_lines: + vcf_files[chrom].write(header) + prev_chrom = chrom # Update previous chromosome tracker + if line_cnt % 10000 == 0: + sys.stderr.write(f"Parsing position {pos} on {chrom}\n") + line_cnt += 1 + if re.match("^[ACTGactg]$", refAllele) and re.match("^[ACTGactg\.]$", altAllele): + if genotypes[0] != '.' and genotypes[2] != '.': + mask_generators[chrom].addCalledPosition(pos) + if genotypes[0] == '1' or genotypes[2] == '1': + vcf_files[chrom].write(line + "\n") + +if prev_chrom: + vcf_files[prev_chrom].close() + mask_generators[prev_chrom].close() + +print("Processing complete. Separate output files per chromosome are compressed with bgzip.")