-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathvcf2.py
More file actions
executable file
·87 lines (73 loc) · 3.26 KB
/
vcf2.py
File metadata and controls
executable file
·87 lines (73 loc) · 3.26 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import os.path
import subprocess
import logging
logger = logging.getLogger()
import collections
import vcf as pyvcf
class Vcf(object):
def __init__(self, filename, build=True):
# Determine if the input file was compressed by bgzip and has a tabix
# index
ftest = subprocess.check_output(["file", filename])
if "gzip compressed data, extra field" not in ftest:
if os.path.isfile(filename + '.gz'):
filename = filename + '.gz'
else:
logger.warn('Input file needs to be compressed by bgzip')
logger.info('I will try to run bgzip now')
bgz = filename+'.gz'
with open(bgz, 'w') as FH:
p = subprocess.call(['bgzip', '-c', filename], stdout=FH)
if p == 0:
logger.info('Sucessfully made bzipped file')
filename = bgz
else:
logger.error('I could not compress the file using bgzip, please run "bgzip {0}"'.format(filename))
raise IOError
if not os.path.isfile(filename + '.tbi'):
logger.warn('A Tabix index file was not found.')
logger.info('Trying to create tabix index file')
try:
subprocess.check_call(['tabix', filename, '-p', 'vcf'])
logger.info('Sucessfully built tabix index')
except:
logger.error('I could not create the index file, please run "tabix {0} -p vcf"'.format(filename))
# Create pyvcf object
self.vcf_reader = pyvcf.Reader(filename=filename, compressed=True)
def pull_vcf_region(self, chrom, start, end):
""" PUll all records that belong to a spcific region
Arguments:
----------
chrom (str) = chromosome
start (int) = start location
end (int) = end location
Returns:
--------
A pyVCF object that contains all of the records for the given region.
"""
return self.vcf_reader.fetch(chrom, start, end)
def pull_homz(self, region=False, snp=True, indel=False, sv=False):
""" Pull homozygous individuals that have a differ from ref base. This
difference can be due to a SNP, InDel or structural variant.
Arguments:
----------
region (pyVCF) = pyVCF object that contains a list of records. If not
object is passed will use the full pyVCF file stored by the Vcf class
snp (bool) = True if you want snps to be included
indel (bool) = True if you want indels to be included
sv (bool) = True if you want structural variants to be included
"""
if not region:
region = self.vcf_reader
variants = collections.defaultdict(list)
for record in region:
if snp and record.is_snp:
for homz in record.get_hom_alts():
variants[record.POS].append(homz.sample)
if indel and record.is_indel:
for homz in record.get_hom_alts():
variants[record.POS].append(homz.sample)
if sv and record.is_sv:
for homz in record.get_hom_alts():
variants[record.POS].append(homz.sample)
return variants