Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
255 changes: 178 additions & 77 deletions snvs/filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,114 +175,215 @@ def goldset_pre_filters(sample_name, bamname_n, bamname_t, chrom, start, end, ba

def filter_snvs(candidates_folder, bamname_n, bamname_t, regions, batch_num, output_filename=None):

output_file = os.path.join(candidates_folder, 'batch_%s.csv' % str(batch_num) )
# constants
MIN_BQ = c.MIN_BASE_QUALITY
MIN_COV = c.MIN_COVERAGE
MIN_AF_T = c.MIN_MUTANT_ALLELE_FREQUENCY_IN_TUMOR
MIN_AR_T = c.MIN_MUTANT_ALLELE_READS_IN_TUMOR
MAX_AF_N = c.MAX_ALTERNATIVE_ALLELE_FREQUENCY_IN_NORMAL

output_file = os.path.join(candidates_folder, f"batch_{batch_num}.csv")

if os.path.exists(output_file):
print(("SNV BATCH COMPELTE:", output_file))
print("SNV BATCH COMPLETE:", output_file)
return

regiontime = time()
bamfile_n = pysam.AlignmentFile(bamname_n, 'rb')
bamfile_t = pysam.AlignmentFile(bamname_t, 'rb')

candidates = []

for region in regions:
chrom, start, end = region[0], region[1], region[2]
# t0 = now()
bamfile_n = pysam.AlignmentFile(bamname_n, "rb")
bamfile_t = pysam.AlignmentFile(bamname_t, "rb")

try:
coverage_n = bamfile_n.count_coverage(chrom, start, end+1, quality_threshold=c.MIN_BASE_QUALITY, read_callback = check_read)
except ValueError:
if chrom == 'MT':
# MT is chrM in hg19
chrom = 'chrM'
else:
chrom = 'chr%s' % chrom
total_positions = 0
total_alt_candidates = 0
total_skipped_cov = 0
total_ties = 0

with open(output_file, "a") as f:
for chrom, start, end in regions:
try:
coverage_n = bamfile_n.count_coverage(chrom, start, end+1, quality_threshold=c.MIN_BASE_QUALITY, read_callback = check_read)
coverage_n = bamfile_n.count_coverage(
chrom, start, end + 1, quality_threshold=MIN_BQ, read_callback=check_read
)
except ValueError:
print("Region does not exist in normal BAM")
return

try:
coverage_t = bamfile_t.count_coverage(chrom, start, end+1, quality_threshold=c.MIN_BASE_QUALITY, read_callback = check_read)
except ValueError:
if chrom == 'MT':
# MT is chrM in hg19
chrom = 'chrM'
else:
chrom = 'chr%s' % chrom
alt_chrom = "chrM" if chrom == "MT" else (f"chr{chrom}" if not str(chrom).startswith("chr") else chrom)
try:
coverage_n = bamfile_n.count_coverage(
alt_chrom, start, end + 1, quality_threshold=MIN_BQ, read_callback=check_read
)
chrom = alt_chrom
except ValueError:
print(f"[WARN] Region {chrom}:{start}-{end} missing in normal BAM")
continue

try:
coverage_t = bamfile_t.count_coverage(chrom, start, end+1, quality_threshold=c.MIN_BASE_QUALITY, read_callback = check_read)
try:
coverage_t = bamfile_t.count_coverage(
chrom, start, end + 1, quality_threshold=MIN_BQ, read_callback=check_read
)
except ValueError:
print("Region does not exist in tumor BAM")
return

# [ (#A, #C, #G, #T), (#A, #C, #G, #T), (#A, #C, #G, #T), ] at each position in normal
coverage_list_n = [(coverage_n[0][i], coverage_n[1][i], coverage_n[2][i], coverage_n[3][i])
for i in range(len(coverage_n[0]))]
del coverage_n
alt_chrom = "chrM" if chrom == "MT" else (f"chr{chrom}" if not str(chrom).startswith("chr") else chrom)
try:
coverage_t = bamfile_t.count_coverage(
alt_chrom, start, end + 1, quality_threshold=MIN_BQ, read_callback=check_read
)
chrom = alt_chrom
except ValueError:
print(f"[WARN] Region {chrom}:{start}-{end} missing in tumor BAM")
continue

coverage_list_n = np.vstack(coverage_n).astype(np.int32, copy=False)
coverage_list_t = np.vstack(coverage_t).astype(np.int32, copy=False)
L = coverage_list_n.shape[1]
if L == 0:
continue

# [ (#A, #C, #G, #T), (#A, #C, #G, #T), (#A, #C, #G, #T), ] at each position in tumor
coverage_list_t = [(coverage_t[0][i], coverage_t[1][i], coverage_t[2][i], coverage_t[3][i])
for i in range(len(coverage_t[0]))]
total_positions += L

coverage_per_pos_n = coverage_list_n.sum(axis=0)
coverage_per_pos_t = coverage_list_t.sum(axis=0)
cov_mask = (coverage_per_pos_n >= MIN_COV) & (coverage_per_pos_t >= MIN_COV)
total_skipped_cov += np.sum(~cov_mask)

# --- tie-aware alt_mask patch ---
max_counts = coverage_list_n.max(axis=0)
ref_mask = coverage_list_n == max_counts # handle multiallelic ties
alt_mask = ~ref_mask

# count tie positions for diagnostics
ties = (coverage_list_n == max_counts).sum(axis=0) > 1
total_ties += np.sum(ties)

# guard against zero coverage
cov_n_safe = np.where(coverage_per_pos_n == 0, 1, coverage_per_pos_n)
cov_t_safe = np.where(coverage_per_pos_t == 0, 1, coverage_per_pos_t)

# use float64 to avoid rounding-out tiny AFs
alt_freq_t = coverage_list_t.astype(np.float64) / cov_t_safe
alt_reads_t = coverage_list_t
alt_freq_n = coverage_list_n.astype(np.float64) / cov_n_safe

# per-base filters
tumor_alt_AF_high = alt_freq_t >= MIN_AF_T
tumor_alt_AR_high = alt_reads_t >= MIN_AR_T
normal_alt_AF_low = alt_freq_n <= MAX_AF_N

alt_pass = (
alt_mask
& tumor_alt_AF_high
& tumor_alt_AR_high
& normal_alt_AF_low
)
any_alt_passes = alt_pass.any(axis=0) & cov_mask
idx = np.nonzero(any_alt_passes)[0]
total_alt_candidates += idx.size

if idx.size:
out = "\n".join(f"{chrom}\t{start + int(i)}" for i in idx) + "\n"
f.write(out)

print(
f"[DEBUG] Processed {total_positions} positions; "
f"skipped(low cov): {total_skipped_cov}; "
f"ties in normal: {total_ties}; "
f"alt candidates written: {total_alt_candidates}"
)
print(f"COMPLETED SNV BATCH: {output_file}")


# for region in regions:
# chrom, start, end = region[0], region[1], region[2]

# try:
# coverage_n = bam_n.count_coverage(chrom, start, end+1, quality_threshold=c.MIN_BASE_QUALITY, read_callback = check_read)
# except ValueError:
# if chrom == 'MT':
# # MT is chrM in hg19
# chrom = 'chrM'
# else:
# chrom = 'chr%s' % chrom

# try:
# coverage_n = bam_n.count_coverage(chrom, start, end+1, quality_threshold=c.MIN_BASE_QUALITY, read_callback = check_read)
# except ValueError:
# print("Region does not exist in normal BAM")
# return

# try:
# coverage_t = bam_t.count_coverage(chrom, start, end+1, quality_threshold=c.MIN_BASE_QUALITY, read_callback = check_read)
# except ValueError:
# if chrom == 'MT':
# # MT is chrM in hg19
# chrom = 'chrM'
# else:
# chrom = 'chr%s' % chrom

# try:
# coverage_t = bam_t.count_coverage(chrom, start, end+1, quality_threshold=c.MIN_BASE_QUALITY, read_callback = check_read)
# except ValueError:
# print("Region does not exist in tumor BAM")
# return

# # [ (#A, #C, #G, #T), (#A, #C, #G, #T), (#A, #C, #G, #T), ] at each position in normal
# coverage_list_n = [(coverage_n[0][i], coverage_n[1][i], coverage_n[2][i], coverage_n[3][i])
# for i in range(len(coverage_n[0]))]
# del coverage_n

# # [ (#A, #C, #G, #T), (#A, #C, #G, #T), (#A, #C, #G, #T), ] at each position in tumor
# coverage_list_t = [(coverage_t[0][i], coverage_t[1][i], coverage_t[2][i], coverage_t[3][i])
# for i in range(len(coverage_t[0]))]

del coverage_t
# del coverage_t

gc.collect()
# gc.collect()

map_dict = {0: 'A', 1: 'C', 2: 'G', 3: 'T'}
# map_dict = {0: 'A', 1: 'C', 2: 'G', 3: 'T'}

for i in range(len(coverage_list_n)):
# for i in range(len(coverage_list_n)):

pos_key = 'chrom%spos%d' % (chrom, start+i)
# pos_key = 'chrom%spos%d' % (chrom, start+i)

coverage_of_normal = sum(coverage_list_n[i])
coverage_of_tumor = sum(coverage_list_t[i])
# coverage_of_normal = sum(coverage_list_n[i])
# coverage_of_tumor = sum(coverage_list_t[i])

# Skip if either normal or tumor coverage is low
if coverage_of_normal < c.MIN_COVERAGE or coverage_of_tumor < c.MIN_COVERAGE:
continue
# # Skip if either normal or tumor coverage is low
# if coverage_of_normal < c.MIN_COVERAGE or coverage_of_tumor < c.MIN_COVERAGE:
# continue

# see if a variant allele is found with more than threshold frequency in tumor
# assuming homozygosity
max_frequency_base_in_normal = max(enumerate( coverage_list_n[i] ), key=operator.itemgetter(1))
# # see if a variant allele is found with more than threshold frequency in tumor
# # assuming homozygosity
# max_frequency_base_in_normal = max(enumerate( coverage_list_n[i] ), key=operator.itemgetter(1))

is_candidate_position = False
# is_candidate_position = False

"""
Filter positions where no alternate allele has freq > c.MIN_MUTANT_ALLELE_FREQUENCY_IN_TUMOR
Filter positions where no alterate allele has more than 2 supporting reads
Filter positions where no alternate allele is found in the normal sample with frequency less than c.MAX_ALTERNATIVE_ALLELE_FREQUENCY_IN_NORMAL
"""
# """
# Filter positions where no alternate allele has freq > c.MIN_MUTANT_ALLELE_FREQUENCY_IN_TUMOR
# Filter positions where no alterate allele has more than 2 supporting reads
# Filter positions where no alternate allele is found in the normal sample with frequency less than c.MAX_ALTERNATIVE_ALLELE_FREQUENCY_IN_NORMAL
# """

for j in range(len(coverage_list_t[i])):
# for j in range(len(coverage_list_t[i])):

if j != max_frequency_base_in_normal[0]:
# checking alternate alleles in tumor
# if j != max_frequency_base_in_normal[0]:
# # checking alternate alleles in tumor

allele_frequency_high = (coverage_list_t[i][j] / coverage_of_tumor) >= c.MIN_MUTANT_ALLELE_FREQUENCY_IN_TUMOR
allele_read_count_high = coverage_list_t[i][j] >= c.MIN_MUTANT_ALLELE_READS_IN_TUMOR
allele_frequency_low_in_normal = (coverage_list_n[i][j] / coverage_of_normal) <= c.MAX_ALTERNATIVE_ALLELE_FREQUENCY_IN_NORMAL
# allele_frequency_high = (coverage_list_t[i][j] / coverage_of_tumor) >= c.MIN_MUTANT_ALLELE_FREQUENCY_IN_TUMOR
# allele_read_count_high = coverage_list_t[i][j] >= c.MIN_MUTANT_ALLELE_READS_IN_TUMOR
# allele_frequency_low_in_normal = (coverage_list_n[i][j] / coverage_of_normal) <= c.MAX_ALTERNATIVE_ALLELE_FREQUENCY_IN_NORMAL

is_potential_mutation = allele_frequency_high and allele_read_count_high and allele_frequency_low_in_normal
# is_potential_mutation = allele_frequency_high and allele_read_count_high and allele_frequency_low_in_normal

if is_potential_mutation:
is_candidate_position = True
break
# if is_potential_mutation:
# is_candidate_position = True
# break

if is_candidate_position:
candidates.append((chrom, start + i))
# if is_candidate_position:
# candidates.append((chrom, start + i))

# create folder for batches for this sample
#if not os.path.exists(os.path.join(c.filtering_folder, c.filtering_batches_folder, sample_name)):
# os.makedirs(os.path.join(c.filtering_folder, c.filtering_batches_folder, sample_name))

# save batch.csv
with open(output_file, 'a') as f:
for pos in candidates:
f.write('%s\t%s\n' % (pos[0], pos[1]))
# with open(output_file, 'a') as f:
# for pos in candidates:
# f.write('%s\t%s\n' % (pos[0], pos[1]))

print(('COMPLETED SNV BATCH: ', output_file))
# print(('COMPLETED SNV BATCH: ', output_file))