diff --git a/micall/core/aln2counts.py b/micall/core/aln2counts.py index 44a467123..f2b2e530f 100755 --- a/micall/core/aln2counts.py +++ b/micall/core/aln2counts.py @@ -34,6 +34,8 @@ SeedNucleotide from micall.utils.spring_beads import Wire, Bead from micall.utils.translation import translate +import micall.utils.exact_coverage as exact_coverage +import numpy as np logger = logging.getLogger(__name__) @@ -407,6 +409,10 @@ def __init__(self, # {seed_name: {pos: count} self.conseq_insertion_counts = (conseq_insertion_counts or defaultdict(Counter)) + # {contig_name: {position: exact_coverage}} + self.exact_coverage_data = defaultdict(dict) + self._exact_coverage_calculated = set() # Track which seeds have been calculated + self._current_seed_info = {} # {seed_name: {seed_ref, overlap_size, contigs, coverage, kmer_index, has_data}} self.nuc_writer = self.nuc_detail_writer = self.conseq_writer = None self.amino_writer = self.amino_detail_writer = None self.genome_coverage_writer = self.minimap_hits_writer = None @@ -604,6 +610,74 @@ def process_reads(self, self.detailed_concordance_writer, use_combined_reports=True) + def _initialize_exact_coverage_for_seed(self, seed_name, overlap_size): + """Initialize exact coverage structures for a seed. + + @param seed_name: Name of the seed reference + @param overlap_size: Overlap size for exact coverage calculation + """ + # Use remap_conseq if available, otherwise use original seed reference + if self.remap_conseqs and seed_name in self.remap_conseqs: + seed_ref = self.remap_conseqs[seed_name] + else: + try: + seed_ref = self.projects.getReference(seed_name) + except KeyError: + # Reference not found (e.g., partial contigs), skip exact coverage + return + + # Store seed info for incremental updates + self._current_seed_info[seed_name] = { + 'seed_ref': seed_ref, + 'overlap_size': overlap_size, + 'contigs': {seed_name: seed_ref}, + 'coverage': {seed_name: np.zeros(len(seed_ref), dtype=np.int32)}, + 'kmer_index': {}, # Shared k-mer index for all reads + 'has_data': False + } + + # Load existing data if present + if seed_name in self.exact_coverage_data: + for pos, count in self.exact_coverage_data[seed_name].items(): + if 1 <= pos <= len(seed_ref): + self._current_seed_info[seed_name]['coverage'][seed_name][pos - 1] = count + + def _add_to_exact_coverage(self, seed_name, kmer_index, contigs, coverage, overlap_size, seq, count): + """Add a single read to exact coverage calculation. + + @param seed_name: Name of the seed reference + @param seq: Read sequence + @param count: Read count + """ + if seed_name not in self._current_seed_info: + return False + + # Process this single read directly without iterator overhead + exact_coverage.process_single_read(seq, count, kmer_index, contigs, coverage, overlap_size) + return True + + def _finalize_exact_coverage_for_seed(self, seed_name): + """Finalize exact coverage calculation for a seed. + + @param seed_name: Name of the seed reference + """ + if seed_name not in self._current_seed_info: + return + + info = self._current_seed_info[seed_name] + if not info['has_data']: + return + + # Store/update the coverage data + for pos_0based, count in enumerate(info['coverage'][seed_name]): + if count > 0: + self.exact_coverage_data[seed_name][pos_0based + 1] = int(count) + elif (pos_0based + 1) in self.exact_coverage_data[seed_name]: + del self.exact_coverage_data[seed_name][pos_0based + 1] + + # Clean up + del self._current_seed_info[seed_name] + def read(self, aligned_reads, included_regions: typing.Optional[typing.Set] = None, @@ -619,6 +693,57 @@ def read(self, all other regions should be excluded, or None to ignore @param excluded_regions: coordinate regions that should not be reported. """ + + # Generator that calculates exact coverage as it yields rows + def process_with_exact_coverage(aligned_reads): + refname = None + seed_name = None + + for row in aligned_reads: + # Extract metadata from first row + if refname is None: + refname = row.get('refname') + if refname: + seed_name = trim_contig_name(refname) + # Determine overlap size from the first read + first_read_seq = row.get('seq', '') + first_read_len = len(first_read_seq) + # Use 1/4 of read length, minimum 0, maximum 70 + overlap_size = max(0, min(70, first_read_len // 4)) + # Initialize exact coverage for this seed + self._initialize_exact_coverage_for_seed(seed_name, overlap_size) + # Get references to structures after initialization (if successful) + if seed_name in self._current_seed_info: + info = self._current_seed_info[seed_name] + contigs = info['contigs'] + coverage = info['coverage'] + kmer_index = info['kmer_index'] + overlap_size = info['overlap_size'] + + if seed_name not in self._current_seed_info: + # Skip exact coverage processing if initialization failed + yield row + continue + + # Add to exact coverage + seq = row['seq'] + count = int(row.get('count', 1)) + if self._add_to_exact_coverage(seed_name=seed_name, + contigs=contigs, + coverage=coverage, + overlap_size=overlap_size, + kmer_index=kmer_index, + seq=seq, + count=count): + info['has_data'] = True + + yield row + + # Finalize exact coverage after all rows processed + if seed_name: + self._finalize_exact_coverage_for_seed(seed_name) + + aligned_reads = process_with_exact_coverage(aligned_reads) aligned_reads = self.align_deletions(aligned_reads) self.seed_aminos = {} # {reading_frame: [SeedAmino(consensus_nuc_index)]} @@ -1056,7 +1181,8 @@ def _create_nuc_writer(nuc_file): 'ins', 'clip', 'v3_overlap', - 'coverage'], + 'coverage', + 'exact_coverage'], lineterminator=os.linesep) def write_nuc_header(self, nuc_file): @@ -1093,6 +1219,13 @@ def write_counts(self, genome_pos = (str(report_nuc.position+genome_start_pos - 1) if report_nuc.position is not None else '') + + # Get exact coverage if available + coverage_score_val = '' + if seed_nuc.consensus_index is not None: + query_pos = seed_nuc.consensus_index + 1 # Convert 0-based to 1-based + coverage_score_val = self.exact_coverage_data.get(seed, {}).get(query_pos, '') + row = {'seed': seed, 'region': region, 'q-cutoff': self.qcut, @@ -1103,7 +1236,8 @@ def write_counts(self, 'ins': seed_nuc.insertion_count, 'clip': seed_nuc.clip_count, 'v3_overlap': seed_nuc.v3_overlap, - 'coverage': seed_nuc.get_coverage()} + 'coverage': seed_nuc.get_coverage(), + 'exact_coverage': coverage_score_val} for base in 'ACTGN': nuc_count = seed_nuc.counts[base] row[base] = nuc_count @@ -1682,7 +1816,7 @@ def load_reading_frames(self, seed_name): if coord_amino == '-': continue coord_codon_index += 1 - + nuc_pos = conseq_codon_index * 3 - frame_index for i in range(3): result[nuc_pos+i] = frame_index diff --git a/micall/tests/test_aln2counts.py b/micall/tests/test_aln2counts.py index a0fead2c8..861a50764 100644 --- a/micall/tests/test_aln2counts.py +++ b/micall/tests/test_aln2counts.py @@ -513,48 +513,45 @@ def testMultiplePrefixNucleotideReport(self): aligned_reads3 = prepare_reads("3-R1-seed,15,0,2,0,TTTAGG") expected_text = """\ -seed,region,q-cutoff,query.nuc.pos,refseq.nuc.pos,genome.pos,\ -A,C,G,T,N,del,ins,clip,v3_overlap,coverage -R1-seed,R1,15,1,1,1,5,0,0,0,0,0,0,0,0,5 -R1-seed,R1,15,2,2,2,5,0,0,0,0,0,0,0,0,5 -R1-seed,R1,15,3,3,3,5,0,0,0,0,0,0,0,0,5 -R1-seed,R1,15,,4,4,0,0,0,7,0,0,0,0,0,7 -R1-seed,R1,15,,5,5,0,0,0,7,0,0,0,0,0,7 -R1-seed,R1,15,,6,6,0,0,0,7,0,0,0,0,0,7 -R1-seed,R1,15,4,7,7,2,0,0,0,0,0,0,0,0,2 -R1-seed,R1,15,5,8,8,0,0,2,0,0,0,0,0,0,2 -R1-seed,R1,15,6,9,9,0,0,2,0,0,0,0,0,0,2 -R2-seed,R2,15,1,7,7,0,0,4,0,0,0,0,0,0,4 -R2-seed,R2,15,2,8,8,0,0,4,0,0,0,0,0,0,4 -R2-seed,R2,15,3,9,9,0,4,0,0,0,0,0,0,0,4 -R2-seed,R2,15,4,10,10,0,4,0,0,0,0,0,0,0,4 -R2-seed,R2,15,5,11,11,0,4,0,0,0,0,0,0,0,4 -R2-seed,R2,15,6,12,12,0,0,4,0,0,0,0,0,0,4 +seed,region,q-cutoff,query.nuc.pos,refseq.nuc.pos,genome.pos,A,C,G,T,N,del,ins,clip,v3_overlap,coverage,exact_coverage +R1-seed,R1,15,1,1,1,5,0,0,0,0,0,0,0,0,5, +R1-seed,R1,15,2,2,2,5,0,0,0,0,0,0,0,0,5,10 +R1-seed,R1,15,3,3,3,5,0,0,0,0,0,0,0,0,5,10 +R1-seed,R1,15,,4,4,0,0,0,7,0,0,0,0,0,7, +R1-seed,R1,15,,5,5,0,0,0,7,0,0,0,0,0,7, +R1-seed,R1,15,,6,6,0,0,0,7,0,0,0,0,0,7, +R1-seed,R1,15,4,7,7,2,0,0,0,0,0,0,0,0,2,10 +R1-seed,R1,15,5,8,8,0,0,2,0,0,0,0,0,0,2,12 +R1-seed,R1,15,6,9,9,0,0,2,0,0,0,0,0,0,2,2 +R2-seed,R2,15,1,7,7,0,0,4,0,0,0,0,0,0,4, +R2-seed,R2,15,2,8,8,0,0,4,0,0,0,0,0,0,4, +R2-seed,R2,15,3,9,9,0,4,0,0,0,0,0,0,0,4, +R2-seed,R2,15,4,10,10,0,4,0,0,0,0,0,0,0,4, +R2-seed,R2,15,5,11,11,0,4,0,0,0,0,0,0,0,4, +R2-seed,R2,15,6,12,12,0,0,4,0,0,0,0,0,0,4, """ expected_detail_text = """\ -seed,region,q-cutoff,query.nuc.pos,refseq.nuc.pos,genome.pos,\ -A,C,G,T,N,del,ins,clip,v3_overlap,coverage -1-R1-seed,R1,15,1,1,1,5,0,0,0,0,0,0,0,0,5 -1-R1-seed,R1,15,2,2,2,5,0,0,0,0,0,0,0,0,5 -1-R1-seed,R1,15,3,3,3,5,0,0,0,0,0,0,0,0,5 -1-R1-seed,R1,15,4,4,4,0,0,0,5,0,0,0,0,0,5 -1-R1-seed,R1,15,5,5,5,0,0,0,5,0,0,0,0,0,5 -1-R1-seed,R1,15,6,6,6,0,0,0,5,0,0,0,0,0,5 -2-R2-seed,R2,15,1,7,7,0,0,4,0,0,0,0,0,0,4 -2-R2-seed,R2,15,2,8,8,0,0,4,0,0,0,0,0,0,4 -2-R2-seed,R2,15,3,9,9,0,4,0,0,0,0,0,0,0,4 -2-R2-seed,R2,15,4,10,10,0,4,0,0,0,0,0,0,0,4 -2-R2-seed,R2,15,5,11,11,0,4,0,0,0,0,0,0,0,4 -2-R2-seed,R2,15,6,12,12,0,0,4,0,0,0,0,0,0,4 -3-R1-seed,R1,15,1,4,4,0,0,0,2,0,0,0,0,0,2 -3-R1-seed,R1,15,2,5,5,0,0,0,2,0,0,0,0,0,2 -3-R1-seed,R1,15,3,6,6,0,0,0,2,0,0,0,0,0,2 -3-R1-seed,R1,15,4,7,7,2,0,0,0,0,0,0,0,0,2 -3-R1-seed,R1,15,5,8,8,0,0,2,0,0,0,0,0,0,2 -3-R1-seed,R1,15,6,9,9,0,0,2,0,0,0,0,0,0,2 +seed,region,q-cutoff,query.nuc.pos,refseq.nuc.pos,genome.pos,A,C,G,T,N,del,ins,clip,v3_overlap,coverage,exact_coverage +1-R1-seed,R1,15,1,1,1,5,0,0,0,0,0,0,0,0,5, +1-R1-seed,R1,15,2,2,2,5,0,0,0,0,0,0,0,0,5, +1-R1-seed,R1,15,3,3,3,5,0,0,0,0,0,0,0,0,5, +1-R1-seed,R1,15,4,4,4,0,0,0,5,0,0,0,0,0,5, +1-R1-seed,R1,15,5,5,5,0,0,0,5,0,0,0,0,0,5, +1-R1-seed,R1,15,6,6,6,0,0,0,5,0,0,0,0,0,5, +2-R2-seed,R2,15,1,7,7,0,0,4,0,0,0,0,0,0,4, +2-R2-seed,R2,15,2,8,8,0,0,4,0,0,0,0,0,0,4, +2-R2-seed,R2,15,3,9,9,0,4,0,0,0,0,0,0,0,4, +2-R2-seed,R2,15,4,10,10,0,4,0,0,0,0,0,0,0,4, +2-R2-seed,R2,15,5,11,11,0,4,0,0,0,0,0,0,0,4, +2-R2-seed,R2,15,6,12,12,0,0,4,0,0,0,0,0,0,4, +3-R1-seed,R1,15,1,4,4,0,0,0,2,0,0,0,0,0,2, +3-R1-seed,R1,15,2,5,5,0,0,0,2,0,0,0,0,0,2, +3-R1-seed,R1,15,3,6,6,0,0,0,2,0,0,0,0,0,2, +3-R1-seed,R1,15,4,7,7,2,0,0,0,0,0,0,0,0,2, +3-R1-seed,R1,15,5,8,8,0,0,2,0,0,0,0,0,0,2, +3-R1-seed,R1,15,6,9,9,0,0,2,0,0,0,0,0,0,2, """ - self.report.write_nuc_header(self.report_file) self.report.write_nuc_detail_header(self.detail_report_file) self.report.read(aligned_reads1) @@ -568,8 +565,9 @@ def testMultiplePrefixNucleotideReport(self): self.report.combine_reports() self.report.write_nuc_counts() - assert self.detail_report_file.getvalue() == expected_detail_text - assert self.report_file.getvalue() == expected_text + self.assertMultiLineEqual(expected_detail_text, self.detail_report_file.getvalue()) + self.assertMultiLineEqual(expected_text, self.report_file.getvalue()) + def testNucleotideDetailReportOnlyPartials(self): """ The only contig is a partial BLAST match, not reported. """ @@ -580,36 +578,36 @@ def testNucleotideDetailReportOnlyPartials(self): expected_text = """\ seed,region,q-cutoff,query.nuc.pos,refseq.nuc.pos,genome.pos,\ -A,C,G,T,N,del,ins,clip,v3_overlap,coverage -R2-seed,R2,15,1,7,7,0,0,4,0,0,0,0,0,0,4 -R2-seed,R2,15,2,8,8,0,0,4,0,0,0,0,0,0,4 -R2-seed,R2,15,3,9,9,0,4,0,0,0,0,0,0,0,4 -R2-seed,R2,15,4,10,10,0,4,0,0,0,0,0,0,0,4 -R2-seed,R2,15,5,11,11,0,4,0,0,0,0,0,0,0,4 -R2-seed,R2,15,6,12,12,0,0,4,0,0,0,0,0,0,4 -R1-seed,R1,15,1,4,4,0,0,0,2,0,0,0,0,0,2 -R1-seed,R1,15,2,5,5,0,0,0,2,0,0,0,0,0,2 -R1-seed,R1,15,3,6,6,0,0,0,2,0,0,0,0,0,2 -R1-seed,R1,15,4,7,7,2,0,0,0,0,0,0,0,0,2 -R1-seed,R1,15,5,8,8,0,0,2,0,0,0,0,0,0,2 -R1-seed,R1,15,6,9,9,0,0,2,0,0,0,0,0,0,2 +A,C,G,T,N,del,ins,clip,v3_overlap,coverage,exact_coverage +R2-seed,R2,15,1,7,7,0,0,4,0,0,0,0,0,0,4, +R2-seed,R2,15,2,8,8,0,0,4,0,0,0,0,0,0,4, +R2-seed,R2,15,3,9,9,0,4,0,0,0,0,0,0,0,4, +R2-seed,R2,15,4,10,10,0,4,0,0,0,0,0,0,0,4, +R2-seed,R2,15,5,11,11,0,4,0,0,0,0,0,0,0,4, +R2-seed,R2,15,6,12,12,0,0,4,0,0,0,0,0,0,4, +R1-seed,R1,15,1,4,4,0,0,0,2,0,0,0,0,0,2, +R1-seed,R1,15,2,5,5,0,0,0,2,0,0,0,0,0,2, +R1-seed,R1,15,3,6,6,0,0,0,2,0,0,0,0,0,2, +R1-seed,R1,15,4,7,7,2,0,0,0,0,0,0,0,0,2, +R1-seed,R1,15,5,8,8,0,0,2,0,0,0,0,0,0,2,2 +R1-seed,R1,15,6,9,9,0,0,2,0,0,0,0,0,0,2,2 """ expected_detail_text = """\ seed,region,q-cutoff,query.nuc.pos,refseq.nuc.pos,genome.pos,\ -A,C,G,T,N,del,ins,clip,v3_overlap,coverage -2-R2-seed,R2,15,1,7,7,0,0,4,0,0,0,0,0,0,4 -2-R2-seed,R2,15,2,8,8,0,0,4,0,0,0,0,0,0,4 -2-R2-seed,R2,15,3,9,9,0,4,0,0,0,0,0,0,0,4 -2-R2-seed,R2,15,4,10,10,0,4,0,0,0,0,0,0,0,4 -2-R2-seed,R2,15,5,11,11,0,4,0,0,0,0,0,0,0,4 -2-R2-seed,R2,15,6,12,12,0,0,4,0,0,0,0,0,0,4 -3-R1-seed,R1,15,1,4,4,0,0,0,2,0,0,0,0,0,2 -3-R1-seed,R1,15,2,5,5,0,0,0,2,0,0,0,0,0,2 -3-R1-seed,R1,15,3,6,6,0,0,0,2,0,0,0,0,0,2 -3-R1-seed,R1,15,4,7,7,2,0,0,0,0,0,0,0,0,2 -3-R1-seed,R1,15,5,8,8,0,0,2,0,0,0,0,0,0,2 -3-R1-seed,R1,15,6,9,9,0,0,2,0,0,0,0,0,0,2 +A,C,G,T,N,del,ins,clip,v3_overlap,coverage,exact_coverage +2-R2-seed,R2,15,1,7,7,0,0,4,0,0,0,0,0,0,4, +2-R2-seed,R2,15,2,8,8,0,0,4,0,0,0,0,0,0,4, +2-R2-seed,R2,15,3,9,9,0,4,0,0,0,0,0,0,0,4, +2-R2-seed,R2,15,4,10,10,0,4,0,0,0,0,0,0,0,4, +2-R2-seed,R2,15,5,11,11,0,4,0,0,0,0,0,0,0,4, +2-R2-seed,R2,15,6,12,12,0,0,4,0,0,0,0,0,0,4, +3-R1-seed,R1,15,1,4,4,0,0,0,2,0,0,0,0,0,2, +3-R1-seed,R1,15,2,5,5,0,0,0,2,0,0,0,0,0,2, +3-R1-seed,R1,15,3,6,6,0,0,0,2,0,0,0,0,0,2, +3-R1-seed,R1,15,4,7,7,2,0,0,0,0,0,0,0,0,2, +3-R1-seed,R1,15,5,8,8,0,0,2,0,0,0,0,0,0,2, +3-R1-seed,R1,15,6,9,9,0,0,2,0,0,0,0,0,0,2, """ self.report.write_nuc_header(self.report_file) @@ -662,15 +660,15 @@ def testSoftClippingNucleotideReport(self): expected_text = """\ seed,region,q-cutoff,query.nuc.pos,refseq.nuc.pos,genome.pos,\ -A,C,G,T,N,del,ins,clip,v3_overlap,coverage -R1-seed,R1,15,,1,1,0,0,0,0,0,0,0,9,0,0 -R1-seed,R1,15,,2,2,0,0,0,0,0,0,0,9,0,0 -R1-seed,R1,15,3,3,3,9,0,0,0,0,0,0,0,0,9 -R1-seed,R1,15,4,4,4,0,0,0,9,0,0,0,0,0,9 -R1-seed,R1,15,5,5,5,0,0,0,9,0,0,0,0,0,9 -R1-seed,R1,15,6,6,6,0,0,0,9,0,0,0,0,0,9 -R1-seed,R1,15,7,7,7,9,0,0,0,0,0,0,0,0,9 -R1-seed,R1,15,,8,8,0,0,0,0,0,0,0,9,0,0 +A,C,G,T,N,del,ins,clip,v3_overlap,coverage,exact_coverage +R1-seed,R1,15,,1,1,0,0,0,0,0,0,0,9,0,0, +R1-seed,R1,15,,2,2,0,0,0,0,0,0,0,9,0,0, +R1-seed,R1,15,3,3,3,9,0,0,0,0,0,0,0,0,9, +R1-seed,R1,15,4,4,4,0,0,0,9,0,0,0,0,0,9,9 +R1-seed,R1,15,5,5,5,0,0,0,9,0,0,0,0,0,9,9 +R1-seed,R1,15,6,6,6,0,0,0,9,0,0,0,0,0,9,9 +R1-seed,R1,15,7,7,7,9,0,0,0,0,0,0,0,0,9, +R1-seed,R1,15,,8,8,0,0,0,0,0,0,0,9,0,0, """ self.report.read_clipping(clipping) @@ -810,13 +808,13 @@ def testInsertionBetweenReadAndConsensusNucleotideReport(self): expected_text = """\ seed,region,q-cutoff,query.nuc.pos,refseq.nuc.pos,genome.pos,\ -A,C,G,T,N,del,ins,clip,v3_overlap,coverage -R1-seed,R1,15,1,1,1,9,0,0,0,0,0,0,0,0,9 -R1-seed,R1,15,2,2,2,9,0,0,0,0,0,0,0,0,9 -R1-seed,R1,15,3,3,3,9,0,0,0,0,0,2,0,0,9 -R1-seed,R1,15,4,4,4,0,0,0,9,0,0,0,0,0,9 -R1-seed,R1,15,5,5,5,0,0,0,9,0,0,0,0,0,9 -R1-seed,R1,15,6,6,6,0,0,0,9,0,0,0,0,0,9 +A,C,G,T,N,del,ins,clip,v3_overlap,coverage,exact_coverage +R1-seed,R1,15,1,1,1,9,0,0,0,0,0,0,0,0,9, +R1-seed,R1,15,2,2,2,9,0,0,0,0,0,0,0,0,9,18 +R1-seed,R1,15,3,3,3,9,0,0,0,0,0,2,0,0,9,18 +R1-seed,R1,15,4,4,4,0,0,0,9,0,0,0,0,0,9,18 +R1-seed,R1,15,5,5,5,0,0,0,9,0,0,0,0,0,9,18 +R1-seed,R1,15,6,6,6,0,0,0,9,0,0,0,0,0,9, """ self.report.read_insertions(conseq_ins_csv) @@ -953,13 +951,13 @@ def testOffsetNucleotideReport(self): expected_text = """\ seed,region,q-cutoff,query.nuc.pos,refseq.nuc.pos,genome.pos,\ -A,C,G,T,N,del,ins,clip,v3_overlap,coverage -R1-seed,R1,15,4,4,4,0,0,0,1,0,0,0,0,0,1 -R1-seed,R1,15,5,5,5,0,0,0,1,0,0,0,0,0,1 -R1-seed,R1,15,6,6,6,0,0,0,9,0,0,0,0,0,9 -R1-seed,R1,15,7,7,7,0,8,0,0,0,0,0,0,0,8 -R1-seed,R1,15,8,8,8,0,0,8,0,0,0,0,0,0,8 -R1-seed,R1,15,9,9,9,8,0,0,0,0,0,0,0,0,8 +A,C,G,T,N,del,ins,clip,v3_overlap,coverage,exact_coverage +R1-seed,R1,15,4,4,4,0,0,0,1,0,0,0,0,0,1,1 +R1-seed,R1,15,5,5,5,0,0,0,1,0,0,0,0,0,1,1 +R1-seed,R1,15,6,6,6,0,0,0,9,0,0,0,0,0,9,1 +R1-seed,R1,15,7,7,7,0,8,0,0,0,0,0,0,0,8, +R1-seed,R1,15,8,8,8,0,0,8,0,0,0,0,0,0,8, +R1-seed,R1,15,9,9,9,8,0,0,0,0,0,0,0,0,8, """ self.report.read(aligned_reads) @@ -976,12 +974,12 @@ def testPartialCodonNucleotideReport(self): expected_text = """\ seed,region,q-cutoff,query.nuc.pos,refseq.nuc.pos,genome.pos,\ -A,C,G,T,N,del,ins,clip,v3_overlap,coverage -R1-seed,R1,15,1,1,1,9,0,0,0,0,0,0,0,0,9 -R1-seed,R1,15,2,2,2,9,0,0,0,0,0,0,0,0,9 -R1-seed,R1,15,3,3,3,9,0,0,0,0,0,0,0,0,9 -R1-seed,R1,15,4,4,4,0,0,0,9,0,0,0,0,0,9 -R1-seed,R1,15,5,5,5,0,0,0,9,0,0,0,0,0,9 +A,C,G,T,N,del,ins,clip,v3_overlap,coverage,exact_coverage +R1-seed,R1,15,1,1,1,9,0,0,0,0,0,0,0,0,9, +R1-seed,R1,15,2,2,2,9,0,0,0,0,0,0,0,0,9,9 +R1-seed,R1,15,3,3,3,9,0,0,0,0,0,0,0,0,9,18 +R1-seed,R1,15,4,4,4,0,0,0,9,0,0,0,0,0,9,18 +R1-seed,R1,15,5,5,5,0,0,0,9,0,0,0,0,0,9,9 """ self.report.read(aligned_reads) @@ -998,12 +996,12 @@ def testPartialStartCodonNucleotideReport(self): expected_text = """\ seed,region,q-cutoff,query.nuc.pos,refseq.nuc.pos,genome.pos,\ -A,C,G,T,N,del,ins,clip,v3_overlap,coverage -R1-seed,R1,15,1,5,5,0,0,0,9,0,0,0,0,0,9 -R1-seed,R1,15,2,6,6,0,0,0,9,0,0,0,0,0,9 -R1-seed,R1,15,3,7,7,9,0,0,0,0,0,0,0,0,9 -R1-seed,R1,15,4,8,8,0,0,9,0,0,0,0,0,0,9 -R1-seed,R1,15,5,9,9,0,0,9,0,0,0,0,0,0,9 +A,C,G,T,N,del,ins,clip,v3_overlap,coverage,exact_coverage +R1-seed,R1,15,1,5,5,0,0,0,9,0,0,0,0,0,9, +R1-seed,R1,15,2,6,6,0,0,0,9,0,0,0,0,0,9, +R1-seed,R1,15,3,7,7,9,0,0,0,0,0,0,0,0,9, +R1-seed,R1,15,4,8,8,0,0,9,0,0,0,0,0,0,9, +R1-seed,R1,15,5,9,9,0,0,9,0,0,0,0,0,0,9, """ self.report.read(aligned_reads) @@ -1042,13 +1040,13 @@ def testLowQualityNucleotideReport(self): expected_text = """\ seed,region,q-cutoff,query.nuc.pos,refseq.nuc.pos,genome.pos,\ -A,C,G,T,N,del,ins,clip,v3_overlap,coverage -R1-seed,R1,15,1,1,1,9,0,0,0,0,0,0,0,0,9 -R1-seed,R1,15,2,2,2,9,0,0,0,0,0,0,0,0,9 -R1-seed,R1,15,3,3,3,9,0,0,0,0,0,0,0,0,9 -R1-seed,R1,15,4,4,4,0,0,0,9,0,0,0,0,0,9 -R1-seed,R1,15,5,5,5,0,0,0,0,9,0,0,0,0,0 -R1-seed,R1,15,6,6,6,0,0,0,9,0,0,0,0,0,9 +A,C,G,T,N,del,ins,clip,v3_overlap,coverage,exact_coverage +R1-seed,R1,15,1,1,1,9,0,0,0,0,0,0,0,0,9, +R1-seed,R1,15,2,2,2,9,0,0,0,0,0,0,0,0,9, +R1-seed,R1,15,3,3,3,9,0,0,0,0,0,0,0,0,9, +R1-seed,R1,15,4,4,4,0,0,0,9,0,0,0,0,0,9, +R1-seed,R1,15,5,5,5,0,0,0,0,9,0,0,0,0,0, +R1-seed,R1,15,6,6,6,0,0,0,9,0,0,0,0,0,9, """ self.report.read(aligned_reads) @@ -1135,16 +1133,16 @@ def testShiftedReadingFrameNucleotideReport(self): expected_text = """\ seed,region,q-cutoff,query.nuc.pos,refseq.nuc.pos,genome.pos,\ -A,C,G,T,N,del,ins,clip,v3_overlap,coverage -R1-seed,R1,15,2,1,1,9,0,0,0,0,0,0,0,0,9 -R1-seed,R1,15,3,2,2,9,0,0,0,0,0,0,0,0,9 -R1-seed,R1,15,4,3,3,9,0,0,0,0,0,0,0,0,9 -R1-seed,R1,15,5,4,4,0,0,0,9,0,0,0,0,0,9 -R1-seed,R1,15,6,5,5,0,0,0,9,0,0,0,0,0,9 -R1-seed,R1,15,7,6,6,0,0,0,9,0,0,0,0,0,9 -R1-seed,R1,15,8,7,7,0,9,0,0,0,0,0,0,0,9 -R1-seed,R1,15,9,8,8,0,0,9,0,0,0,0,0,0,9 -R1-seed,R1,15,10,9,9,9,0,0,0,0,0,0,0,0,9 +A,C,G,T,N,del,ins,clip,v3_overlap,coverage,exact_coverage +R1-seed,R1,15,2,1,1,9,0,0,0,0,0,0,0,0,9, +R1-seed,R1,15,3,2,2,9,0,0,0,0,0,0,0,0,9, +R1-seed,R1,15,4,3,3,9,0,0,0,0,0,0,0,0,9, +R1-seed,R1,15,5,4,4,0,0,0,9,0,0,0,0,0,9, +R1-seed,R1,15,6,5,5,0,0,0,9,0,0,0,0,0,9, +R1-seed,R1,15,7,6,6,0,0,0,9,0,0,0,0,0,9, +R1-seed,R1,15,8,7,7,0,9,0,0,0,0,0,0,0,9, +R1-seed,R1,15,9,8,8,0,0,9,0,0,0,0,0,0,9, +R1-seed,R1,15,10,9,9,9,0,0,0,0,0,0,0,0,9, """ self.report.read(aligned_reads) @@ -1166,16 +1164,16 @@ def testDeletionNucleotideReport(self): expected_text = """\ seed,region,q-cutoff,query.nuc.pos,refseq.nuc.pos,genome.pos,\ -A,C,G,T,N,del,ins,clip,v3_overlap,coverage -R1-seed,R1,15,1,1,1,9,0,0,0,0,0,0,0,0,9 -R1-seed,R1,15,2,2,2,9,0,0,0,0,0,0,0,0,9 -R1-seed,R1,15,3,3,3,9,0,0,0,0,0,0,0,0,9 -R1-seed,R1,15,4,4,4,0,0,0,0,0,9,0,0,0,9 -R1-seed,R1,15,5,5,5,0,0,0,0,0,9,0,0,0,9 -R1-seed,R1,15,6,6,6,0,0,0,0,0,9,0,0,0,9 -R1-seed,R1,15,7,7,7,9,0,0,0,0,0,0,0,0,9 -R1-seed,R1,15,8,8,8,0,0,9,0,0,0,0,0,0,9 -R1-seed,R1,15,9,9,9,0,0,9,0,0,0,0,0,0,9 +A,C,G,T,N,del,ins,clip,v3_overlap,coverage,exact_coverage +R1-seed,R1,15,1,1,1,9,0,0,0,0,0,0,0,0,9, +R1-seed,R1,15,2,2,2,9,0,0,0,0,0,0,0,0,9, +R1-seed,R1,15,3,3,3,9,0,0,0,0,0,0,0,0,9, +R1-seed,R1,15,4,4,4,0,0,0,0,0,9,0,0,0,9, +R1-seed,R1,15,5,5,5,0,0,0,0,0,9,0,0,0,9, +R1-seed,R1,15,6,6,6,0,0,0,0,0,9,0,0,0,9, +R1-seed,R1,15,7,7,7,9,0,0,0,0,0,0,0,0,9, +R1-seed,R1,15,8,8,8,0,0,9,0,0,0,0,0,0,9, +R1-seed,R1,15,9,9,9,0,0,9,0,0,0,0,0,0,9, """ self.report.read(aligned_reads) @@ -1203,31 +1201,31 @@ def testDeletionBetweenSeedAndCoordinateNucleotideReport(self): expected_text = """\ seed,region,q-cutoff,query.nuc.pos,refseq.nuc.pos,genome.pos,\ -A,C,G,T,N,del,ins,clip,v3_overlap,coverage -R3-seed,R3,15,1,1,1,9,0,0,0,0,0,0,0,0,9 -R3-seed,R3,15,2,2,2,9,0,0,0,0,0,0,0,0,9 -R3-seed,R3,15,3,3,3,9,0,0,0,0,0,0,0,0,9 -R3-seed,R3,15,4,4,4,0,0,0,9,0,0,0,0,0,9 -R3-seed,R3,15,5,5,5,0,0,0,9,0,0,0,0,0,9 -R3-seed,R3,15,6,6,6,0,0,0,9,0,0,0,0,0,9 -R3-seed,R3,15,7,7,7,0,9,0,0,0,0,0,0,0,9 -R3-seed,R3,15,8,8,8,9,0,0,0,0,0,0,0,0,9 -R3-seed,R3,15,9,9,9,0,0,9,0,0,0,0,0,0,9 -R3-seed,R3,15,,10,10,0,0,0,0,0,9,0,0,0,9 -R3-seed,R3,15,,11,11,0,0,0,0,0,9,0,0,0,9 -R3-seed,R3,15,,12,12,0,0,0,0,0,9,0,0,0,9 -R3-seed,R3,15,10,13,13,0,9,0,0,0,0,0,0,0,9 -R3-seed,R3,15,11,14,14,0,9,0,0,0,0,0,0,0,9 -R3-seed,R3,15,12,15,15,9,0,0,0,0,0,0,0,0,9 -R3-seed,R3,15,13,16,16,0,9,0,0,0,0,0,0,0,9 -R3-seed,R3,15,14,17,17,0,0,9,0,0,0,0,0,0,9 -R3-seed,R3,15,15,18,18,9,0,0,0,0,0,0,0,0,9 -R3-seed,R3,15,16,19,19,0,0,9,0,0,0,0,0,0,9 -R3-seed,R3,15,17,20,20,9,0,0,0,0,0,0,0,0,9 -R3-seed,R3,15,18,21,21,0,0,9,0,0,0,0,0,0,9 -R3-seed,R3,15,19,22,22,0,9,0,0,0,0,0,0,0,9 -R3-seed,R3,15,20,23,23,9,0,0,0,0,0,0,0,0,9 -R3-seed,R3,15,21,24,24,0,0,0,9,0,0,0,0,0,9 +A,C,G,T,N,del,ins,clip,v3_overlap,coverage,exact_coverage +R3-seed,R3,15,1,1,1,9,0,0,0,0,0,0,0,0,9, +R3-seed,R3,15,2,2,2,9,0,0,0,0,0,0,0,0,9, +R3-seed,R3,15,3,3,3,9,0,0,0,0,0,0,0,0,9, +R3-seed,R3,15,4,4,4,0,0,0,9,0,0,0,0,0,9, +R3-seed,R3,15,5,5,5,0,0,0,9,0,0,0,0,0,9, +R3-seed,R3,15,6,6,6,0,0,0,9,0,0,0,0,0,9, +R3-seed,R3,15,7,7,7,0,9,0,0,0,0,0,0,0,9, +R3-seed,R3,15,8,8,8,9,0,0,0,0,0,0,0,0,9, +R3-seed,R3,15,9,9,9,0,0,9,0,0,0,0,0,0,9, +R3-seed,R3,15,,10,10,0,0,0,0,0,9,0,0,0,9, +R3-seed,R3,15,,11,11,0,0,0,0,0,9,0,0,0,9, +R3-seed,R3,15,,12,12,0,0,0,0,0,9,0,0,0,9, +R3-seed,R3,15,10,13,13,0,9,0,0,0,0,0,0,0,9, +R3-seed,R3,15,11,14,14,0,9,0,0,0,0,0,0,0,9, +R3-seed,R3,15,12,15,15,9,0,0,0,0,0,0,0,0,9, +R3-seed,R3,15,13,16,16,0,9,0,0,0,0,0,0,0,9, +R3-seed,R3,15,14,17,17,0,0,9,0,0,0,0,0,0,9, +R3-seed,R3,15,15,18,18,9,0,0,0,0,0,0,0,0,9, +R3-seed,R3,15,16,19,19,0,0,9,0,0,0,0,0,0,9, +R3-seed,R3,15,17,20,20,9,0,0,0,0,0,0,0,0,9, +R3-seed,R3,15,18,21,21,0,0,9,0,0,0,0,0,0,9, +R3-seed,R3,15,19,22,22,0,9,0,0,0,0,0,0,0,9, +R3-seed,R3,15,20,23,23,9,0,0,0,0,0,0,0,0,9, +R3-seed,R3,15,21,24,24,0,0,0,9,0,0,0,0,0,9, """ self.report.read(aligned_reads) @@ -1416,31 +1414,31 @@ def testInsertionBetweenSeedAndCoordinateNucleotideReport(self): """) expected_text = """\ seed,region,q-cutoff,query.nuc.pos,refseq.nuc.pos,genome.pos,\ -A,C,G,T,N,del,ins,clip,v3_overlap,coverage -R3-seed,R3,15,10,1,1,9,0,0,0,0,0,0,0,0,9 -R3-seed,R3,15,11,2,2,9,0,0,0,0,0,0,0,0,9 -R3-seed,R3,15,12,3,3,9,0,0,0,0,0,0,0,0,9 -R3-seed,R3,15,13,4,4,0,0,0,9,0,0,0,0,0,9 -R3-seed,R3,15,14,5,5,0,0,0,9,0,0,0,0,0,9 -R3-seed,R3,15,15,6,6,0,0,0,9,0,0,0,0,0,9 -R3-seed,R3,15,16,7,7,0,9,0,0,0,0,0,0,0,9 -R3-seed,R3,15,17,8,8,9,0,0,0,0,0,0,0,0,9 -R3-seed,R3,15,18,9,9,0,0,9,0,0,0,0,0,0,9 -R3-seed,R3,15,19,10,10,9,0,0,0,0,0,0,0,0,9 -R3-seed,R3,15,20,11,11,0,9,0,0,0,0,0,0,0,9 -R3-seed,R3,15,21,12,12,0,0,0,9,0,0,9,0,0,9 -R3-seed,R3,15,25,13,13,0,9,0,0,0,0,0,0,0,9 -R3-seed,R3,15,26,14,14,0,9,0,0,0,0,0,0,0,9 -R3-seed,R3,15,27,15,15,0,9,0,0,0,0,0,0,0,9 -R3-seed,R3,15,28,16,16,0,9,0,0,0,0,0,0,0,9 -R3-seed,R3,15,29,17,17,0,0,9,0,0,0,0,0,0,9 -R3-seed,R3,15,30,18,18,9,0,0,0,0,0,0,0,0,9 -R3-seed,R3,15,31,19,19,0,0,9,0,0,0,0,0,0,9 -R3-seed,R3,15,32,20,20,9,0,0,0,0,0,0,0,0,9 -R3-seed,R3,15,33,21,21,0,0,9,0,0,0,0,0,0,9 -R3-seed,R3,15,34,22,22,0,9,0,0,0,0,0,0,0,9 -R3-seed,R3,15,35,23,23,9,0,0,0,0,0,0,0,0,9 -R3-seed,R3,15,36,24,24,0,0,0,9,0,0,0,0,0,9 +A,C,G,T,N,del,ins,clip,v3_overlap,coverage,exact_coverage +R3-seed,R3,15,10,1,1,9,0,0,0,0,0,0,0,0,9, +R3-seed,R3,15,11,2,2,9,0,0,0,0,0,0,0,0,9, +R3-seed,R3,15,12,3,3,9,0,0,0,0,0,0,0,0,9, +R3-seed,R3,15,13,4,4,0,0,0,9,0,0,0,0,0,9, +R3-seed,R3,15,14,5,5,0,0,0,9,0,0,0,0,0,9, +R3-seed,R3,15,15,6,6,0,0,0,9,0,0,0,0,0,9, +R3-seed,R3,15,16,7,7,0,9,0,0,0,0,0,0,0,9, +R3-seed,R3,15,17,8,8,9,0,0,0,0,0,0,0,0,9, +R3-seed,R3,15,18,9,9,0,0,9,0,0,0,0,0,0,9, +R3-seed,R3,15,19,10,10,9,0,0,0,0,0,0,0,0,9, +R3-seed,R3,15,20,11,11,0,9,0,0,0,0,0,0,0,9, +R3-seed,R3,15,21,12,12,0,0,0,9,0,0,9,0,0,9, +R3-seed,R3,15,25,13,13,0,9,0,0,0,0,0,0,0,9, +R3-seed,R3,15,26,14,14,0,9,0,0,0,0,0,0,0,9, +R3-seed,R3,15,27,15,15,0,9,0,0,0,0,0,0,0,9, +R3-seed,R3,15,28,16,16,0,9,0,0,0,0,0,0,0,9, +R3-seed,R3,15,29,17,17,0,0,9,0,0,0,0,0,0,9, +R3-seed,R3,15,30,18,18,9,0,0,0,0,0,0,0,0,9, +R3-seed,R3,15,31,19,19,0,0,9,0,0,0,0,0,0,9, +R3-seed,R3,15,32,20,20,9,0,0,0,0,0,0,0,0,9, +R3-seed,R3,15,33,21,21,0,0,9,0,0,0,0,0,0,9, +R3-seed,R3,15,34,22,22,0,9,0,0,0,0,0,0,0,9, +R3-seed,R3,15,35,23,23,9,0,0,0,0,0,0,0,0,9, +R3-seed,R3,15,36,24,24,0,0,0,9,0,0,0,0,0,9, """ self.report.read(aligned_reads) diff --git a/micall/tests/test_aln2counts_exact_coverage.py b/micall/tests/test_aln2counts_exact_coverage.py new file mode 100644 index 000000000..a45c2e648 --- /dev/null +++ b/micall/tests/test_aln2counts_exact_coverage.py @@ -0,0 +1,627 @@ +import csv +from io import StringIO + +from micall.core.aln2counts import aln2counts + +# Import fixtures +from micall.tests.test_aln2counts_report import default_sequence_report # noqa: F401 +from micall.tests.test_remap import load_projects + +assert load_projects + + +def test_exact_coverage_with_remap_conseq(): + """Test that exact_coverage column is populated when remap_conseq_csv is provided.""" + # Use a seed name that exists in the default project config + seed_name = "HIV1-B-FR-K03455-seed" + aligned_csv = StringIO(f"""\ +refname,qcut,rank,count,offset,seq +{seed_name},15,0,5,0,AAATTTCCC +{seed_name},15,0,5,0,AAATTTCCC +{seed_name},15,0,5,0,AAATTTCCC +""") + remap_conseq_csv = StringIO(f"""\ +region,sequence +{seed_name},AAATTTCCC +""") + nuc_csv = StringIO() + amino_csv = StringIO() + insertions_csv = StringIO() + conseq_csv = StringIO() + failed_align_csv = StringIO() + coverage_summary_csv = StringIO() + aln2counts(aligned_csv=aligned_csv, + nuc_csv=nuc_csv, + amino_csv=amino_csv, + insertions_csv=insertions_csv, + conseq_csv=conseq_csv, + failed_align_csv=failed_align_csv, + coverage_summary_csv=coverage_summary_csv, + remap_conseq_csv=remap_conseq_csv) + nuc_csv.seek(0) + reader = csv.DictReader(nuc_csv) + rows = list(reader) + assert len(rows) > 0, "Should have nuc rows" + assert 'exact_coverage' in rows[0], "Should have exact_coverage column" + exact_coverages = [row['exact_coverage'] for row in rows] + non_empty = [ec for ec in exact_coverages if ec and ec.strip()] + assert len(non_empty) > 0, f"Should have some non-empty exact_coverage values, got: {exact_coverages}" + for ec in non_empty: + assert ec.isdigit(), f"exact_coverage should be numeric, got: {ec}" + assert int(ec) > 0, f"exact_coverage should be positive, got: {ec}" + +def test_exact_coverage_without_remap_conseq(): + """Test that exact_coverage column is empty when remap_conseq_csv is NOT provided.""" + # Use a known seed from projects + aligned_csv = StringIO("""refname,qcut,rank,count,offset,seq +HIV1-B-FR-K03455-seed,15,0,5,0,AAATTT +""") + nuc_csv = StringIO() + amino_csv = StringIO() + insertions_csv = StringIO() + conseq_csv = StringIO() + failed_align_csv = StringIO() + coverage_summary_csv = StringIO() + aln2counts(aligned_csv=aligned_csv, + nuc_csv=nuc_csv, + amino_csv=amino_csv, + insertions_csv=insertions_csv, + conseq_csv=conseq_csv, + failed_align_csv=failed_align_csv, + coverage_summary_csv=coverage_summary_csv, + remap_conseq_csv=None) + nuc_csv.seek(0) + reader = csv.DictReader(nuc_csv) + rows = list(reader) + assert len(rows) > 0, "Should have nuc rows" + assert 'exact_coverage' in rows[0], "Should have exact_coverage column" + exact_coverages = [row['exact_coverage'] for row in rows] + assert all(not ec or not ec.strip() for ec in exact_coverages), \ + f"exact_coverage should be empty without remap_conseq_csv, got: {exact_coverages}" + +def test_exact_coverage_multiple_contigs(): + """Test exact_coverage with multiple contigs.""" + # Use two different HIV seeds + seed1 = "HIV1-B-FR-K03455-seed" + seed2 = "HIV1-CRF02_AG-GH-AB286855-seed" + aligned_csv = StringIO(f"""\ +refname,qcut,rank,count,offset,seq +{seed1},15,0,3,0,AAATTTCCCCCCC +{seed1},15,0,3,0,AAATTTCCACCCC +{seed2},15,0,2,0,GGGCCCAAACCCC +{seed2},15,0,2,0,GGGCCCAATCCCC +""") + remap_conseq_csv = StringIO(f"""\ +region,sequence +{seed1},AAATTTCCCCCCC +{seed2},GGGCCCAAACCCC +""") + nuc_csv = StringIO() + amino_csv = StringIO() + insertions_csv = StringIO() + conseq_csv = StringIO() + failed_align_csv = StringIO() + coverage_summary_csv = StringIO() + aln2counts(aligned_csv=aligned_csv, + nuc_csv=nuc_csv, + amino_csv=amino_csv, + insertions_csv=insertions_csv, + conseq_csv=conseq_csv, + failed_align_csv=failed_align_csv, + coverage_summary_csv=coverage_summary_csv, + remap_conseq_csv=remap_conseq_csv) + + nuc_csv.seek(0) + contents = nuc_csv.read() + assert contents != [], "Nuc CSV should not be empty" + + nuc_csv.seek(0) + reader = csv.DictReader(nuc_csv) + rows = list(reader) + by_seed = {} + for row in rows: + seed = row['seed'] + if seed not in by_seed: + by_seed[seed] = [] + by_seed[seed].append(row) + assert seed1 in by_seed, f"Should have {seed1}" + assert seed2 in by_seed, f"Should have {seed2}" + for seed in [seed1, seed2]: + exact_coverages = [row['exact_coverage'] for row in by_seed[seed]] + non_empty = [ec for ec in exact_coverages if ec and ec.strip()] + assert len(non_empty) > 0, f"Contig {seed} should have non-empty exact_coverage" + + +def test_exact_coverage_multiple_contigs_different_numbers(): + """Test exact_coverage with multiple contigs.""" + # Use two different HIV seeds + seed1 = "HIV1-B-FR-K03455-seed" + seed2 = "HIV1-CRF02_AG-GH-AB286855-seed" + aligned_csv = StringIO(f"""\ +refname,qcut,rank,count,offset,seq +{seed1},15,0,3,0,AAATTTCCC +{seed1},15,0,3,0,AAATTTCCC +{seed2},15,0,2,0,GGGCCCAAA +{seed2},15,0,2,0,GGGCCCAAA +""") + remap_conseq_csv = StringIO(f"""\ +region,sequence +{seed1},AAATTTCCC +{seed2},GGGCCCAAA +""") + nuc_csv = StringIO() + amino_csv = StringIO() + insertions_csv = StringIO() + conseq_csv = StringIO() + failed_align_csv = StringIO() + coverage_summary_csv = StringIO() + aln2counts(aligned_csv=aligned_csv, + nuc_csv=nuc_csv, + amino_csv=amino_csv, + insertions_csv=insertions_csv, + conseq_csv=conseq_csv, + failed_align_csv=failed_align_csv, + coverage_summary_csv=coverage_summary_csv, + remap_conseq_csv=remap_conseq_csv) + + nuc_csv.seek(0) + contents = nuc_csv.read() + assert contents != [], "Nuc CSV should not be empty" + + nuc_csv.seek(0) + reader = csv.DictReader(nuc_csv) + rows = list(reader) + by_seed = {} + for row in rows: + seed = row['seed'] + if seed not in by_seed: + by_seed[seed] = [] + by_seed[seed].append(row) + assert seed1 in by_seed, f"Should have {seed1}" + assert seed2 in by_seed, f"Should have {seed2}" + for seed in [seed1, seed2]: + exact_coverages = [row['exact_coverage'] for row in by_seed[seed]] + non_empty = [ec for ec in exact_coverages if ec and ec.strip()] + assert len(non_empty) > 0, f"Contig {seed} should have non-empty exact_coverage" + + +def test_exact_coverage_accumulation_and_name_mapping(): + """ + Test that exact_coverage accumulates when multiple contigs with different + prefixes map to the same seed name. + """ + seed_name = "HIV1-B-FR-K03455-seed" + # Contig 1: count 5, palindrome read -> 10 coverage + # Contig 2: count 2, palindrome read -> 4 coverage + # Both should map to seed-name. + aligned_csv = StringIO(f"""\ +refname,qcut,rank,count,offset,seq +1-{seed_name},15,0,5,0,AAATTT +2-{seed_name},15,0,2,0,AAATTT +""") + remap_conseq_csv = StringIO(f"""\ +region,sequence +{seed_name},AAATTT +1-{seed_name},AAATTT +2-{seed_name},AAATTT +""") + nuc_csv = StringIO() + nuc_detail_csv = StringIO() + amino_csv = StringIO() + insertions_csv = StringIO() + conseq_csv = StringIO() + failed_align_csv = StringIO() + # Pass nuc_detail_csv to trigger combine_reports logic + aln2counts(aligned_csv=aligned_csv, + nuc_csv=nuc_csv, + nuc_detail_csv=nuc_detail_csv, + amino_csv=amino_csv, + insertions_csv=insertions_csv, + conseq_csv=conseq_csv, + failed_align_csv=failed_align_csv, + remap_conseq_csv=remap_conseq_csv) + nuc_csv.seek(0) + contents = nuc_csv.read() + assert contents != [], "Nuc CSV should not be empty" + + nuc_csv.seek(0) + reader = csv.DictReader(nuc_csv) + rows = list(reader) + assert all(r['seed'] == seed_name for r in rows), f"All rows should have seed {seed_name}" + assert len(rows) > 0, "Should have at least one row" + # Find a row with non-empty exact_coverage + row_with_coverage = next((r for r in rows if r.get('exact_coverage') and r['exact_coverage'].strip()), None) + assert row_with_coverage is not None, f"Should have at least one row with exact_coverage, got rows: {[(r['refseq.nuc.pos'], r['query.nuc.pos'], r['exact_coverage']) for r in rows]}" + ec = row_with_coverage['exact_coverage'] + assert ec.isdigit(), f"Exact coverage should be numeric, got: {ec}" + # Expected: 5*2 (count 5, palindrome) + 2*2 (count 2, palindrome) = 14 + assert int(ec) == 14, f"Exact coverage should be 14 (5*2 + 2*2), got: {ec}" + assert int(ec) == 14, f"Expected accumulated coverage 14 (5*2 + 2*2 for palindrome reads), got {ec}" + + +def test_no_contamination_between_seeds(): + """ + Critical: Ensure coverage from one seed does NOT leak to another. + Uses non-palindromic sequences to avoid doubling from reverse-complement matching. + """ + seed1 = "HIV1-B-FR-K03455-seed" + seed2 = "HIV1-CRF02_AG-GH-AB286855-seed" + + # Non-palindromic sequences + aligned_csv = StringIO(f"""\ +refname,qcut,rank,count,offset,seq +{seed1},15,0,10,0,AAACCCGGG +{seed2},15,0,20,0,GGGCCCAAA +""") + remap_conseq_csv = StringIO(f"""\ +region,sequence +{seed1},AAACCCGGG +{seed2},GGGCCCAAA +""") + nuc_csv = StringIO() + amino_csv = StringIO() + insertions_csv = StringIO() + conseq_csv = StringIO() + failed_align_csv = StringIO() + + aln2counts( + aligned_csv=aligned_csv, + nuc_csv=nuc_csv, + amino_csv=amino_csv, + insertions_csv=insertions_csv, + conseq_csv=conseq_csv, + failed_align_csv=failed_align_csv, + remap_conseq_csv=remap_conseq_csv, + ) + + nuc_csv.seek(0) + reader = csv.DictReader(nuc_csv) + rows = list(reader) + + # Group by seed + by_seed = {} + for row in rows: + seed = row["seed"] + if seed not in by_seed: + by_seed[seed] = [] + by_seed[seed].append(row) + + # Get max coverages + seed1_coverages = [ + int(r["exact_coverage"]) + for r in by_seed[seed1] + if r["exact_coverage"] and r["exact_coverage"].strip() + ] + seed2_coverages = [ + int(r["exact_coverage"]) + for r in by_seed[seed2] + if r["exact_coverage"] and r["exact_coverage"].strip() + ] + + # Seed1 with count=10 should have coverage 10 + # Seed2 with count=20 should have coverage 20 + # They should NOT be equal (no contamination) + assert len(seed1_coverages) > 0, "seed1 should have coverage" + assert len(seed2_coverages) > 0, "seed2 should have coverage" + + max1 = max(seed1_coverages) + max2 = max(seed2_coverages) + + assert max1 == 10, f"seed1 max coverage should be 10, got {max1}" + assert max2 == 20, f"seed2 max coverage should be 20, got {max2}" + assert max1 != max2, "Coverages should be different (no contamination)" + + +def test_prefixes_accumulate_correctly(): + """ + Critical: Multiple prefixed contigs (1-seed, 2-seed) should accumulate + to the base seed with correct total coverage. + """ + seed_name = "HIV1-B-FR-K03455-seed" + + aligned_csv = StringIO(f"""\ +refname,qcut,rank,count,offset,seq +1-{seed_name},15,0,7,0,AAATTTCCC +2-{seed_name},15,0,3,0,AAATTTCCC +""") + remap_conseq_csv = StringIO(f"""\ +region,sequence +{seed_name},AAATTTCCC +1-{seed_name},AAATTTCCC +2-{seed_name},AAATTTCCC +""") + nuc_csv = StringIO() + nuc_detail_csv = StringIO() + amino_csv = StringIO() + insertions_csv = StringIO() + conseq_csv = StringIO() + failed_align_csv = StringIO() + + aln2counts( + aligned_csv=aligned_csv, + nuc_csv=nuc_csv, + nuc_detail_csv=nuc_detail_csv, + amino_csv=amino_csv, + insertions_csv=insertions_csv, + conseq_csv=conseq_csv, + failed_align_csv=failed_align_csv, + remap_conseq_csv=remap_conseq_csv, + ) + + nuc_csv.seek(0) + reader = csv.DictReader(nuc_csv) + rows = list(reader) + + # All rows should map to base seed (without prefix) + assert all(r["seed"] == seed_name for r in rows), ( + "All rows should have base seed name" + ) + + # Get coverage values + coverages = [ + int(r["exact_coverage"]) + for r in rows + if r["exact_coverage"] and r["exact_coverage"].strip() + ] + + # Total should be 7 + 3 = 10 + assert len(coverages) > 0, "Should have coverage values" + assert max(coverages) == 10, ( + f"Max coverage should be 10 (7+3), got {max(coverages)}" + ) + + +def test_offset_reads_included(): + """ + Reads with any offset should contribute to exact_coverage. + The offset just indicates where the alignment started, but exact coverage + does its own k-mer based matching independent of alignment position. + """ + seed_name = "HIV1-B-FR-K03455-seed" + + aligned_csv = StringIO(f"""\ +refname,qcut,rank,count,offset,seq +{seed_name},15,0,10,0,AAATTTCCC +{seed_name},15,0,50,5,AAATTTCCC +""") + remap_conseq_csv = StringIO(f"""\ +region,sequence +{seed_name},AAATTTCCC +""") + nuc_csv = StringIO() + amino_csv = StringIO() + insertions_csv = StringIO() + conseq_csv = StringIO() + failed_align_csv = StringIO() + + aln2counts( + aligned_csv=aligned_csv, + nuc_csv=nuc_csv, + amino_csv=amino_csv, + insertions_csv=insertions_csv, + conseq_csv=conseq_csv, + failed_align_csv=failed_align_csv, + remap_conseq_csv=remap_conseq_csv, + ) + + nuc_csv.seek(0) + reader = csv.DictReader(nuc_csv) + rows = list(reader) + + coverages = [ + int(r["exact_coverage"]) + for r in rows + if r["exact_coverage"] and r["exact_coverage"].strip() + ] + + # Should have coverage from BOTH reads (10 + 50 = 60) + # regardless of offset, since exact coverage does k-mer matching + assert max(coverages) == 60, ( + f"Max coverage should be 60 (10+50 from both reads), got {max(coverages)}" + ) + + +def test_mismatched_reads_excluded(): + """ + Critical: Reads with mismatches should NOT contribute to exact_coverage. + """ + seed_name = "HIV1-B-FR-K03455-seed" + + aligned_csv = StringIO(f"""\ +refname,qcut,rank,count,offset,seq +{seed_name},15,0,10,0,AAATTTCCC +{seed_name},15,0,50,0,AAATTTCCT +{seed_name},15,0,30,0,AAATATCCC +""") + remap_conseq_csv = StringIO(f"""\ +region,sequence +{seed_name},AAATTTCCC +""") + nuc_csv = StringIO() + amino_csv = StringIO() + insertions_csv = StringIO() + conseq_csv = StringIO() + failed_align_csv = StringIO() + + aln2counts( + aligned_csv=aligned_csv, + nuc_csv=nuc_csv, + amino_csv=amino_csv, + insertions_csv=insertions_csv, + conseq_csv=conseq_csv, + failed_align_csv=failed_align_csv, + remap_conseq_csv=remap_conseq_csv, + ) + + nuc_csv.seek(0) + reader = csv.DictReader(nuc_csv) + rows = list(reader) + + coverages = [ + int(r["exact_coverage"]) + for r in rows + if r["exact_coverage"] and r["exact_coverage"].strip() + ] + + # Should only count the exact match (count=10) + assert max(coverages) == 10, ( + f"Max coverage should be 10 (exact matches only), got {max(coverages)}" + ) + + +def test_query_positions_consistent(): + """ + Critical: query.nuc.pos should be 1-indexed and consistent across combined reports. + """ + seed_name = "HIV1-B-FR-K03455-seed" + + aligned_csv = StringIO(f"""\ +refname,qcut,rank,count,offset,seq +1-{seed_name},15,0,5,0,AAATTT +2-{seed_name},15,0,2,0,AAATTT +""") + remap_conseq_csv = StringIO(f"""\ +region,sequence +{seed_name},AAATTT +1-{seed_name},AAATTT +2-{seed_name},AAATTT +""") + nuc_csv = StringIO() + nuc_detail_csv = StringIO() + amino_csv = StringIO() + insertions_csv = StringIO() + conseq_csv = StringIO() + failed_align_csv = StringIO() + + aln2counts( + aligned_csv=aligned_csv, + nuc_csv=nuc_csv, + nuc_detail_csv=nuc_detail_csv, + amino_csv=amino_csv, + insertions_csv=insertions_csv, + conseq_csv=conseq_csv, + failed_align_csv=failed_align_csv, + remap_conseq_csv=remap_conseq_csv, + ) + + nuc_csv.seek(0) + reader = csv.DictReader(nuc_csv) + rows = list(reader) + + # Get query positions + query_positions = [int(r["query.nuc.pos"]) for r in rows if r["query.nuc.pos"]] + + # Should be 1-indexed and consecutive + assert min(query_positions) == 1, "query.nuc.pos should start at 1" + assert max(query_positions) == 6, "query.nuc.pos should end at 6" + assert sorted(query_positions) == [1, 2, 3, 4, 5, 6], ( + "Positions should be consecutive" + ) + + # Verify coverage is at correct positions + coverage_by_pos = {} + for row in rows: + if ( + row["query.nuc.pos"] + and row["exact_coverage"] + and row["exact_coverage"].strip() + ): + pos = int(row["query.nuc.pos"]) + cov = int(row["exact_coverage"]) + coverage_by_pos[pos] = cov + + # Should have coverage at some middle positions + assert len(coverage_by_pos) > 0, "Should have coverage at some positions" + # Check what values we got + if coverage_by_pos: + # With 6bp read and overlap_size = 6//4 = 1, edges are trimmed + # Middle positions should have full coverage (5+2=7) + # But may vary due to edge trimming + print(f"coverage_by_pos: {coverage_by_pos}") + # Just verify we have reasonable coverage values + assert max(coverage_by_pos.values()) > 0, "Should have some coverage" + + +def test_independent_seed_position_spaces(): + """ + Critical: Different seeds have independent position numbering. + Uses non-palindromic sequences to test actual coverage values. + """ + seed1 = "HIV1-B-FR-K03455-seed" + seed2 = "HIV1-CRF02_AG-GH-AB286855-seed" + + # seed1: 6bp, seed2: 9bp - non-palindromic + aligned_csv = StringIO(f"""\ +refname,qcut,rank,count,offset,seq +{seed1},15,0,10,0,AAACCC +{seed2},15,0,20,0,GGGAAACCC +""") + remap_conseq_csv = StringIO(f"""\ +region,sequence +{seed1},AAACCC +{seed2},GGGAAACCC +""") + nuc_csv = StringIO() + amino_csv = StringIO() + insertions_csv = StringIO() + conseq_csv = StringIO() + failed_align_csv = StringIO() + + aln2counts( + aligned_csv=aligned_csv, + nuc_csv=nuc_csv, + amino_csv=amino_csv, + insertions_csv=insertions_csv, + conseq_csv=conseq_csv, + failed_align_csv=failed_align_csv, + remap_conseq_csv=remap_conseq_csv, + ) + + nuc_csv.seek(0) + reader = csv.DictReader(nuc_csv) + rows = list(reader) + + # Group by seed + by_seed = {} + for row in rows: + seed = row["seed"] + if seed not in by_seed: + by_seed[seed] = [] + by_seed[seed].append(row) + + # Check positions + seed1_positions = sorted( + [int(r["query.nuc.pos"]) for r in by_seed[seed1] if r["query.nuc.pos"]] + ) + seed2_positions = sorted( + [int(r["query.nuc.pos"]) for r in by_seed[seed2] if r["query.nuc.pos"]] + ) + + assert seed1_positions == [1, 2, 3, 4, 5, 6], "seed1 should have positions 1-6" + assert seed2_positions == [1, 2, 3, 4, 5, 6, 7, 8, 9], ( + "seed2 should have positions 1-9" + ) + + # Check coverages are independent + seed1_coverage = { + int(r["query.nuc.pos"]): int(r["exact_coverage"]) + for r in by_seed[seed1] + if r["query.nuc.pos"] and r["exact_coverage"] and r["exact_coverage"].strip() + } + seed2_coverage = { + int(r["query.nuc.pos"]): int(r["exact_coverage"]) + for r in by_seed[seed2] + if r["query.nuc.pos"] and r["exact_coverage"] and r["exact_coverage"].strip() + } + + # They have different position counts and different coverage values, showing they're independent + assert len(seed1_coverage) > 0, "seed1 should have coverage" + assert len(seed2_coverage) > 0, "seed2 should have coverage" + + # The key test: coverage values should be different (10 vs 20) + if seed1_coverage and seed2_coverage: + max1 = max(seed1_coverage.values()) + max2 = max(seed2_coverage.values()) + assert max1 == 10, f"seed1 should have max coverage 10, got {max1}" + assert max2 == 20, f"seed2 should have max coverage 20, got {max2}" + assert max1 != max2, f"Max coverages should differ: seed1={max1}, seed2={max2}" diff --git a/micall/tests/test_aln2counts_report.py b/micall/tests/test_aln2counts_report.py index 1a869a261..8fcc6aadd 100644 --- a/micall/tests/test_aln2counts_report.py +++ b/micall/tests/test_aln2counts_report.py @@ -420,13 +420,13 @@ def test_single_read_nucleotide_report(sequence_report): expected_text = """\ seed,region,q-cutoff,query.nuc.pos,refseq.nuc.pos,genome.pos,\ -A,C,G,T,N,del,ins,clip,v3_overlap,coverage -R1-seed,R1,15,1,1,1,9,0,0,0,0,0,0,0,0,9 -R1-seed,R1,15,2,2,2,9,0,0,0,0,0,0,0,0,9 -R1-seed,R1,15,3,3,3,9,0,0,0,0,0,0,0,0,9 -R1-seed,R1,15,4,4,4,0,0,0,9,0,0,0,0,0,9 -R1-seed,R1,15,5,5,5,0,0,0,9,0,0,0,0,0,9 -R1-seed,R1,15,6,6,6,0,0,0,9,0,0,0,0,0,9 +A,C,G,T,N,del,ins,clip,v3_overlap,coverage,exact_coverage +R1-seed,R1,15,1,1,1,9,0,0,0,0,0,0,0,0,9, +R1-seed,R1,15,2,2,2,9,0,0,0,0,0,0,0,0,9,18 +R1-seed,R1,15,3,3,3,9,0,0,0,0,0,0,0,0,9,18 +R1-seed,R1,15,4,4,4,0,0,0,9,0,0,0,0,0,9,18 +R1-seed,R1,15,5,5,5,0,0,0,9,0,0,0,0,0,9,18 +R1-seed,R1,15,6,6,6,0,0,0,9,0,0,0,0,0,9, """ report_file = StringIO() @@ -476,25 +476,25 @@ def test_multiple_prefix_nucleotide_report_overlapping_regions( expected_text = """\ seed,region,q-cutoff,query.nuc.pos,refseq.nuc.pos,genome.pos,\ -A,C,G,T,N,del,ins,clip,v3_overlap,coverage -R1-seed,R1,15,1,1,7,5,0,0,0,0,0,0,0,0,5 -R1-seed,R1,15,2,2,8,5,0,0,0,0,0,0,0,0,5 -R1-seed,R1,15,3,3,9,5,0,0,0,0,0,0,0,0,5 -R1-seed,R1,15,,4,10,0,0,0,7,0,0,0,0,0,7 -R1-seed,R1,15,,5,11,0,0,0,7,0,0,0,0,0,7 -R1-seed,R1,15,,6,12,0,0,0,7,0,0,0,0,0,7 -R1-seed,R1,15,4,7,13,2,0,0,0,0,0,0,0,0,2 -R1-seed,R1,15,5,8,14,0,0,2,0,0,0,0,0,0,2 -R1-seed,R1,15,6,9,15,0,0,2,0,0,0,0,0,0,2 -R1-seed,R1-expanded,15,1,7,7,5,0,0,0,0,0,0,0,0,5 -R1-seed,R1-expanded,15,2,8,8,5,0,0,0,0,0,0,0,0,5 -R1-seed,R1-expanded,15,3,9,9,5,0,0,0,0,0,0,0,0,5 -R1-seed,R1-expanded,15,,10,10,0,0,0,7,0,0,0,0,0,7 -R1-seed,R1-expanded,15,,11,11,0,0,0,7,0,0,0,0,0,7 -R1-seed,R1-expanded,15,,12,12,0,0,0,7,0,0,0,0,0,7 -R1-seed,R1-expanded,15,4,13,13,2,0,0,0,0,0,0,0,0,2 -R1-seed,R1-expanded,15,5,14,14,0,0,2,0,0,0,0,0,0,2 -R1-seed,R1-expanded,15,6,15,15,0,0,2,0,0,0,0,0,0,2 +A,C,G,T,N,del,ins,clip,v3_overlap,coverage,exact_coverage +R1-seed,R1,15,1,1,7,5,0,0,0,0,0,0,0,0,5, +R1-seed,R1,15,2,2,8,5,0,0,0,0,0,0,0,0,5, +R1-seed,R1,15,3,3,9,5,0,0,0,0,0,0,0,0,5, +R1-seed,R1,15,,4,10,0,0,0,7,0,0,0,0,0,7, +R1-seed,R1,15,,5,11,0,0,0,7,0,0,0,0,0,7, +R1-seed,R1,15,,6,12,0,0,0,7,0,0,0,0,0,7, +R1-seed,R1,15,4,7,13,2,0,0,0,0,0,0,0,0,2, +R1-seed,R1,15,5,8,14,0,0,2,0,0,0,0,0,0,2, +R1-seed,R1,15,6,9,15,0,0,2,0,0,0,0,0,0,2, +R1-seed,R1-expanded,15,1,7,7,5,0,0,0,0,0,0,0,0,5, +R1-seed,R1-expanded,15,2,8,8,5,0,0,0,0,0,0,0,0,5, +R1-seed,R1-expanded,15,3,9,9,5,0,0,0,0,0,0,0,0,5, +R1-seed,R1-expanded,15,,10,10,0,0,0,7,0,0,0,0,0,7, +R1-seed,R1-expanded,15,,11,11,0,0,0,7,0,0,0,0,0,7, +R1-seed,R1-expanded,15,,12,12,0,0,0,7,0,0,0,0,0,7, +R1-seed,R1-expanded,15,4,13,13,2,0,0,0,0,0,0,0,0,2, +R1-seed,R1-expanded,15,5,14,14,0,0,2,0,0,0,0,0,0,2, +R1-seed,R1-expanded,15,6,15,15,0,0,2,0,0,0,0,0,0,2, """ report = sequence_report_overlapping_regions @@ -525,16 +525,16 @@ def test_nucleotide_report_excluded_regions(sequence_report_overlapping_regions) expected_text = """\ seed,region,q-cutoff,query.nuc.pos,refseq.nuc.pos,genome.pos,\ -A,C,G,T,N,del,ins,clip,v3_overlap,coverage -R1-seed,R1,15,1,1,7,5,0,0,0,0,0,0,0,0,5 -R1-seed,R1,15,2,2,8,5,0,0,0,0,0,0,0,0,5 -R1-seed,R1,15,3,3,9,5,0,0,0,0,0,0,0,0,5 -R1-seed,R1,15,4,4,10,0,0,0,5,0,0,0,0,0,5 -R1-seed,R1,15,5,5,11,0,0,0,5,0,0,0,0,0,5 -R1-seed,R1,15,6,6,12,0,0,0,5,0,0,0,0,0,5 -R1-seed,R1,15,7,7,13,5,0,0,0,0,0,0,0,0,5 -R1-seed,R1,15,8,8,14,0,0,5,0,0,0,0,0,0,5 -R1-seed,R1,15,9,9,15,0,0,5,0,0,0,0,0,0,5 +A,C,G,T,N,del,ins,clip,v3_overlap,coverage,exact_coverage +R1-seed,R1,15,1,1,7,5,0,0,0,0,0,0,0,0,5, +R1-seed,R1,15,2,2,8,5,0,0,0,0,0,0,0,0,5, +R1-seed,R1,15,3,3,9,5,0,0,0,0,0,0,0,0,5, +R1-seed,R1,15,4,4,10,0,0,0,5,0,0,0,0,0,5, +R1-seed,R1,15,5,5,11,0,0,0,5,0,0,0,0,0,5, +R1-seed,R1,15,6,6,12,0,0,0,5,0,0,0,0,0,5, +R1-seed,R1,15,7,7,13,5,0,0,0,0,0,0,0,0,5, +R1-seed,R1,15,8,8,14,0,0,5,0,0,0,0,0,0,5, +R1-seed,R1,15,9,9,15,0,0,5,0,0,0,0,0,0,5,5 """ report = sequence_report_overlapping_regions @@ -558,16 +558,16 @@ def test_nucleotide_report_included_regions(sequence_report_overlapping_regions) expected_text = """\ seed,region,q-cutoff,query.nuc.pos,refseq.nuc.pos,genome.pos,\ -A,C,G,T,N,del,ins,clip,v3_overlap,coverage -R1-seed,R1-expanded,15,1,7,7,5,0,0,0,0,0,0,0,0,5 -R1-seed,R1-expanded,15,2,8,8,5,0,0,0,0,0,0,0,0,5 -R1-seed,R1-expanded,15,3,9,9,5,0,0,0,0,0,0,0,0,5 -R1-seed,R1-expanded,15,4,10,10,0,0,0,5,0,0,0,0,0,5 -R1-seed,R1-expanded,15,5,11,11,0,0,0,5,0,0,0,0,0,5 -R1-seed,R1-expanded,15,6,12,12,0,0,0,5,0,0,0,0,0,5 -R1-seed,R1-expanded,15,7,13,13,5,0,0,0,0,0,0,0,0,5 -R1-seed,R1-expanded,15,8,14,14,0,0,5,0,0,0,0,0,0,5 -R1-seed,R1-expanded,15,9,15,15,0,0,5,0,0,0,0,0,0,5 +A,C,G,T,N,del,ins,clip,v3_overlap,coverage,exact_coverage +R1-seed,R1-expanded,15,1,7,7,5,0,0,0,0,0,0,0,0,5, +R1-seed,R1-expanded,15,2,8,8,5,0,0,0,0,0,0,0,0,5, +R1-seed,R1-expanded,15,3,9,9,5,0,0,0,0,0,0,0,0,5, +R1-seed,R1-expanded,15,4,10,10,0,0,0,5,0,0,0,0,0,5, +R1-seed,R1-expanded,15,5,11,11,0,0,0,5,0,0,0,0,0,5, +R1-seed,R1-expanded,15,6,12,12,0,0,0,5,0,0,0,0,0,5, +R1-seed,R1-expanded,15,7,13,13,5,0,0,0,0,0,0,0,0,5, +R1-seed,R1-expanded,15,8,14,14,0,0,5,0,0,0,0,0,0,5, +R1-seed,R1-expanded,15,9,15,15,0,0,5,0,0,0,0,0,0,5,5 """ report = sequence_report_overlapping_regions @@ -716,17 +716,17 @@ def test_duplicated_sars_base_nuc(default_sequence_report): # A,C,G,T,N,...,coverage expected_section = """\ -SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,21,13198,13463,0,0,0,9,0,0,0,0,0,9 -SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,22,13199,13464,0,0,0,9,0,0,0,0,0,9 -SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,23,13200,13465,9,0,0,0,0,0,0,0,0,9 -SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,24,13201,13466,9,0,0,0,0,0,0,0,0,9 -SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,25,13202,13467,9,0,0,0,0,0,0,0,0,9 -SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,26,13203,13468,0,9,0,0,0,0,0,0,0,9 -SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,27,13204,13469,0,0,9,0,0,0,0,0,0,9 -SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,28,13205,13470,0,0,9,0,0,0,0,0,0,9 -SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,29,13206,13471,0,0,9,0,0,0,0,0,0,9 -SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,30,13207,13472,0,0,0,9,0,0,0,0,0,9 -SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,31,13208,13473,0,0,0,9,0,0,0,0,0,9""" +SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,21,13198,13463,0,0,0,9,0,0,0,0,0,9, +SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,22,13199,13464,0,0,0,9,0,0,0,0,0,9, +SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,23,13200,13465,9,0,0,0,0,0,0,0,0,9, +SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,24,13201,13466,9,0,0,0,0,0,0,0,0,9, +SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,25,13202,13467,9,0,0,0,0,0,0,0,0,9, +SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,26,13203,13468,0,9,0,0,0,0,0,0,0,9, +SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,27,13204,13469,0,0,9,0,0,0,0,0,0,9, +SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,28,13205,13470,0,0,9,0,0,0,0,0,0,9, +SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,29,13206,13471,0,0,9,0,0,0,0,0,0,9, +SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,30,13207,13472,0,0,0,9,0,0,0,0,0,9, +SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,31,13208,13473,0,0,0,9,0,0,0,0,0,9,""" report_file = StringIO() default_sequence_report.write_nuc_header(report_file) @@ -796,9 +796,9 @@ def test_duplicated_sars_base_last_region_nuc(default_sequence_report): # A,C,G,T,N,...,coverage expected_section = """\ -SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,34,13211,13476,0,9,0,0,0,0,0,0,0,9 -SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,35,13212,13477,0,0,9,0,0,0,0,0,0,9 -SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,36,13213,13478,0,0,9,0,0,0,0,0,0,9""" +SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,34,13211,13476,0,9,0,0,0,0,0,0,0,9, +SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,35,13212,13477,0,0,9,0,0,0,0,0,0,9, +SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,36,13213,13478,0,0,9,0,0,0,0,0,0,9,""" report_file = StringIO() default_sequence_report.write_nuc_header(report_file) @@ -829,10 +829,10 @@ def test_duplicated_sars_base_last_contig_nuc(default_sequence_report): # A,C,G,T,N,...,coverage expected_section = """\ -SARS-CoV-2-seed,SARS-CoV-2-nsp12,15,58,59,13500,9,0,0,0,0,0,0,0,0,9 -SARS-CoV-2-seed,SARS-CoV-2-nsp12,15,59,60,13501,0,9,0,0,0,0,0,0,0,9 -SARS-CoV-2-seed,SARS-CoV-2-nsp12,15,60,61,13502,0,9,0,0,0,0,0,0,0,9 -SARS-CoV-2-seed,SARS-CoV-2-nsp12,15,61,62,13503,0,0,9,0,0,0,0,0,0,9""" +SARS-CoV-2-seed,SARS-CoV-2-nsp12,15,58,59,13500,9,0,0,0,0,0,0,0,0,9, +SARS-CoV-2-seed,SARS-CoV-2-nsp12,15,59,60,13501,0,9,0,0,0,0,0,0,0,9, +SARS-CoV-2-seed,SARS-CoV-2-nsp12,15,60,61,13502,0,9,0,0,0,0,0,0,0,9, +SARS-CoV-2-seed,SARS-CoV-2-nsp12,15,61,62,13503,0,0,9,0,0,0,0,0,0,9,""" report_file = StringIO() default_sequence_report.write_nuc_header(report_file) @@ -931,17 +931,17 @@ def test_skipped_nucleotide_nuc(default_sequence_report): # skipped pos is 5772 in the genome, and 21 within this read expected_section = """\ -HIV1-B-FR-K03455-seed,HIV1B-vpr,15,21,212,5770,9,0,0,0,0,0,0,0,0,9 -HIV1-B-FR-K03455-seed,HIV1B-vpr,15,22,213,5771,0,0,0,9,0,0,0,0,0,9 -HIV1-B-FR-K03455-seed,HIV1B-vpr,15,23,214,5772,0,0,0,9,0,0,0,0,0,9 -HIV1-B-FR-K03455-seed,HIV1B-vpr,15,24,215,5773,0,0,0,9,0,0,0,0,0,9 -HIV1-B-FR-K03455-seed,HIV1B-vpr,15,25,216,5774,0,0,0,9,0,0,0,0,0,9 -HIV1-B-FR-K03455-seed,HIV1B-vpr,15,26,217,5775,0,9,0,0,0,0,0,0,0,9 -HIV1-B-FR-K03455-seed,HIV1B-vpr,15,27,218,5776,9,0,0,0,0,0,0,0,0,9 -HIV1-B-FR-K03455-seed,HIV1B-vpr,15,28,219,5777,0,0,9,0,0,0,0,0,0,9 -HIV1-B-FR-K03455-seed,HIV1B-vpr,15,29,220,5778,9,0,0,0,0,0,0,0,0,9 -HIV1-B-FR-K03455-seed,HIV1B-vpr,15,30,221,5779,9,0,0,0,0,0,0,0,0,9 -HIV1-B-FR-K03455-seed,HIV1B-vpr,15,31,222,5780,0,0,0,9,0,0,0,0,0,9""" +HIV1-B-FR-K03455-seed,HIV1B-vpr,15,21,212,5770,9,0,0,0,0,0,0,0,0,9, +HIV1-B-FR-K03455-seed,HIV1B-vpr,15,22,213,5771,0,0,0,9,0,0,0,0,0,9, +HIV1-B-FR-K03455-seed,HIV1B-vpr,15,23,214,5772,0,0,0,9,0,0,0,0,0,9, +HIV1-B-FR-K03455-seed,HIV1B-vpr,15,24,215,5773,0,0,0,9,0,0,0,0,0,9, +HIV1-B-FR-K03455-seed,HIV1B-vpr,15,25,216,5774,0,0,0,9,0,0,0,0,0,9, +HIV1-B-FR-K03455-seed,HIV1B-vpr,15,26,217,5775,0,9,0,0,0,0,0,0,0,9, +HIV1-B-FR-K03455-seed,HIV1B-vpr,15,27,218,5776,9,0,0,0,0,0,0,0,0,9, +HIV1-B-FR-K03455-seed,HIV1B-vpr,15,28,219,5777,0,0,9,0,0,0,0,0,0,9, +HIV1-B-FR-K03455-seed,HIV1B-vpr,15,29,220,5778,9,0,0,0,0,0,0,0,0,9, +HIV1-B-FR-K03455-seed,HIV1B-vpr,15,30,221,5779,9,0,0,0,0,0,0,0,0,9, +HIV1-B-FR-K03455-seed,HIV1B-vpr,15,31,222,5780,0,0,0,9,0,0,0,0,0,9,""" report_file = StringIO() default_sequence_report.write_nuc_header(report_file) @@ -968,18 +968,18 @@ def test_no_skipped_nucleotide_nuc(default_sequence_report): # skipped pos is 5772 in the genome expected_section = """\ -HIV1-B-FR-K03455-seed,HIV1B-vpr,15,21,212,5770,9,0,0,0,0,0,0,0,0,9 -HIV1-B-FR-K03455-seed,HIV1B-vpr,15,22,213,5771,0,0,0,9,0,0,0,0,0,9 -HIV1-B-FR-K03455-seed,HIV1B-vpr,15,,214,5772,0,0,0,0,0,9,0,0,0,9 -HIV1-B-FR-K03455-seed,HIV1B-vpr,15,23,215,5773,0,0,0,9,0,0,0,0,0,9 -HIV1-B-FR-K03455-seed,HIV1B-vpr,15,24,216,5774,0,0,0,9,0,0,0,0,0,9 -HIV1-B-FR-K03455-seed,HIV1B-vpr,15,25,217,5775,0,9,0,0,0,0,0,0,0,9 -HIV1-B-FR-K03455-seed,HIV1B-vpr,15,26,218,5776,9,0,0,0,0,0,0,0,0,9 -HIV1-B-FR-K03455-seed,HIV1B-vpr,15,27,219,5777,0,0,9,0,0,0,0,0,0,9 -HIV1-B-FR-K03455-seed,HIV1B-vpr,15,28,220,5778,9,0,0,0,0,0,0,0,0,9 -HIV1-B-FR-K03455-seed,HIV1B-vpr,15,29,221,5779,9,0,0,0,0,0,0,0,0,9 -HIV1-B-FR-K03455-seed,HIV1B-vpr,15,30,222,5780,0,0,0,9,0,0,0,0,0,9 -HIV1-B-FR-K03455-seed,HIV1B-vpr,15,31,223,5781,0,0,0,9,0,0,0,0,0,9""" +HIV1-B-FR-K03455-seed,HIV1B-vpr,15,21,212,5770,9,0,0,0,0,0,0,0,0,9, +HIV1-B-FR-K03455-seed,HIV1B-vpr,15,22,213,5771,0,0,0,9,0,0,0,0,0,9, +HIV1-B-FR-K03455-seed,HIV1B-vpr,15,,214,5772,0,0,0,0,0,9,0,0,0,9, +HIV1-B-FR-K03455-seed,HIV1B-vpr,15,23,215,5773,0,0,0,9,0,0,0,0,0,9, +HIV1-B-FR-K03455-seed,HIV1B-vpr,15,24,216,5774,0,0,0,9,0,0,0,0,0,9, +HIV1-B-FR-K03455-seed,HIV1B-vpr,15,25,217,5775,0,9,0,0,0,0,0,0,0,9, +HIV1-B-FR-K03455-seed,HIV1B-vpr,15,26,218,5776,9,0,0,0,0,0,0,0,0,9, +HIV1-B-FR-K03455-seed,HIV1B-vpr,15,27,219,5777,0,0,9,0,0,0,0,0,0,9, +HIV1-B-FR-K03455-seed,HIV1B-vpr,15,28,220,5778,9,0,0,0,0,0,0,0,0,9, +HIV1-B-FR-K03455-seed,HIV1B-vpr,15,29,221,5779,9,0,0,0,0,0,0,0,0,9, +HIV1-B-FR-K03455-seed,HIV1B-vpr,15,30,222,5780,0,0,0,9,0,0,0,0,0,9, +HIV1-B-FR-K03455-seed,HIV1B-vpr,15,31,223,5781,0,0,0,9,0,0,0,0,0,9,""" report_file = StringIO() default_sequence_report.write_nuc_header(report_file) @@ -1237,17 +1237,17 @@ def test_nuc_minority_insertions(default_sequence_report): """) expected_text_untranslated = """\ -HIV1-B-FR-K03455-seed,HIV1B-sl4,15,8,4,796,0,0,10,0,0,0,0,0,0,10 -HIV1-B-FR-K03455-seed,HIV1B-sl4,15,9,5,797,0,10,0,0,0,0,0,0,0,10 -HIV1-B-FR-K03455-seed,HIV1B-sl4,15,10,6,798,0,0,10,0,0,0,2,0,0,10 -HIV1-B-FR-K03455-seed,HIV1B-sl4,15,11,7,799,10,0,0,0,0,0,0,0,0,10 -HIV1-B-FR-K03455-seed,HIV1B-sl4,15,12,8,800,0,0,10,0,0,0,0,0,0,10""" +HIV1-B-FR-K03455-seed,HIV1B-sl4,15,8,4,796,0,0,10,0,0,0,0,0,0,10, +HIV1-B-FR-K03455-seed,HIV1B-sl4,15,9,5,797,0,10,0,0,0,0,0,0,0,10, +HIV1-B-FR-K03455-seed,HIV1B-sl4,15,10,6,798,0,0,10,0,0,0,2,0,0,10, +HIV1-B-FR-K03455-seed,HIV1B-sl4,15,11,7,799,10,0,0,0,0,0,0,0,0,10, +HIV1-B-FR-K03455-seed,HIV1B-sl4,15,12,8,800,0,0,10,0,0,0,0,0,0,10,""" expected_text_translated = """\ -HIV1-B-FR-K03455-seed,HIV1B-gag,15,8,7,796,0,0,10,0,0,0,0,0,0,10 -HIV1-B-FR-K03455-seed,HIV1B-gag,15,9,8,797,0,10,0,0,0,0,0,0,0,10 -HIV1-B-FR-K03455-seed,HIV1B-gag,15,10,9,798,0,0,10,0,0,0,2,0,0,10 -HIV1-B-FR-K03455-seed,HIV1B-gag,15,11,10,799,10,0,0,0,0,0,0,0,0,10 -HIV1-B-FR-K03455-seed,HIV1B-gag,15,12,11,800,0,0,10,0,0,0,0,0,0,10""" +HIV1-B-FR-K03455-seed,HIV1B-gag,15,8,7,796,0,0,10,0,0,0,0,0,0,10, +HIV1-B-FR-K03455-seed,HIV1B-gag,15,9,8,797,0,10,0,0,0,0,0,0,0,10, +HIV1-B-FR-K03455-seed,HIV1B-gag,15,10,9,798,0,0,10,0,0,0,2,0,0,10, +HIV1-B-FR-K03455-seed,HIV1B-gag,15,11,10,799,10,0,0,0,0,0,0,0,0,10, +HIV1-B-FR-K03455-seed,HIV1B-gag,15,12,11,800,0,0,10,0,0,0,0,0,0,10,""" nuc_file = StringIO() default_sequence_report.read_insertions(conseq_ins_csv) @@ -1276,11 +1276,11 @@ def test_nuc_small_majority_insertion(default_sequence_report): # ^^^^^^^^^ expected_text = """\ -HIV1-B-FR-K03455-seed,HIV1B-gag,15,53,52,841,10,0,0,0,0,0,0,0,0,10 -HIV1-B-FR-K03455-seed,HIV1B-gag,15,54,53,842,10,0,0,0,0,0,0,0,0,10 -HIV1-B-FR-K03455-seed,HIV1B-gag,15,55,54,843,10,0,0,0,0,0,10,0,0,10 -HIV1-B-FR-K03455-seed,HIV1B-gag,15,65,55,844,10,0,0,0,0,0,0,0,0,10 -HIV1-B-FR-K03455-seed,HIV1B-gag,15,66,56,845,0,0,0,10,0,0,0,0,0,10""" +HIV1-B-FR-K03455-seed,HIV1B-gag,15,53,52,841,10,0,0,0,0,0,0,0,0,10, +HIV1-B-FR-K03455-seed,HIV1B-gag,15,54,53,842,10,0,0,0,0,0,0,0,0,10, +HIV1-B-FR-K03455-seed,HIV1B-gag,15,55,54,843,10,0,0,0,0,0,10,0,0,10, +HIV1-B-FR-K03455-seed,HIV1B-gag,15,65,55,844,10,0,0,0,0,0,0,0,0,10, +HIV1-B-FR-K03455-seed,HIV1B-gag,15,66,56,845,0,0,0,10,0,0,0,0,0,10,""" expected_insertions = """\ seed,mixture_cutoff,region,ref_region_pos,ref_genome_pos,query_pos,insertion @@ -1314,11 +1314,11 @@ def test_nuc_large_majority_insertion(default_sequence_report): # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ expected_text = """\ -HIV1-B-FR-K03455-seed,HIV1B-gag,15,53,52,841,10,0,0,0,0,0,0,0,0,10 -HIV1-B-FR-K03455-seed,HIV1B-gag,15,54,53,842,10,0,0,0,0,0,0,0,0,10 -HIV1-B-FR-K03455-seed,HIV1B-gag,15,55,54,843,10,0,0,0,0,0,10,0,0,10 -HIV1-B-FR-K03455-seed,HIV1B-gag,15,89,55,844,10,0,0,0,0,0,0,0,0,10 -HIV1-B-FR-K03455-seed,HIV1B-gag,15,90,56,845,0,0,0,10,0,0,0,0,0,10""" +HIV1-B-FR-K03455-seed,HIV1B-gag,15,53,52,841,10,0,0,0,0,0,0,0,0,10, +HIV1-B-FR-K03455-seed,HIV1B-gag,15,54,53,842,10,0,0,0,0,0,0,0,0,10, +HIV1-B-FR-K03455-seed,HIV1B-gag,15,55,54,843,10,0,0,0,0,0,10,0,0,10, +HIV1-B-FR-K03455-seed,HIV1B-gag,15,89,55,844,10,0,0,0,0,0,0,0,0,10, +HIV1-B-FR-K03455-seed,HIV1B-gag,15,90,56,845,0,0,0,10,0,0,0,0,0,10,""" nuc_file = StringIO() default_sequence_report.read(aligned_reads) @@ -1345,11 +1345,11 @@ def test_nuc_large_majority_insertion_offset(default_sequence_report): # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ expected_text = """\ -HIV1-B-FR-K03455-seed,HIV1B-gag,15,82,52,841,10,0,0,0,0,0,0,0,0,10 -HIV1-B-FR-K03455-seed,HIV1B-gag,15,83,53,842,10,0,0,0,0,0,0,0,0,10 -HIV1-B-FR-K03455-seed,HIV1B-gag,15,84,54,843,10,0,0,0,0,0,10,0,0,10 -HIV1-B-FR-K03455-seed,HIV1B-gag,15,118,55,844,10,0,0,0,0,0,0,0,0,10 -HIV1-B-FR-K03455-seed,HIV1B-gag,15,119,56,845,0,0,0,10,0,0,0,0,0,10""" +HIV1-B-FR-K03455-seed,HIV1B-gag,15,82,52,841,10,0,0,0,0,0,0,0,0,10, +HIV1-B-FR-K03455-seed,HIV1B-gag,15,83,53,842,10,0,0,0,0,0,0,0,0,10, +HIV1-B-FR-K03455-seed,HIV1B-gag,15,84,54,843,10,0,0,0,0,0,10,0,0,10, +HIV1-B-FR-K03455-seed,HIV1B-gag,15,118,55,844,10,0,0,0,0,0,0,0,0,10, +HIV1-B-FR-K03455-seed,HIV1B-gag,15,119,56,845,0,0,0,10,0,0,0,0,0,10,""" nuc_file = StringIO() default_sequence_report.read(aligned_reads) @@ -1376,13 +1376,13 @@ def test_nuc_large_majority_insertion_frameshift(default_sequence_report): # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ expected_text = """\ -HIV1-B-FR-K03455-seed,HIV1B-gag,15,51,50,839,10,0,0,0,0,0,0,0,0,10 -HIV1-B-FR-K03455-seed,HIV1B-gag,15,52,51,840,10,0,0,0,0,0,0,0,0,10 -HIV1-B-FR-K03455-seed,HIV1B-gag,15,53,52,841,10,0,0,0,0,0,0,0,0,10 -HIV1-B-FR-K03455-seed,HIV1B-gag,15,54,53,842,10,0,0,0,0,0,10,0,0,10 -HIV1-B-FR-K03455-seed,HIV1B-gag,15,88,54,843,10,0,0,0,0,0,0,0,0,10 -HIV1-B-FR-K03455-seed,HIV1B-gag,15,89,55,844,10,0,0,0,0,0,0,0,0,10 -HIV1-B-FR-K03455-seed,HIV1B-gag,15,90,56,845,0,0,0,10,0,0,0,0,0,10""" +HIV1-B-FR-K03455-seed,HIV1B-gag,15,51,50,839,10,0,0,0,0,0,0,0,0,10, +HIV1-B-FR-K03455-seed,HIV1B-gag,15,52,51,840,10,0,0,0,0,0,0,0,0,10, +HIV1-B-FR-K03455-seed,HIV1B-gag,15,53,52,841,10,0,0,0,0,0,0,0,0,10, +HIV1-B-FR-K03455-seed,HIV1B-gag,15,54,53,842,10,0,0,0,0,0,10,0,0,10, +HIV1-B-FR-K03455-seed,HIV1B-gag,15,88,54,843,10,0,0,0,0,0,0,0,0,10, +HIV1-B-FR-K03455-seed,HIV1B-gag,15,89,55,844,10,0,0,0,0,0,0,0,0,10, +HIV1-B-FR-K03455-seed,HIV1B-gag,15,90,56,845,0,0,0,10,0,0,0,0,0,10,""" nuc_file = StringIO() default_sequence_report.read(aligned_reads) @@ -1409,13 +1409,13 @@ def test_nuc_large_insertion_not_multiple_of_three(default_sequence_report): # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ expected_text = """\ -HIV1-B-FR-K03455-seed,HIV1B-gag,15,51,50,839,10,0,0,0,0,0,0,0,0,10 -HIV1-B-FR-K03455-seed,HIV1B-gag,15,52,51,840,10,0,0,0,0,0,0,0,0,10 -HIV1-B-FR-K03455-seed,HIV1B-gag,15,53,52,841,10,0,0,0,0,0,0,0,0,10 -HIV1-B-FR-K03455-seed,HIV1B-gag,15,54,53,842,10,0,0,0,0,0,10,0,0,10 -HIV1-B-FR-K03455-seed,HIV1B-gag,15,89,54,843,10,0,0,0,0,0,0,0,0,10 -HIV1-B-FR-K03455-seed,HIV1B-gag,15,90,55,844,10,0,0,0,0,0,0,0,0,10 -HIV1-B-FR-K03455-seed,HIV1B-gag,15,91,56,845,0,0,0,10,0,0,0,0,0,10""" +HIV1-B-FR-K03455-seed,HIV1B-gag,15,51,50,839,10,0,0,0,0,0,0,0,0,10, +HIV1-B-FR-K03455-seed,HIV1B-gag,15,52,51,840,10,0,0,0,0,0,0,0,0,10, +HIV1-B-FR-K03455-seed,HIV1B-gag,15,53,52,841,10,0,0,0,0,0,0,0,0,10, +HIV1-B-FR-K03455-seed,HIV1B-gag,15,54,53,842,10,0,0,0,0,0,10,0,0,10, +HIV1-B-FR-K03455-seed,HIV1B-gag,15,89,54,843,10,0,0,0,0,0,0,0,0,10, +HIV1-B-FR-K03455-seed,HIV1B-gag,15,90,55,844,10,0,0,0,0,0,0,0,0,10, +HIV1-B-FR-K03455-seed,HIV1B-gag,15,91,56,845,0,0,0,10,0,0,0,0,0,10,""" nuc_file = StringIO() default_sequence_report.read(aligned_reads) @@ -1451,14 +1451,14 @@ def test_merge_extra_counts_insertion(projects, default_sequence_report): HIV1-B-FR-K03455-seed,HIV1B-gag,15,197,70,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10,0,0,0,0,0,0,0,10,0,0,10 HIV1-B-FR-K03455-seed,HIV1B-gag,15,212,71,0,0,0,0,0,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10""" # seed,region,q-cutoff,query.nuc.pos,refseq.nuc.pos,genome.pos, - # A,C,G,T,N,del,ins,clip,v3_overlap,coverage + # A,C,G,T,N,del,ins,clip,v3_overlap,coverage,exact_coverage expected_insertion = """\ -HIV1-B-FR-K03455-seed,HIV1B-gag,15,197,208,997,10,0,0,0,0,0,0,0,0,10 -HIV1-B-FR-K03455-seed,HIV1B-gag,15,198,209,998,0,10,0,0,0,0,0,0,0,10 -HIV1-B-FR-K03455-seed,HIV1B-gag,15,199,210,999,10,0,0,0,0,0,10,0,0,10 -HIV1-B-FR-K03455-seed,HIV1B-gag,15,212,211,1000,0,0,10,0,0,0,0,0,0,10 -HIV1-B-FR-K03455-seed,HIV1B-gag,15,213,212,1001,0,0,10,0,0,0,0,0,0,10 -HIV1-B-FR-K03455-seed,HIV1B-gag,15,214,213,1002,10,0,0,0,0,0,0,0,0,10""" +HIV1-B-FR-K03455-seed,HIV1B-gag,15,197,208,997,10,0,0,0,0,0,0,0,0,10, +HIV1-B-FR-K03455-seed,HIV1B-gag,15,198,209,998,0,10,0,0,0,0,0,0,0,10, +HIV1-B-FR-K03455-seed,HIV1B-gag,15,199,210,999,10,0,0,0,0,0,10,0,0,10, +HIV1-B-FR-K03455-seed,HIV1B-gag,15,212,211,1000,0,0,10,0,0,0,0,0,0,10, +HIV1-B-FR-K03455-seed,HIV1B-gag,15,213,212,1001,0,0,10,0,0,0,0,0,0,10, +HIV1-B-FR-K03455-seed,HIV1B-gag,15,214,213,1002,10,0,0,0,0,0,0,0,0,10,""" nuc_csv = StringIO() amino_csv = StringIO() @@ -1491,13 +1491,13 @@ def test_merge_extra_counts_insertion_vpr(projects, default_sequence_report): HIV1-B-FR-K03455-seed,15,0,10,0,{read_seq} """) # seed,region,q-cutoff,query.nuc.pos,refseq.nuc.pos,genome.pos,\ -# A,C,G,T,N,del,ins,clip,v3_overlap,coverage +# A,C,G,T,N,del,ins,clip,v3_overlap,coverage,exact_coverage expected_insertion = """\ -HIV1-B-FR-K03455-seed,HIV1B-vpr,15,239,239,5797,10,0,0,0,0,0,0,0,0,10 -HIV1-B-FR-K03455-seed,HIV1B-vpr,15,240,240,5798,0,0,10,0,0,0,0,0,0,10 -HIV1-B-FR-K03455-seed,HIV1B-vpr,15,241,241,5799,10,0,0,0,0,0,10,0,0,10 -HIV1-B-FR-K03455-seed,HIV1B-vpr,15,251,242,5800,10,0,0,0,0,0,0,0,0,10 -HIV1-B-FR-K03455-seed,HIV1B-vpr,15,252,243,5801,0,0,0,10,0,0,0,0,0,10""" +HIV1-B-FR-K03455-seed,HIV1B-vpr,15,239,239,5797,10,0,0,0,0,0,0,0,0,10, +HIV1-B-FR-K03455-seed,HIV1B-vpr,15,240,240,5798,0,0,10,0,0,0,0,0,0,10, +HIV1-B-FR-K03455-seed,HIV1B-vpr,15,241,241,5799,10,0,0,0,0,0,10,0,0,10, +HIV1-B-FR-K03455-seed,HIV1B-vpr,15,251,242,5800,10,0,0,0,0,0,0,0,0,10, +HIV1-B-FR-K03455-seed,HIV1B-vpr,15,252,243,5801,0,0,0,10,0,0,0,0,0,10,""" expected_amino_insertion = """\ HIV1-B-FR-K03455-seed,HIV1B-vpr,15,236,79,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10,0,0,0,0,0,0,0,0,0,0,0,10 HIV1-B-FR-K03455-seed,HIV1B-vpr,15,239,80,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10,0,0,0,0,0,0,0,0,0,10,0,0,10 @@ -1529,13 +1529,13 @@ def test_merge_extra_counts_insertion_vpr_noskip(projects, default_sequence_repo HIV1-B-FR-K03455-seed,15,0,10,0,{read_seq} """) # seed,region,q-cutoff,query.nuc.pos,refseq.nuc.pos,genome.pos,\ -# A,C,G,T,N,del,ins,clip,v3_overlap,coverage +# A,C,G,T,N,del,ins,clip,v3_overlap,coverage,exact_coverage expected_insertion = """\ -HIV1-B-FR-K03455-seed,HIV1B-vpr,15,238,239,5797,10,0,0,0,0,0,0,0,0,10 -HIV1-B-FR-K03455-seed,HIV1B-vpr,15,239,240,5798,0,0,10,0,0,0,0,0,0,10 -HIV1-B-FR-K03455-seed,HIV1B-vpr,15,240,241,5799,10,0,0,0,0,0,10,0,0,10 -HIV1-B-FR-K03455-seed,HIV1B-vpr,15,250,242,5800,10,0,0,0,0,0,0,0,0,10 -HIV1-B-FR-K03455-seed,HIV1B-vpr,15,251,243,5801,0,0,0,10,0,0,0,0,0,10""" +HIV1-B-FR-K03455-seed,HIV1B-vpr,15,238,239,5797,10,0,0,0,0,0,0,0,0,10, +HIV1-B-FR-K03455-seed,HIV1B-vpr,15,239,240,5798,0,0,10,0,0,0,0,0,0,10, +HIV1-B-FR-K03455-seed,HIV1B-vpr,15,240,241,5799,10,0,0,0,0,0,10,0,0,10, +HIV1-B-FR-K03455-seed,HIV1B-vpr,15,250,242,5800,10,0,0,0,0,0,0,0,0,10, +HIV1-B-FR-K03455-seed,HIV1B-vpr,15,251,243,5801,0,0,0,10,0,0,0,0,0,10,""" expected_amino_insertion = """\ HIV1-B-FR-K03455-seed,HIV1B-vpr,15,235,79,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10,0,0,0,0,0,0,0,0,0,0,0,10 HIV1-B-FR-K03455-seed,HIV1B-vpr,15,238,80,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10,0,0,0,0,0,0,0,0,0,10,0,0,10 @@ -1567,11 +1567,11 @@ def test_merge_extra_counts_insertion_nsp12(projects, default_sequence_report): SARS-CoV-2-seed,15,0,10,0,{read_seq} """) # seed,region,q-cutoff,query.nuc.pos,refseq.nuc.pos,genome.pos,\ -# A,C,G,T,N,del,ins,clip,v3_overlap,coverage +# A,C,G,T,N,del,ins,clip,v3_overlap,coverage,exact_coverage expected_insertion = """\ -SARS-CoV-2-seed,SARS-CoV-2-nsp12,15,157,157,13598,0,0,10,0,0,0,0,0,0,10 -SARS-CoV-2-seed,SARS-CoV-2-nsp12,15,158,158,13599,0,0,0,10,0,0,10,0,0,10 -SARS-CoV-2-seed,SARS-CoV-2-nsp12,15,168,159,13600,10,0,0,0,0,0,0,0,0,10""" +SARS-CoV-2-seed,SARS-CoV-2-nsp12,15,157,157,13598,0,0,10,0,0,0,0,0,0,10, +SARS-CoV-2-seed,SARS-CoV-2-nsp12,15,158,158,13599,0,0,0,10,0,0,10,0,0,10, +SARS-CoV-2-seed,SARS-CoV-2-nsp12,15,168,159,13600,10,0,0,0,0,0,0,0,0,10,""" expected_amino_insertion = """\ SARS-CoV-2-seed,SARS-CoV-2-nsp12,15,153,52,0,0,0,0,0,0,0,0,0,0,0,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10 SARS-CoV-2-seed,SARS-CoV-2-nsp12,15,156,53,0,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10,0,0,10 @@ -2077,18 +2077,18 @@ def test_nucleotide_coordinates(default_sequence_report): expected_report = """\ seed,region,q-cutoff,query.nuc.pos,refseq.nuc.pos,genome.pos,\ -A,C,G,T,N,del,ins,clip,v3_overlap,coverage -SARS-CoV-2-seed,SARS-CoV-2-TRS-B-8,15,1,1,28260,9,0,0,0,0,0,0,0,0,9 -SARS-CoV-2-seed,SARS-CoV-2-TRS-B-8,15,2,2,28261,0,9,0,0,0,0,0,0,0,9 -SARS-CoV-2-seed,SARS-CoV-2-TRS-B-8,15,3,3,28262,0,0,9,0,0,0,0,0,0,9 -SARS-CoV-2-seed,SARS-CoV-2-TRS-B-8,15,4,4,28263,9,0,0,0,0,0,0,0,0,9 -SARS-CoV-2-seed,SARS-CoV-2-TRS-B-8,15,5,5,28264,9,0,0,0,0,0,0,0,0,9 -SARS-CoV-2-seed,SARS-CoV-2-TRS-B-8,15,6,6,28265,0,9,0,0,0,0,0,0,0,9 -SARS-CoV-2-seed,SARS-CoV-2-TRS-B-8,15,7,7,28266,9,0,0,0,0,0,0,0,0,9 -SARS-CoV-2-seed,SARS-CoV-2-TRS-B-8,15,8,8,28267,9,0,0,0,0,0,0,0,0,9 -SARS-CoV-2-seed,SARS-CoV-2-TRS-B-8,15,9,9,28268,9,0,0,0,0,0,0,0,0,9 -SARS-CoV-2-seed,SARS-CoV-2-TRS-B-8,15,10,10,28269,0,9,0,0,0,0,0,0,0,9 -SARS-CoV-2-seed,SARS-CoV-2-TRS-B-8,15,11,11,28270,0,0,0,9,0,0,0,0,0,9 +A,C,G,T,N,del,ins,clip,v3_overlap,coverage,exact_coverage +SARS-CoV-2-seed,SARS-CoV-2-TRS-B-8,15,1,1,28260,9,0,0,0,0,0,0,0,0,9, +SARS-CoV-2-seed,SARS-CoV-2-TRS-B-8,15,2,2,28261,0,9,0,0,0,0,0,0,0,9, +SARS-CoV-2-seed,SARS-CoV-2-TRS-B-8,15,3,3,28262,0,0,9,0,0,0,0,0,0,9, +SARS-CoV-2-seed,SARS-CoV-2-TRS-B-8,15,4,4,28263,9,0,0,0,0,0,0,0,0,9, +SARS-CoV-2-seed,SARS-CoV-2-TRS-B-8,15,5,5,28264,9,0,0,0,0,0,0,0,0,9, +SARS-CoV-2-seed,SARS-CoV-2-TRS-B-8,15,6,6,28265,0,9,0,0,0,0,0,0,0,9, +SARS-CoV-2-seed,SARS-CoV-2-TRS-B-8,15,7,7,28266,9,0,0,0,0,0,0,0,0,9, +SARS-CoV-2-seed,SARS-CoV-2-TRS-B-8,15,8,8,28267,9,0,0,0,0,0,0,0,0,9, +SARS-CoV-2-seed,SARS-CoV-2-TRS-B-8,15,9,9,28268,9,0,0,0,0,0,0,0,0,9, +SARS-CoV-2-seed,SARS-CoV-2-TRS-B-8,15,10,10,28269,0,9,0,0,0,0,0,0,0,9, +SARS-CoV-2-seed,SARS-CoV-2-TRS-B-8,15,11,11,28270,0,0,0,9,0,0,0,0,0,9, """ report_file = StringIO() @@ -2117,26 +2117,26 @@ def test_minimap_overlap(default_sequence_report, projects): # A,C,G,T expected_text = """\ -HIV1-B-FR-K03455-seed,INT,15,51,262,4491,0,0,9,0,0,0,0,0,0,9 -HIV1-B-FR-K03455-seed,INT,15,52,263,4492,0,0,0,9,0,0,0,0,0,9 -HIV1-B-FR-K03455-seed,INT,15,53,264,4493,0,0,0,9,0,0,0,0,0,9 -HIV1-B-FR-K03455-seed,INT,15,54,265,4494,9,0,0,0,0,0,0,0,0,9 -HIV1-B-FR-K03455-seed,INT,15,55,266,4495,0,0,0,9,0,0,0,0,0,9 -HIV1-B-FR-K03455-seed,INT,15,56,267,4496,0,0,0,9,0,0,0,0,0,9 -HIV1-B-FR-K03455-seed,INT,15,57,268,4497,0,9,0,0,0,0,0,0,0,9 -HIV1-B-FR-K03455-seed,INT,15,58,269,4498,0,9,0,0,0,0,0,0,0,9 -HIV1-B-FR-K03455-seed,INT,15,59,270,4499,9,0,0,0,0,0,0,0,0,9 -HIV1-B-FR-K03455-seed,INT,15,60,271,4500,0,0,9,0,0,0,0,0,0,9 -HIV1-B-FR-K03455-seed,RT,15,61,452,3001,9,0,0,0,0,0,0,0,0,9 -HIV1-B-FR-K03455-seed,RT,15,62,453,3002,0,0,9,0,0,0,0,0,0,9 -HIV1-B-FR-K03455-seed,RT,15,63,454,3003,0,0,9,0,0,0,0,0,0,9 -HIV1-B-FR-K03455-seed,RT,15,64,455,3004,0,0,9,0,0,0,0,0,0,9 -HIV1-B-FR-K03455-seed,RT,15,65,456,3005,9,0,0,0,0,0,0,0,0,9 -HIV1-B-FR-K03455-seed,RT,15,66,457,3006,0,0,0,9,0,0,0,0,0,9 -HIV1-B-FR-K03455-seed,RT,15,67,458,3007,0,0,9,0,0,0,0,0,0,9 -HIV1-B-FR-K03455-seed,RT,15,68,459,3008,0,0,9,0,0,0,0,0,0,9 -HIV1-B-FR-K03455-seed,RT,15,69,460,3009,9,0,0,0,0,0,0,0,0,9 -HIV1-B-FR-K03455-seed,RT,15,70,461,3010,9,0,0,0,0,0,0,0,0,9""" +HIV1-B-FR-K03455-seed,INT,15,51,262,4491,0,0,9,0,0,0,0,0,0,9, +HIV1-B-FR-K03455-seed,INT,15,52,263,4492,0,0,0,9,0,0,0,0,0,9, +HIV1-B-FR-K03455-seed,INT,15,53,264,4493,0,0,0,9,0,0,0,0,0,9, +HIV1-B-FR-K03455-seed,INT,15,54,265,4494,9,0,0,0,0,0,0,0,0,9, +HIV1-B-FR-K03455-seed,INT,15,55,266,4495,0,0,0,9,0,0,0,0,0,9, +HIV1-B-FR-K03455-seed,INT,15,56,267,4496,0,0,0,9,0,0,0,0,0,9, +HIV1-B-FR-K03455-seed,INT,15,57,268,4497,0,9,0,0,0,0,0,0,0,9, +HIV1-B-FR-K03455-seed,INT,15,58,269,4498,0,9,0,0,0,0,0,0,0,9, +HIV1-B-FR-K03455-seed,INT,15,59,270,4499,9,0,0,0,0,0,0,0,0,9, +HIV1-B-FR-K03455-seed,INT,15,60,271,4500,0,0,9,0,0,0,0,0,0,9, +HIV1-B-FR-K03455-seed,RT,15,61,452,3001,9,0,0,0,0,0,0,0,0,9, +HIV1-B-FR-K03455-seed,RT,15,62,453,3002,0,0,9,0,0,0,0,0,0,9, +HIV1-B-FR-K03455-seed,RT,15,63,454,3003,0,0,9,0,0,0,0,0,0,9, +HIV1-B-FR-K03455-seed,RT,15,64,455,3004,0,0,9,0,0,0,0,0,0,9, +HIV1-B-FR-K03455-seed,RT,15,65,456,3005,9,0,0,0,0,0,0,0,0,9, +HIV1-B-FR-K03455-seed,RT,15,66,457,3006,0,0,0,9,0,0,0,0,0,9, +HIV1-B-FR-K03455-seed,RT,15,67,458,3007,0,0,9,0,0,0,0,0,0,9, +HIV1-B-FR-K03455-seed,RT,15,68,459,3008,0,0,9,0,0,0,0,0,0,9, +HIV1-B-FR-K03455-seed,RT,15,69,460,3009,9,0,0,0,0,0,0,0,0,9, +HIV1-B-FR-K03455-seed,RT,15,70,461,3010,9,0,0,0,0,0,0,0,0,9,""" report_file = StringIO() default_sequence_report.write_nuc_header(report_file) default_sequence_report.read(aligned_reads) @@ -2195,11 +2195,11 @@ def test_minimap_gap(default_sequence_report, projects): """) # A,C,G,T expected_text = """\ -HIV1-B-FR-K03455-seed,HIV1B-gag,15,493,493,1282,9,0,0,0,0,0,0,0,0,9 -HIV1-B-FR-K03455-seed,HIV1B-gag,15,,494,1283,0,0,0,0,0,9,0,0,0,9 +HIV1-B-FR-K03455-seed,HIV1B-gag,15,493,493,1282,9,0,0,0,0,0,0,0,0,9, +HIV1-B-FR-K03455-seed,HIV1B-gag,15,,494,1283,0,0,0,0,0,9,0,0,0,9, ... -HIV1-B-FR-K03455-seed,HIV1B-gag,15,,1072,1861,0,0,0,0,0,9,0,0,0,9 -HIV1-B-FR-K03455-seed,HIV1B-gag,15,494,1073,1862,9,0,0,0,0,0,0,0,0,9""" +HIV1-B-FR-K03455-seed,HIV1B-gag,15,,1072,1861,0,0,0,0,0,9,0,0,0,9, +HIV1-B-FR-K03455-seed,HIV1B-gag,15,494,1073,1862,9,0,0,0,0,0,0,0,0,9,""" report_file = StringIO() default_sequence_report.write_nuc_header(report_file) default_sequence_report.read(aligned_reads) @@ -2230,11 +2230,11 @@ def test_minimap_gap_around_start(default_sequence_report, projects): HIV1-B-FR-K03455-seed,15,0,9,0,{read_seq} """) expected_text = """\ -HIV1-B-FR-K03455-seed,GP41,15,,1037,8794,0,0,0,0,0,9,0,0,0,9 -HIV1-B-FR-K03455-seed,GP41,15,,1038,8795,0,0,0,0,0,9,0,0,0,9 -HIV1-B-FR-K03455-seed,HIV1B-8796,15,,1,8796,0,0,0,0,0,9,0,0,0,9 -HIV1-B-FR-K03455-seed,HIV1B-nef,15,,1,8797,0,0,0,0,0,9,0,0,0,9 -HIV1-B-FR-K03455-seed,HIV1B-nef,15,,2,8798,0,0,0,0,0,9,0,0,0,9""" +HIV1-B-FR-K03455-seed,GP41,15,,1037,8794,0,0,0,0,0,9,0,0,0,9, +HIV1-B-FR-K03455-seed,GP41,15,,1038,8795,0,0,0,0,0,9,0,0,0,9, +HIV1-B-FR-K03455-seed,HIV1B-8796,15,,1,8796,0,0,0,0,0,9,0,0,0,9, +HIV1-B-FR-K03455-seed,HIV1B-nef,15,,1,8797,0,0,0,0,0,9,0,0,0,9, +HIV1-B-FR-K03455-seed,HIV1B-nef,15,,2,8798,0,0,0,0,0,9,0,0,0,9,""" report_file = StringIO() default_sequence_report.write_nuc_header(report_file) default_sequence_report.read(aligned_reads) @@ -2266,9 +2266,9 @@ def test_minimap_reading_frame(default_sequence_report, projects): """) # A,C,G,T expected_text = """\ -HIV1-B-FR-K03455-seed,HIV1B-gag,15,190,1503,2292,9,0,0,0,0,0,0,0,0,9 -HIV1-B-FR-K03455-seed,PR,15,151,1,2253,0,9,0,0,0,0,0,0,0,9 -HIV1-B-FR-K03455-seed,PR,15,152,2,2254,0,9,0,0,0,0,0,0,0,9""" +HIV1-B-FR-K03455-seed,HIV1B-gag,15,190,1503,2292,9,0,0,0,0,0,0,0,0,9, +HIV1-B-FR-K03455-seed,PR,15,151,1,2253,0,9,0,0,0,0,0,0,0,9, +HIV1-B-FR-K03455-seed,PR,15,152,2,2254,0,9,0,0,0,0,0,0,0,9,""" report_file = StringIO() default_sequence_report.write_nuc_header(report_file) default_sequence_report.read(aligned_reads)