From dc0ff467ff128abc40f6e1b3090ac58f4c2d96d9 Mon Sep 17 00:00:00 2001 From: Kez Cleal <42997789+kcleal@users.noreply.github.com> Date: Fri, 31 Jan 2025 10:59:43 +0000 Subject: [PATCH 01/14] Dysgu_dev <-- master (#121) * Dysgu dev (#115) * dysgu_dev <- master (#108) * correct usage of update_filter_value in filter_normals It seems like at some point the usage of update_filter_value changed, because in several spots, it is missing the sample_name argument. This breaks `dysgu filter --keep-all`. * Update main.yml * Update main.yml * Update requirements.txt * updated build process to pyproject.toml (#103) * v1.6.6 updated README * Update main.yml * Added pyproject.toml * Updated pyproject.toml and setup.py * Update pyproject.toml --------- Co-authored-by: Dr. K. D. Murray <1560490+kdm9@users.noreply.github.com> * Better recall+precision for long-reads. Improved merging pipeline. Faster runtime. Code refactoring. WIP # [skip ci] * Fixed issues with overriding presets. --mode option updated # [skip ci] * Fixed issues with overriding presets. --mode option updated to support r10 and revio, new presets available. --no-gt flat removed # [skip ci] * Fixed CLI issues for tests # [skip ci] * Fixed API issue * v1.7.0 * added deprecated warning for mode * Added filter for high divergence reads (long-reads only) # [skip ci] * Better read merging for long reads. New clustering methods for long reads. Improved runtime. * Update main.yml * Update main.yml * Update osx-deps * Update main.yml macos11 * Update main.yml * Update main.yml * Update main.yml * Update osx-deps * Update osx-deps * Update osx-deps * Update osx-deps * Update main.yml * Update osx-deps * Update main.yml * Updated workflow * Updated workflow * Updated workflow * Update main.yml * Updated workflow --------- Co-authored-by: Dr. K. D. Murray <1560490+kdm9@users.noreply.github.com> * Fixes compile error for clang * Fixed merge None type error. Added wb3 as default compression level for long-reads (run-pipeline) (#116) * Updated workflow * Updated workflow * Fixed compiler warning --------- Co-authored-by: Dr. K. D. Murray <1560490+kdm9@users.noreply.github.com> --- .github/workflows/main.yml | 2 -- dysgu/call_component.pyx | 18 ++++++++---------- dysgu/graph.pyx | 4 ++-- dysgu/map_set_utils.pxd | 4 ++-- dysgu/map_set_utils.pyx | 12 ++++++------ setup.py | 1 - 6 files changed, 18 insertions(+), 23 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 33b9b6f..bf6a5fa 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -11,8 +11,6 @@ jobs: strategy: matrix: os: [ubuntu-20.04, macOS-14] - #os: [ubuntu-latest, macOS-11] - steps: - uses: actions/checkout@v4 - name: Set correct paths for Homebrew on macOS diff --git a/dysgu/call_component.pyx b/dysgu/call_component.pyx index 94a608d..71b3a84 100644 --- a/dysgu/call_component.pyx +++ b/dysgu/call_component.pyx @@ -105,7 +105,7 @@ cdef base_quals_aligned_clipped(AlignedSegment a): cdef float clipped_base_quals = 0 cdef int left_clip = 0 cdef int right_clip = 0 - clip_sizes(a, left_clip, right_clip) + clip_sizes(a, &left_clip, &right_clip) clipped_bases = left_clip + right_clip cdef const unsigned char[:] quals = a.query_qualities cdef int i @@ -154,7 +154,7 @@ cdef count_attributes2(reads1, reads2, spanning, float insert_ppf, generic_ins, er.n_unmapped_mates += 1 left_clip = 0 right_clip = 0 - clip_sizes_hard(a, left_clip, right_clip) + clip_sizes_hard(a, &left_clip, &right_clip) if left_clip > 0 and right_clip > 0: er.double_clips += 1 has_sa = a.has_tag("SA") @@ -206,7 +206,7 @@ cdef count_attributes2(reads1, reads2, spanning, float insert_ppf, generic_ins, left_clip = 0 right_clip = 0 - clip_sizes(a, left_clip, right_clip) + clip_sizes(a, &left_clip, &right_clip) if left_clip or right_clip: er.sc += 1 if a.flag & 1: # paired read @@ -222,7 +222,7 @@ cdef count_attributes2(reads1, reads2, spanning, float insert_ppf, generic_ins, er.NP += 1 left_clip = 0 right_clip = 0 - clip_sizes_hard(a, left_clip, right_clip) + clip_sizes_hard(a, &left_clip, &right_clip) if left_clip > 0 and right_clip > 0: er.double_clips += 1 if a.has_tag("SA"): @@ -507,7 +507,7 @@ cdef make_generic_insertion_item(aln, int insert_size, int insert_std): v_item.svtype = "BND" left_clip = 0 right_clip = 0 - clip_sizes(aln, left_clip, right_clip) + clip_sizes(aln, &left_clip, &right_clip) clip_s = max(left_clip, right_clip) rand_insert_pos = 100 if not clip_s else clip_s v_item.inferred_sv_len = 0 if rand_insert_pos < 0 else rand_insert_pos @@ -945,6 +945,7 @@ cdef linear_scan_clustering(spanning, bint hp_tag): def process_spanning(bint paired_end, spanning_alignments, float divergence, length_extend, informative, generic_insertions, float insert_ppf, bint to_assemble, bint hp_tag): + # echo("PROCESS SPANNING") cdef int min_found_support = 0 cdef str svtype, jointype @@ -996,6 +997,7 @@ def process_spanning(bint paired_end, spanning_alignments, float divergence, len # 1.7.0 # svlen = int(np.median([sp.cigar_item.len for sp in spanning_alignments])) + posA = spanning_alignments[best_index].pos posB = spanning_alignments[best_index].end er.preciseA = True @@ -2020,8 +2022,7 @@ cdef list multi(data, bam, int insert_size, int insert_stdev, float insert_ppf, for (u, v), d in data.s_between: #.items(): rd_u = get_reads(bam, d[0], data.reads, n2n, add_to_buffer, info) # [(Nodeinfo, alignment)..] rd_v = get_reads(bam, d[1], data.reads, n2n, add_to_buffer, info) - # echo("rd_u", [rr[1].qname for rr in rd_u]) - # echo("rd_v", [rr[1].qname for rr in rd_v]) + total_reads = len(rd_u) + len(rd_v) buffered_reads += total_reads if add_to_buffer and buffered_reads > 50000: @@ -2106,9 +2107,6 @@ cpdef list call_from_block_model(bam, data, clip_length, insert_size, insert_std n_parts = len(data.parts) if data.parts else 0 events = [] info = data.info - # echo(data.parts) - # echo(data.s_between) - # echo(data.s_within) if data.reads is None: data.reads = {} # next deal with info - need to filter these into the partitions, then deal with them in single / one_edge diff --git a/dysgu/graph.pyx b/dysgu/graph.pyx index 78a0e6e..cd0c984 100644 --- a/dysgu/graph.pyx +++ b/dysgu/graph.pyx @@ -252,7 +252,7 @@ cdef class ClipScoper: unordered_set[int]& clustered_nodes): cdef int clip_left = 0 cdef int clip_right = 0 - clip_sizes(r, clip_left, clip_right) + clip_sizes(r, &clip_left, &clip_right) if chrom != self.current_chrom: self.scope_left.clear() self.scope_right.clear() @@ -1368,7 +1368,7 @@ cpdef tuple construct_graph(genome_scanner, infile, int max_dist, int clustering pos2 = -1 left_clip_size = 0 right_clip_size = 0 - clip_sizes_hard(r, left_clip_size, right_clip_size) # soft and hard-clips + clip_sizes_hard(r, &left_clip_size, &right_clip_size) # soft and hard-clips if r.flag & 8 and clipped: # paired by inference # skip if both ends are clipped, usually means its a chunk of badly mapped sequence # if not (left_clip_size and right_clip_size) and ((paired_end and good_quality_clip(r, 20)) or (not paired_end and) ): diff --git a/dysgu/map_set_utils.pxd b/dysgu/map_set_utils.pxd index 2dcb72b..866a729 100644 --- a/dysgu/map_set_utils.pxd +++ b/dysgu/map_set_utils.pxd @@ -266,9 +266,9 @@ cdef extern from "" namespace "std" nogil: cdef int cigar_exists(AlignedSegment r) -cdef void clip_sizes(AlignedSegment r, int& left, int& right) +cdef void clip_sizes(AlignedSegment r, int* left, int* right) -cdef void clip_sizes_hard(AlignedSegment r, int& left, int& right) +cdef void clip_sizes_hard(AlignedSegment r, int* left, int* right) cdef int cigar_clip(AlignedSegment r, int clip_length) diff --git a/dysgu/map_set_utils.pyx b/dysgu/map_set_utils.pyx index 4b69999..e6ab2e0 100644 --- a/dysgu/map_set_utils.pyx +++ b/dysgu/map_set_utils.pyx @@ -223,7 +223,7 @@ cdef int cigar_exists(AlignedSegment r): return 0 -cdef void clip_sizes(AlignedSegment r, int& left, int& right): +cdef void clip_sizes(AlignedSegment r, int* left, int* right): cdef uint32_t cigar_value cdef uint32_t cigar_l cdef uint32_t *cigar_p @@ -235,14 +235,14 @@ cdef void clip_sizes(AlignedSegment r, int& left, int& right): cigar_value = cigar_p[0] opp = cigar_value & 15 if opp == 4: - left = cigar_value >> 4 + left[0] = cigar_value >> 4 cigar_value = cigar_p[cigar_l - 1] opp = cigar_value & 15 if opp == 4: - right = cigar_value >> 4 + right[0] = cigar_value >> 4 -cdef void clip_sizes_hard(AlignedSegment r, int& left, int& right): +cdef void clip_sizes_hard(AlignedSegment r, int* left, int* right): cdef uint32_t cigar_value cdef uint32_t cigar_l cdef uint32_t *cigar_p @@ -255,11 +255,11 @@ cdef void clip_sizes_hard(AlignedSegment r, int& left, int& right): cigar_value = cigar_p[0] opp = cigar_value & 15 if opp == 4 or opp == 5: - left = cigar_value >> 4 + left[0] = cigar_value >> 4 cigar_value = cigar_p[cigar_l - 1] opp = cigar_value & 15 if opp == 4 or opp == 5: - right = cigar_value >> 4 + right[0] = cigar_value >> 4 cdef int cigar_clip(AlignedSegment r, int clip_length): diff --git a/setup.py b/setup.py index ee20dde..4fdfa27 100644 --- a/setup.py +++ b/setup.py @@ -46,7 +46,6 @@ def get_extra_args(): extras = get_extra_args() + ["-Wno-sign-compare", "-Wno-unused-function", "-Wno-unused-result", '-Wno-ignored-qualifiers', "-Wno-deprecated-declarations", "-fpermissive", - "-Wno-unreachable-code-fallthrough", ] def get_extension_modules(): From 5702658354c2554b6017be493561592799336ccc Mon Sep 17 00:00:00 2001 From: kcleal Date: Fri, 31 Jan 2025 11:33:22 +0000 Subject: [PATCH 02/14] Fixed compile errors. Add logging for HP tag usage --- dysgu/call_component.pyx | 7 ++++--- dysgu/cluster.pyx | 3 +++ dysgu/extra_metrics.pyx | 2 +- dysgu/map_set_utils.pyx | 2 +- 4 files changed, 9 insertions(+), 5 deletions(-) diff --git a/dysgu/call_component.pyx b/dysgu/call_component.pyx index 71b3a84..39a850c 100644 --- a/dysgu/call_component.pyx +++ b/dysgu/call_component.pyx @@ -1459,8 +1459,8 @@ cdef tuple mask_soft_clips(AlignedSegment a, AlignedSegment b): cdef int left_clipB = 0 cdef int right_clipB = 0 - clip_sizes_hard(a, left_clipA, right_clipA) - clip_sizes_hard(b, left_clipB, right_clipB) + clip_sizes_hard(a, &left_clipA, &right_clipA) + clip_sizes_hard(b, &left_clipB, &right_clipB) cdef int a_template_start = 0 cdef int b_template_start = 0 @@ -1643,7 +1643,8 @@ cdef call_from_reads(u_reads_info, v_reads_info, int insert_size, int insert_std cdef AlignedSegment a, b cdef uint32_t cigar_l_a, cigar_l_b - cdef uint32_t *cigar_p_a, *cigar_p_b + cdef uint32_t *cigar_p_a + cdef uint32_t *cigar_p_b for qname in [k for k in grp_u if k in grp_v]: # Qname found on both sides diff --git a/dysgu/cluster.pyx b/dysgu/cluster.pyx index ad3aea4..1b21d90 100644 --- a/dysgu/cluster.pyx +++ b/dysgu/cluster.pyx @@ -249,6 +249,9 @@ def pipe1(args, infile, kind, regions, ibam, ref_genome, sample_name, bam_iter=N sites_index = sites_adder.sites_index logging.info("Graph constructed") + if not args['no_phase'] and hp_tag_found: + logging.info("Using HP tag") + auto_support = False if args["min_support"] != "auto": args["min_support"] = int(args["min_support"]) diff --git a/dysgu/extra_metrics.pyx b/dysgu/extra_metrics.pyx index 80d7160..aabf72e 100644 --- a/dysgu/extra_metrics.pyx +++ b/dysgu/extra_metrics.pyx @@ -95,7 +95,7 @@ cdef float soft_clip_qual_corr(reads): quals = r.query_qualities left_clip = 0 right_clip = 0 - clip_sizes(r, left_clip, right_clip) + clip_sizes(r, &left_clip, &right_clip) if quals is None: continue diff --git a/dysgu/map_set_utils.pyx b/dysgu/map_set_utils.pyx index e6ab2e0..c882661 100644 --- a/dysgu/map_set_utils.pyx +++ b/dysgu/map_set_utils.pyx @@ -269,7 +269,7 @@ cdef int cigar_clip(AlignedSegment r, int clip_length): return 0 cdef int left = 0 cdef int right = 0 - clip_sizes(r, left, right) + clip_sizes(r, &left, &right) if left >= clip_length or right >= clip_length: return 1 return 0 From 634102f3bb75310cdc63a62714aa8f97e3161413 Mon Sep 17 00:00:00 2001 From: kcleal Date: Fri, 31 Jan 2025 11:50:35 +0000 Subject: [PATCH 03/14] Added superintervals dependency --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 390e8d1..91c0451 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,6 +28,7 @@ dependencies = [ "networkx >= 2.4", "scikit-learn >= 0.22", "sortedcontainers", + "superintervals", "lightgbm" ] From c4e1484a985aa50d246e8037b23c3d7faf2ff4c8 Mon Sep 17 00:00:00 2001 From: kcleal Date: Mon, 24 Feb 2025 13:05:56 +0000 Subject: [PATCH 04/14] Added PSET phaseset HP haplotype tag and AF allele frequency tags to output --- dysgu/call_component.pyx | 57 +++++--- dysgu/cluster.pyx | 2 - dysgu/coverage.pyx | 3 + dysgu/io_funcs.pyx | 47 +++---- dysgu/map_set_utils.pxd | 3 +- dysgu/map_set_utils.pyx | 10 +- dysgu/post_call.py | 273 ++++++++++++++++++++++----------------- dysgu/view.py | 3 + 8 files changed, 223 insertions(+), 175 deletions(-) diff --git a/dysgu/call_component.pyx b/dysgu/call_component.pyx index 39a850c..0d5ebcf 100644 --- a/dysgu/call_component.pyx +++ b/dysgu/call_component.pyx @@ -119,6 +119,32 @@ cdef base_quals_aligned_clipped(AlignedSegment a): return aligned_base_quals, aligned_bases, clipped_base_quals, clipped_bases +cdef collect_phase_tags(bint get_hp_tag, AlignedSegment a, EventResult_t er): + if not get_hp_tag: + return + if not a.has_tag("HP"): + if er.haplotype is None or 'u' not in er.haplotype: + er.haplotype = {'u': 1} + else: + er.haplotype['u'] += 1 + return + hp_tag = a.get_tag("HP") + if not er.haplotype: + er.haplotype = {hp_tag: 1} + elif hp_tag not in er.haplotype: + er.haplotype[hp_tag] = 1 + else: + er.haplotype[hp_tag] += 1 + if a.has_tag("PS"): + pg_tag = a.get_tag("PS") + if not er.phase_set: + er.phase_set = {pg_tag: 1} + elif pg_tag not in er.phase_set: + er.phase_set[pg_tag] = 1 + else: + er.phase_set[pg_tag] += 1 + + cdef count_attributes2(reads1, reads2, spanning, float insert_ppf, generic_ins, EventResult_t er, bint paired_end_reads, bint get_hp_tag): cdef float NMpri = 0 @@ -145,6 +171,7 @@ cdef count_attributes2(reads1, reads2, spanning, float insert_ppf, generic_ins, cdef float a_bases, large_gaps, n_small_gaps cdef AlignedSegment a for index, a in enumerate(itertools.chain(reads1, reads2, [i.read_a for i in generic_ins])): + seen.add(a.qname) flag = a.flag if flag & 2: er.NP += 1 @@ -162,16 +189,11 @@ cdef count_attributes2(reads1, reads2, spanning, float insert_ppf, generic_ins, n_sa += a.get_tag("SA").count(";") if a.has_tag("XA"): n_xa += a.get_tag("XA").count(";") - if get_hp_tag and a.has_tag("HP"): - hp_tag = a.get_tag("HP") - if not er.haplotypes: - er.haplotypes = {hp_tag: 1} - elif hp_tag not in er.haplotypes: - er.haplotypes[hp_tag] = 1 - else: - er.haplotypes[hp_tag] += 1 + + collect_phase_tags(get_hp_tag, a, er) a_bases, large_gaps, n_small_gaps = n_aligned_bases(a) + if a_bases > 0: n_gaps += n_small_gaps / a_bases if flag & 2304: # Supplementary (and not primary if -M if flagged using bwa) @@ -190,7 +212,7 @@ cdef count_attributes2(reads1, reads2, spanning, float insert_ppf, generic_ins, total_pri += 1 MAPQpri += a.mapq if paired_end_reads: - if index >= len(reads2) and a.qname in paired_end: + if index >= len(reads1) and a.qname in paired_end: er.pe += 1 else: paired_end.add(a.qname) @@ -217,6 +239,7 @@ cdef count_attributes2(reads1, reads2, spanning, float insert_ppf, generic_ins, clipped_bases += cb for a in spanning: + seen.add(a.qname) flag = a.flag if flag & 2: er.NP += 1 @@ -229,16 +252,10 @@ cdef count_attributes2(reads1, reads2, spanning, float insert_ppf, generic_ins, n_sa += a.get_tag("SA").count(";") if a.has_tag("XA"): n_xa += a.get_tag("XA").count(";") - if get_hp_tag and a.has_tag("HP"): - hp_tag = a.get_tag("HP") - if not er.haplotypes: - er.haplotypes = {hp_tag: 1} - elif hp_tag not in er.haplotypes: - er.haplotypes[hp_tag] = 1 - else: - er.haplotypes[hp_tag] += 1 + collect_phase_tags(get_hp_tag, a, er) a_bases, large_gaps, n_small_gaps = n_aligned_bases(a) + if a_bases > 0: n_gaps += n_small_gaps / a_bases if flag & 2304: # Supplementary @@ -257,7 +274,7 @@ cdef count_attributes2(reads1, reads2, spanning, float insert_ppf, generic_ins, total_pri += 1 if paired_end_reads: if a.qname in paired_end: # If two primary reads from same pair - er.pe += 2 + er.pe += 1 else: paired_end.add(a.qname) if a.has_tag("NM"): @@ -296,6 +313,8 @@ cdef count_attributes2(reads1, reads2, spanning, float insert_ppf, generic_ins, else: er.clip_qual_ratio = 0 + er.a_freq += len(seen) + cdef int within_read_end_position(int event_pos, CigarItem cigar_item): cdef int end @@ -918,7 +937,7 @@ cdef linear_scan_clustering(spanning, bint hp_tag): hp_count += 1 lengths = [s.cigar_item.len for s in hp_spanning] cluster_lengths(clusters, lengths, hp_spanning, eps) - # Merge near-identical haplotypes + # Merge near-identical haplotype if len(clusters) > 1: cl_srt = sorted([ [ np.mean([c.cigar_item.len for c in clst]), clst ] for clst in clusters], key=lambda x: x[0]) merged_haps = [cl_srt[0]] diff --git a/dysgu/cluster.pyx b/dysgu/cluster.pyx index 1b21d90..6ce56ad 100644 --- a/dysgu/cluster.pyx +++ b/dysgu/cluster.pyx @@ -511,8 +511,6 @@ def pipe1(args, infile, kind, regions, ibam, ref_genome, sample_name, bam_iter=N preliminaries = post_call.get_ref_base(preliminaries, ref_genome, args["symbolic_sv_size"]) - # preliminaries = post_call.filter_microsatellite_non_diploid(preliminaries) - preliminaries = extra_metrics.sample_level_density(preliminaries, regions) preliminaries = coverage_analyser.normalize_coverage_values(preliminaries) diff --git a/dysgu/coverage.pyx b/dysgu/coverage.pyx index 47e0f37..32ca3e5 100644 --- a/dysgu/coverage.pyx +++ b/dysgu/coverage.pyx @@ -621,5 +621,8 @@ def get_raw_coverage_information(events, regions, regions_depth, infile, max_cov if r.chrA != r.chrB: r.svlen = 1000000 r.mcov = max_depth + + r.a_freq = r.a_freq / reads_10kb + r.a_freq = round(max(0, min(r.a_freq, 1.0)), 2) new_events.append(r) return new_events diff --git a/dysgu/io_funcs.pyx b/dysgu/io_funcs.pyx index 4ac0bea..4d52d56 100644 --- a/dysgu/io_funcs.pyx +++ b/dysgu/io_funcs.pyx @@ -138,13 +138,13 @@ cpdef list col_names(small_output): if small_output: return ["chrA", "posA", "chrB", "posB", "sample", "event_id", "grp_id", "n_in_grp", "kind", "type", "svtype", "join_type", "cipos95A", "cipos95B", 'contigA', 'contigB', "svlen", "svlen_precise", "rep", "gc", - ["GT", "GQ", "MAPQpri", "MAPQsupp", "su", "spanning", "pe", "supp", "sc", "bnd", "NMbase", + ["GT", "GQ", "phase_set", "haplotype", "a_freq", "MAPQpri", "MAPQsupp", "su", "spanning", "pe", "supp", "sc", "bnd", "NMbase", "raw_reads_10kb", "neigh10kb", "plus", "minus", "remap_score", "remap_ed", "bad_clip_count", "fcc", "inner_cn", "outer_cn", "prob"] ] else: return ["chrA", "posA", "chrB", "posB", "sample", "event_id", "grp_id", "n_in_grp", "kind", "type", "svtype", "join_type", "cipos95A", "cipos95B", 'contigA', 'contigB', "svlen", "svlen_precise", "rep", "gc", - ["GT", "GQ", "NMpri", "NMsupp", "NMbase", "MAPQpri", "MAPQsupp", "NP", + ["GT", "GQ", "phase_set", "haplotype", "a_freq", "NMpri", "NMsupp", "NMbase", "MAPQpri", "MAPQsupp", "NP", "maxASsupp", "su", "spanning", "pe", "supp", "sc", "bnd", "sqc", "scw", "clip_qual_ratio", "block_edge", "raw_reads_10kb", "mcov", "linked", "neigh", "neigh10kb", "ref_bases", "plus", @@ -157,11 +157,6 @@ def make_main_record(r, dysgu_version, index, format_f, df_rows, add_kind, small rep, repsc, lenprec = 0, 0, 1 mean_prob, max_prob = None, None debug = False - # if abs(r['posA'] - 39084726) < 10: - # echo('found', dict(r)) - # echo(len(format_f) > 1) - # echo([(int(v["su"]), v["posA"], v["event_id"], k) for k, v in df_rows.items()]) - # debug = True if len(format_f) > 1: best = sorted([(int(v["su"]), k) for k, v in df_rows.items()], reverse=True)[0][1] probs = [v["prob"] for k, v in df_rows.items()] @@ -273,9 +268,9 @@ def make_main_record(r, dysgu_version, index, format_f, df_rows, add_kind, small info_extras += [f"MeanPROB={round(mean_prob, 3)}", f"MaxPROB={round(max_prob, 3)}"] if small_output: - fmt_keys = "GT:GQ:MAPQP:MAPQS:SU:WR:PE:SR:SC:BND:NMB:COV:NEIGH10:PS:MS:RMS:RED:BCC:FCC:ICN:OCN:PROB" + fmt_keys = "GT:GQ:PSET:HP:AF:MAPQP:MAPQS:SU:WR:PE:SR:SC:BND:NMB:COV:NEIGH10:PS:MS:RMS:RED:BCC:FCC:ICN:OCN:PROB" else: - fmt_keys = "GT:GQ:NMP:NMS:NMB:MAPQP:MAPQS:NP:MAS:SU:WR:PE:SR:SC:BND:SQC:SCW:SQR:BE:COV:MCOV:LNK:NEIGH:NEIGH10:RB:PS:MS:SBT:NG:NSA:NXA:NMU:NDC:RMS:RED:BCC:FCC:STL:RAS:FAS:ICN:OCN:CMP:RR:JIT:PROB" + fmt_keys = "GT:GQ:PSET:HP:AF:NMP:NMS:NMB:MAPQP:MAPQS:NP:MAS:SU:WR:PE:SR:SC:BND:SQC:SCW:SQR:BE:COV:MCOV:LNK:NEIGH:NEIGH10:RB:PS:MS:SBT:NG:NSA:NXA:NMU:NDC:RMS:RED:BCC:FCC:STL:RAS:FAS:ICN:OCN:CMP:RR:JIT:PROB" if "variant_seq" in r and isinstance(r["variant_seq"], str): if r['svtype'] == "INS" or r.variant_seq: @@ -318,13 +313,13 @@ def make_main_record(r, dysgu_version, index, format_f, df_rows, add_kind, small def get_fmt(r, small_output): if small_output: - v = [r["GT"], r["GQ"], r['MAPQpri'], r['MAPQsupp'], r['su'], r['spanning'], r['pe'], r['supp'], r['sc'], r['bnd'], r['NMbase'], + v = [r["GT"], r["GQ"], r["phase_set"], r["haplotype"], r['a_freq'], r['MAPQpri'], r['MAPQsupp'], r['su'], r['spanning'], r['pe'], r['supp'], r['sc'], r['bnd'], r['NMbase'], r['raw_reads_10kb'], r['neigh10kb'], r["plus"], r["minus"], r["remap_score"], r["remap_ed"], r["bad_clip_count"], round(r["fcc"], 3), round(r["inner_cn"], 3), round(r["outer_cn"], 3), r['prob'] ] return v else: - v = [r["GT"], r["GQ"], r['NMpri'], r['NMsupp'], r['NMbase'], r['MAPQpri'], + v = [r["GT"], r["GQ"], r["phase_set"], r["haplotype"], r['a_freq'], r['NMpri'], r['NMsupp'], r['NMbase'], r['MAPQpri'], r['MAPQsupp'], r['NP'], r['maxASsupp'], r['su'], r['spanning'], r['pe'], r['supp'], r['sc'], r['bnd'], round(r['sqc'], 2), round(r['scw'], 1), round(r['clip_qual_ratio'], 3), r['block_edge'], r['raw_reads_10kb'], round(r['mcov'], 2), int(r['linked']), r['neigh'], r['neigh10kb'], r['ref_bases'], r["plus"], r["minus"], round(r["strand_binom_t"], 4), r['n_gaps'], round(r["n_sa"], 2), @@ -407,6 +402,9 @@ def get_header(contig_names=""): ##ALT= ##FORMAT= ##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= ##FORMAT= ##FORMAT= ##FORMAT== 30 bp are ignored"> @@ -460,7 +458,6 @@ def get_header(contig_names=""): def to_vcf(df, args, names, outfile, show_names=True, contig_names="", header=None, small_output_f=True, sort_output=True, n_fields=None): - outfile.write(get_header(contig_names) + "\t" + "\t".join(names) + "\n") if show_names: @@ -478,21 +475,19 @@ def to_vcf(df, args, names, outfile, show_names=True, contig_names="", header=N count = 0 recs = [] - if args is not None: - add_kind = args["add_kind"] == "True" - if args["metrics"]: - small_output_f = False - if args["verbosity"] == '0': - df["contigA"] = [''] * len(df) - df["contigB"] = [''] * len(df) - elif args["verbosity"] == '1': - has_alt = [True if isinstance(i, str) and i[0] != '<' else False for i in df['variant_seq']] - df["contigA"] = ['' if a else c for c, a in zip(df['contigA'], has_alt)] - df["contigB"] = ['' if a else c for c, a in zip(df['contigB'], has_alt)] - - n_fields = len(col_names(small_output_f)[-1]) - else: + + add_kind = args["add_kind"] == "True" + if args["metrics"]: small_output_f = False + if args["verbosity"] == '0': + df["contigA"] = [''] * len(df) + df["contigB"] = [''] * len(df) + elif args["verbosity"] == '1': + has_alt = [True if (isinstance(i, str) and len(i) >= 1 and i[0] != '<') else False for i in df['variant_seq']] + df["contigA"] = ['' if a else c for c, a in zip(df['contigA'], has_alt)] + df["contigB"] = ['' if a else c for c, a in zip(df['contigB'], has_alt)] + + n_fields = len(col_names(small_output_f)[-1]) for idx, r in df.iterrows(): if idx in seen_idx: diff --git a/dysgu/map_set_utils.pxd b/dysgu/map_set_utils.pxd index 866a729..f27962a 100644 --- a/dysgu/map_set_utils.pxd +++ b/dysgu/map_set_utils.pxd @@ -297,4 +297,5 @@ cdef class EventResult: cdef public bint preciseA, preciseB, linked, modified, remapped cdef public int8_t svlen_precise cdef public object contig, contig2, contig_cigar, contig2_cigar, svtype, join_type, chrA, chrB, exp_seq, sample, type, \ - partners, GQ, GT, kind, ref_seq, variant_seq, left_ins_seq, right_ins_seq, site_info, qnames, haplotypes + partners, GQ, GT, kind, ref_seq, variant_seq, left_ins_seq, right_ins_seq, site_info + cdef public object qnames, haplotype, phase_set, a_freq diff --git a/dysgu/map_set_utils.pyx b/dysgu/map_set_utils.pyx index c882661..fec01cc 100644 --- a/dysgu/map_set_utils.pyx +++ b/dysgu/map_set_utils.pyx @@ -436,16 +436,8 @@ cdef class EventResult: self.n_sa = 0 self.n_gaps = 0 self.compress = 0 + self.a_freq = 0 self.qnames = set([]) def __repr__(self): return str(to_dict(self)) - # return str(self.to_dict()) - - # def __getstate__(self): # for pickling - # return to_dict(self) - # # return self.to_dict() - # - # def __setstate__(self, d): - # for k, v in d.items(): - # self.__setattr__(k, v) diff --git a/dysgu/post_call.py b/dysgu/post_call.py index cdfa231..931389e 100644 --- a/dysgu/post_call.py +++ b/dysgu/post_call.py @@ -9,8 +9,8 @@ import glob import gzip import pandas as pd -# import networkx as nx -# from itertools import combinations +import networkx as nx +from itertools import combinations pd.options.mode.chained_assignment = None from dysgu.scikitbio._ssw_wrapper import StripedSmithWaterman from dysgu.consensus import compute_rep @@ -324,6 +324,57 @@ def median(arr, start, end): return -1 +def get_ref_base(events, ref_genome, symbolic_sv_size): + for e in events: + if e.posA == 0: + e.posA = 1 # !! + if not e.ref_seq and not e.svtype == "BND": + if symbolic_sv_size == -1 or e.svtype in ("INS", "TRA", "INV"): + if e.posA == 0: + e.posA = 1 + try: + base = ref_genome.fetch(e.chrA, e.posA - 1, e.posA).upper() + e.ref_seq = base + except: + pass + else: + bases = "" + if e.svlen < symbolic_sv_size: + start = e.posA + end = e.posB + else: + start = e.posA - 1 + end - e.posA + try: + bases = ref_genome.fetch(e.chrA, start, end).upper() + except: + pass + if e.svtype == "DEL": + e.ref_seq = bases + e.variant_seq = bases[0] if bases else "" + else: + e.variant_seq = bases + e.ref_seq = bases[0] if bases else "" + + # + # + # + # try: + # bases = ref_genome.fetch(e.chrA, e.posA, e.posB).upper() + # + # except: + # pass + # e.ref_seq = bases + # e.variant_seq = bases[0] if len(bases) else "" + # else: + # try: + # bases = ref_genome.fetch(e.chrA, e.posA - 1, e.posA).upper() + # except: + # pass + # e.ref_seq = bases + return events + + def nCk(n,k): f = math.factorial return f(n) // f(k) // f(n-k) @@ -352,38 +403,6 @@ def strand_binom_t(events): return events -def get_ref_base(events, ref_genome, symbolic_sv_size): - for e in events: - if not e.ref_seq and not e.svtype == "BND": - if symbolic_sv_size == -1 or e.svtype == "INS" or e.svtype == "TRA": - if e.posA == 0: - e.posA = 1 - try: - base = ref_genome.fetch(e.chrA, e.posA - 1, e.posA).upper() - e.ref_seq = base - except: - pass - else: - if e.svlen < symbolic_sv_size: - if e.posA == 0: - e.posA = 1 - try: - bases = ref_genome.fetch(e.chrA, e.posA, e.posB).upper() - e.ref_seq = bases - e.variant_seq = bases[0] if len(bases) else "" - except: - pass - else: - if e.posA == 0: - e.posA = 1 - try: - base = ref_genome.fetch(e.chrA, e.posA - 1, e.posA).upper() - e.ref_seq = base - except: - pass - return events - - # from svtyper with minor modifications # https://github.com/hall-lab/svtyper/blob/master/svtyper/singlesample.py # efficient combinatorial function to handle extremely large numbers @@ -593,33 +612,102 @@ def is_reciprocal_overlapping(x1,x2, y1,y2): e2.GT = "0/1" -# def phase_haplotypes(events): -# qname_2_eventidx = defaultdict(set) -# for idx, e in enumerate(events): -# if e.qnames and e.GT.replace('/', '|') != '1|1' and e.GT[0] != '.': -# for qname in e.qnames: -# qname_2_eventidx[qname].add(idx) -# -# G = nx.Graph() -# for qname, idxs in qname_2_eventidx.items(): -# for u, v in combinations(idxs, 2): -# if G.has_edge(u, v): -# G[u][v]['weight'] += 1 -# else: -# G.add_edge(u, v, weight=1) -# bad_edges = [] -# for e in G.edges(data=True): -# if e[2]['weight'] < 2: -# bad_edges.append(e) -# G.remove_edges_from(bad_edges) -# echo('G', len(G.edges())) -# for component in nx.connected_components(G): -# echo('component', component) -# gts = [events[idx].GT for idx in component] -# echo('gts', gts) -# echo(list(f'{e.chrA}:{e.posA}({e.svlen})' for e in [events[idx] for idx in component])) -# sub = G.subgraph(component) -# echo(sub.edges(data=True)) +def low_ps_support(r, support_fraction=0.1): + min_support = round(1.5 + support_fraction * r.su) + return min_support + + + +def join_phase_sets(events, ps_id): + # Join phase sets if support is greater than threshold + G = nx.Graph() + for idx, e in enumerate(events): + if not e.phase_set: + e.phase_set = '' + continue + if len(e.phase_set) > 1: + if e.GT not in '1|11/1': # only heterozygous variants + threshold = low_ps_support(e) + passing_ps = [k for k, v in e.phase_set.items() if v >= threshold] + e.phase_set = passing_ps + if len(passing_ps) > 1: + G.add_edges_from(combinations(passing_ps, 2)) + else: + # Choose PS with maximum support + e.phase_set = str(max(e.phase_set, key=e.phase_set.get)) + else: + e.phase_set = str(list(e.phase_set.keys())[0]) + + new_phase_set = {} + for sub in nx.connected_components(G): + for n in sub: + new_phase_set[n] = str(ps_id) + ps_id += 1 + + for e in events: + if not e.phase_set or isinstance(e.phase_set, str): + continue + assert isinstance(e.phase_set, list) + new_p = list(set([new_phase_set[n] for n in e.phase_set if n in new_phase_set]))[0] + if not new_p: + e.phase_set = '' + else: + e.phase_set = str(new_p) + + +def get_hp_format(events): + max_ps = 0 + any_phase_set = False + keys = set([]) + for e in events: + if e.haplotype: + for k in e.haplotype: + if k != 'u': + keys.add(k) + keys = sorted(list(keys)) + for e in events: + if e.haplotype: + all_haps = e.haplotype + phased_haps = {k: v for k, v in e.haplotype.items() if k != 'u'} + n_haps = len(phased_haps) + n_phase_set = len(e.phase_set) if e.phase_set else 0 + if n_phase_set: + ps = max(e.phase_set, key=e.phase_set.get) + if ps > max_ps: + max_ps = ps + any_phase_set = True + else: + e.phase_set = "" + if n_haps >= 2: + if e.GT == "1/1": + e.GT = "1|1" + else: + best_hap = max(phased_haps, key=phased_haps.get) + if best_hap == 1: + e.GT = "1|0" + else: + e.GT = "0|1" + elif n_haps == 1: + if 1 in e.haplotype: + e.GT = "1|0" + else: + e.GT = "0|1" + + hp_string = '' + if n_haps >= 1: + unphased = all_haps['u'] if 'u' in all_haps else 0 + hp_string = '|'.join([str(phased_haps[k]) if k in phased_haps else '0' for k in keys]) + if unphased: + hp_string += f'_{unphased}' + if not hp_string: + hp_string = "." + + e.haplotype = hp_string + else: + e.haplotype = "." + + + return max_ps, any_phase_set def get_gt_metric2(events, mode, add_gt=True): @@ -677,68 +765,17 @@ def get_gt_metric2(events, mode, add_gt=True): fix_inconsistent_gt(events) - # any_haps = False - for e in events: - if e.haplotypes: - # any_haps = True - n_haps = len(e.haplotypes) - if n_haps >= 2: - if e.GT == "1/1": - e.GT = "1|1" - else: - best_hap = max(e.haplotypes, key=e.haplotypes.get) - if best_hap == 1: - e.GT = "1|0" - else: - e.GT = "0|1" - elif n_haps == 1: - if 1 in e.haplotypes: - e.GT = "1|0" - else: - e.GT = "0|1" + max_ps, any_phase_set = get_hp_format(events) - # phase_haplotypes(events) + if any_phase_set: + join_phase_sets(events, max_ps + 1) + else: + for e in events: + e.phase_set = "." return events -# def filter_microsatellite_non_diploid(potential): -# tmp_list = defaultdict(list) -# max_dist = 50 -# half_d = (max_dist * 0.5) -# candidates = [] -# for idx in range(len(potential)): -# ei = potential[idx] -# if (ei.svtype != "DEL" and ei.svtype != "INS") or not ei.variant_seq: -# continue -# rep = compute_rep(ei.variant_seq) -# if rep < 0.4: -# continue -# candidates.append(idx) -# tmp_list[ei.chrA].append((ei.posA - ei.cipos95A - max_dist, ei.posA + ei.cipos95A + max_dist, idx)) -# if ei.chrA == ei.chrB and ei.svlen > half_d: -# tmp_list[ei.chrA].append((ei.posB - ei.cipos95B - max_dist, ei.posB + ei.cipos95B + max_dist, idx)) -# -# nc2 = {k: iitree(v, add_value=True) for k, v in tmp_list.items()} -# seen = set([]) -# to_drop = set([]) -# for idx in candidates: -# if idx in seen: -# continue -# ei = potential[idx] -# ols = set(nc2[ei.chrA].allOverlappingIntervals(ei.posA, ei.posA + 1)) -# ols2 = set(nc2[ei.chrB].allOverlappingIntervals(ei.posB, ei.posB + 1)) -# ols |= ols2 -# ols.add(idx) -# if len(ols) <= 2: -# continue -# bad = sorted([(i, potential[i]) for i in ols], key=lambda x: x[1].su, reverse=True)[2:] -# for jdx, p in bad: -# to_drop.add(jdx) -# seen |= ols -# return [p for i, p in enumerate(potential) if i not in to_drop] - - def compressability(events): for e in events: c1 = [] diff --git a/dysgu/view.py b/dysgu/view.py index 8fdf428..ca04718 100644 --- a/dysgu/view.py +++ b/dysgu/view.py @@ -362,6 +362,9 @@ def vcf_to_df(path): "JIT": ("jitter", float), "LEFT_SVINSSEQ": ("left_ins_seq", str), "RIGHT_SVINSSEQ": ("right_ins_seq", str), + "PSET": ("phase_set", str), + "HP": ("haplotype", str), + "AF": ("a_freq", float), } # df = df[df.posA == 110156314] df.rename(columns={k: v[0] for k, v in col_map.items()}, inplace=True) From 672a90fd1be98273cd41742b3f118adbd2344023 Mon Sep 17 00:00:00 2001 From: kcleal Date: Mon, 24 Feb 2025 14:20:02 +0000 Subject: [PATCH 05/14] Fixed index error --- dysgu/post_call.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dysgu/post_call.py b/dysgu/post_call.py index 931389e..c1119b9 100644 --- a/dysgu/post_call.py +++ b/dysgu/post_call.py @@ -648,11 +648,11 @@ def join_phase_sets(events, ps_id): if not e.phase_set or isinstance(e.phase_set, str): continue assert isinstance(e.phase_set, list) - new_p = list(set([new_phase_set[n] for n in e.phase_set if n in new_phase_set]))[0] + new_p = list(set([new_phase_set[n] for n in e.phase_set if n in new_phase_set])) if not new_p: e.phase_set = '' else: - e.phase_set = str(new_p) + e.phase_set = str(new_p[0]) def get_hp_format(events): From f86145b17a2eda41d2e6543503338ae643d8750a Mon Sep 17 00:00:00 2001 From: kcleal Date: Wed, 26 Feb 2025 10:16:11 +0000 Subject: [PATCH 06/14] Changed empty PSET and HP to 0 --- README.rst | 30 ++++++++++++++++++++++-------- dysgu/post_call.py | 6 +++--- 2 files changed, 25 insertions(+), 11 deletions(-) diff --git a/README.rst b/README.rst index 977a918..c79c873 100644 --- a/README.rst +++ b/README.rst @@ -42,7 +42,7 @@ for calling structural variants using paired-end or long read sequencing data. S ⚙️ Installation --------------- -Dysgu can be installed via pip using Python 3.10 - 3.12 and has been tested on linux and MacOS. +Dysgu can be installed via pip using Python >=3.10 and has been tested on linux and MacOS. The list of python packages needed can be found in requirements.txt. To install:: @@ -79,7 +79,8 @@ For help use:: dysgu --help dysgu command --help -To use the python-API see the `documentation `_, or `jupyter notebook `_, +To use the python-API see the `documentation `_, +or `jupyter notebook `_, 🎯 Calling SVs @@ -87,13 +88,16 @@ To use the python-API see the `documentation svs.vcf -This will first run the `fetch` command that will create a temporary bam file plus other analysis files in the directory `temp_dir`. These temporary files are then analysed using the `call` program. +This will first run the `fetch` command that will create a temporary bam file plus other analysis files +in the directory `temp_dir`. These temporary files are then analysed using the `call` program. To make use of multiprocessing, set the "-p" parameter:: dysgu run -p4 reference.fa temp_dir input.bam > svs.vcf @@ -104,14 +108,24 @@ Dysgu also accepts reads from stdin. In this example, the --clean flag will remo Long reads ********** -Dysgy is designed to work with long reads aligned using minimap2 or ngmlr. Use the 'call' pipeline if starting with a bam file, or 'run' if starting with a cram:: +Dysgy is designed to work with long reads aligned using minimap2 or ngmlr. Use the 'call' pipeline if +starting with a bam file, or 'run' if starting with a cram:: dysgu call --mode pacbio-revio reference.fa temp_dir input.bam > svs.vcf dysgu call --mode nanopore-r10 reference.fa temp_dir input.bam > svs.vcf -Presets are available using the `--mode` option for PacBio `pacbio-sequel2 | pacbio-revio`, and ONT `nanopore-r9 | nanopore-r10` -If you are using using reads with higher error rates, or are unsure of the read-accuracy, it is recommended to set '--divergence auto', to infer a more conservative sequence divergence. The default is set at 0.02 which can be too stringent for lower accuracy reads and will result in more reads being filtered and lower sensitivity:: +Presets are available using the `--mode` option for PacBio `pacbio-sequel2 | pacbio-revio`, +and ONT `nanopore-r9 | nanopore-r10`. + +Since v1.8, higher calling accuracy can be achieved by adding "haplotags" to your input bam/cram file +(using hiphase / whatshap / longshot etc). Dysgu reads HP and PS tags automatically and produces +phased output calls. + +If you are using using reads with higher error rates, or are unsure of the read-accuracy, +it is recommended to set '--divergence auto', to infer a more conservative sequence divergence. +The default is set at 0.02 which can be too stringent for lower accuracy reads and will result in +more reads being filtered and lower sensitivity:: dysgu call --divergence auto --mode nanopore reference.fa temp_dir input.bam > svs.vcf diff --git a/dysgu/post_call.py b/dysgu/post_call.py index c1119b9..945c7f8 100644 --- a/dysgu/post_call.py +++ b/dysgu/post_call.py @@ -700,11 +700,11 @@ def get_hp_format(events): if unphased: hp_string += f'_{unphased}' if not hp_string: - hp_string = "." + hp_string = "0" e.haplotype = hp_string else: - e.haplotype = "." + e.haplotype = "0" return max_ps, any_phase_set @@ -771,7 +771,7 @@ def get_gt_metric2(events, mode, add_gt=True): join_phase_sets(events, max_ps + 1) else: for e in events: - e.phase_set = "." + e.phase_set = "-1" return events From 47d642cd6d9935541f8d7390f20a92e5722c4788 Mon Sep 17 00:00:00 2001 From: kcleal Date: Wed, 26 Feb 2025 10:20:37 +0000 Subject: [PATCH 07/14] PSET and HP default to empty string --- dysgu/post_call.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dysgu/post_call.py b/dysgu/post_call.py index 945c7f8..d400d10 100644 --- a/dysgu/post_call.py +++ b/dysgu/post_call.py @@ -700,11 +700,11 @@ def get_hp_format(events): if unphased: hp_string += f'_{unphased}' if not hp_string: - hp_string = "0" + hp_string = "" e.haplotype = hp_string else: - e.haplotype = "0" + e.haplotype = "" return max_ps, any_phase_set @@ -771,7 +771,7 @@ def get_gt_metric2(events, mode, add_gt=True): join_phase_sets(events, max_ps + 1) else: for e in events: - e.phase_set = "-1" + e.phase_set = "" return events From de5512f1527ffb55660205d8d2efbad936664ff4 Mon Sep 17 00:00:00 2001 From: kcleal Date: Wed, 26 Feb 2025 10:22:08 +0000 Subject: [PATCH 08/14] PSET and HP default to '-1' and '0' --- dysgu/post_call.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dysgu/post_call.py b/dysgu/post_call.py index d400d10..945c7f8 100644 --- a/dysgu/post_call.py +++ b/dysgu/post_call.py @@ -700,11 +700,11 @@ def get_hp_format(events): if unphased: hp_string += f'_{unphased}' if not hp_string: - hp_string = "" + hp_string = "0" e.haplotype = hp_string else: - e.haplotype = "" + e.haplotype = "0" return max_ps, any_phase_set @@ -771,7 +771,7 @@ def get_gt_metric2(events, mode, add_gt=True): join_phase_sets(events, max_ps + 1) else: for e in events: - e.phase_set = "" + e.phase_set = "-1" return events From f296b8452c7b20dbffd69c02aef1a55291614c8e Mon Sep 17 00:00:00 2001 From: kcleal Date: Tue, 4 Mar 2025 15:18:14 +0000 Subject: [PATCH 09/14] #125 better calling of low support SVs --- dysgu/call_component.pyx | 64 ++++------- dysgu/cluster.pyx | 2 +- dysgu/coverage.pyx | 62 +++-------- dysgu/graph.pyx | 234 +++++++++++++++++++-------------------- dysgu/post_call.py | 9 +- pyproject.toml | 2 +- 6 files changed, 155 insertions(+), 218 deletions(-) diff --git a/dysgu/call_component.pyx b/dysgu/call_component.pyx index 0d5ebcf..40c259b 100644 --- a/dysgu/call_component.pyx +++ b/dysgu/call_component.pyx @@ -672,7 +672,6 @@ cdef make_single_call(sub_informative, insert_size, insert_stdev, insert_ppf, mi as2 = None ref_bases = 0 if to_assemble or len(spanning_alignments) > 0: - # echo('MAKE SINGLE CALL') if er.preciseA: as1 = consensus.base_assemble(u_reads, er.posA, 500) if as1 and (er.svtype != "TRA" or (as1['contig'] and (as1['contig'][0].islower() or as1['contig'][-1].islower()))): @@ -965,7 +964,6 @@ cdef linear_scan_clustering(spanning, bint hp_tag): def process_spanning(bint paired_end, spanning_alignments, float divergence, length_extend, informative, generic_insertions, float insert_ppf, bint to_assemble, bint hp_tag): - # echo("PROCESS SPANNING") cdef int min_found_support = 0 cdef str svtype, jointype cdef bint passed @@ -2055,33 +2053,10 @@ cdef list multi(data, bam, int insert_size, int insert_stdev, float insert_ppf, seen.remove(u) if v in seen: seen.remove(v) - events += one_edge(rd_u, rd_v, clip_length, insert_size, insert_stdev, insert_ppf, min_support, 1, assemble_contigs, sites_info, paired_end, hp_tag) - # finds reads that should be a single partition - # u_reads, v_reads, u_single, v_single = filter_single_partitions(rd_u, rd_v) - # if len(u_reads) > 0 and len(v_reads) > 0: - # events += one_edge(rd_u, rd_v, clip_length, insert_size, insert_stdev, insert_ppf, min_support, 1, assemble_contigs, - # sites_info, paired_end) - # if u_single: - # res = single(u_single, insert_size, insert_stdev, insert_ppf, clip_length, min_support, assemble_contigs, - # sites_info, paired_end, length_extend, divergence) - # if res: - # if isinstance(res, EventResult): - # events.append(res) - # else: - # events += res - # if v_single: - # res = single(v_single, insert_size, insert_stdev, insert_ppf, clip_length, min_support, assemble_contigs, - # sites_info, paired_end, length_extend, divergence) - # if res: - # if isinstance(res, EventResult): - # events.append(res) - # else: - # events += res - # Process any singles / unconnected blocks if seen: for part_key in seen: @@ -2101,23 +2076,28 @@ cdef list multi(data, bam, int insert_size, int insert_stdev, float insert_ppf, events += res # Check for events within clustered nodes - if data.s_within: - for k, d in data.s_within: #.items(): - o_count = out_counts[k] - i_counts = len(d) - if i_counts > max_single_size: - continue - if o_count > 0 and i_counts > (2*min_support) and i_counts > o_count: - rds = get_reads(bam, d, data.reads, data.n2n, 0, info) - if len(rds) < lower_bound_support or (len(sites_info) != 0 and len(rds) == 0): - continue - res = single(rds, insert_size, insert_stdev, insert_ppf, clip_length, min_support, assemble_contigs, - sites_info, paired_end, length_extend, divergence, hp_tag) - if res: - if isinstance(res, EventResult): - events.append(res) - else: - events += res + # if data.s_within: + # for k, d in data.s_within: + # o_count = out_counts[k] + # i_counts = len(d) + # + # if i_counts > max_single_size: + # continue + # # if o_count > 0 and i_counts >= (2*min_support) and i_counts > o_count: + # # if o_count > 0 or i_counts > 0: + # rds = get_reads(bam, d, data.reads, data.n2n, 0, info) + # if len(rds) < lower_bound_support or (len(sites_info) != 0 and len(rds) == 0): + # continue + # echo('Calling single 1') + # res = single(rds, insert_size, insert_stdev, insert_ppf, clip_length, min_support, assemble_contigs, + # sites_info, paired_end, length_extend, divergence, hp_tag) + # for r in res: + # echo(r.posA, r.posB, r.svlen) + # if res: + # if isinstance(res, EventResult): + # events.append(res) + # else: + # events += res return events diff --git a/dysgu/cluster.pyx b/dysgu/cluster.pyx index 6ce56ad..dd9b56c 100644 --- a/dysgu/cluster.pyx +++ b/dysgu/cluster.pyx @@ -273,7 +273,7 @@ def pipe1(args, infile, kind, regions, ibam, ref_genome, sample_name, bam_iter=N if args["pl"] == "pe": # reads with internal SVs can be detected at lower support lower_bound_support = min_support - 1 if min_support - 1 > 1 else 1 else: - lower_bound_support = 2 + lower_bound_support = min_support component_path = f"{tdir}/components.bin" cdef bytes cmp_file = component_path.encode("ascii") # write components to file if low-mem used diff --git a/dysgu/coverage.pyx b/dysgu/coverage.pyx index 32ca3e5..1df5430 100644 --- a/dysgu/coverage.pyx +++ b/dysgu/coverage.pyx @@ -112,9 +112,6 @@ cdef class GenomeScanner: self.procs = read_threads self.buff_size = buffer_size self.current_bin = [] - self.current_cov = 0 - self.current_chrom = 0 - self.current_pos = 0 self.current_cov_array = None self.reads_dropped = 0 self.depth_d = {} @@ -152,10 +149,10 @@ cdef class GenomeScanner: if aln.flag & 1284 or aln.mapq < mq_thresh or cigar_l == 0: # not primary, duplicate or unmapped? continue - self._add_to_bin_buffer(aln, tell) + yield aln, tell tell = 0 if self.no_tell else self.input_bam.tell() - while len(self.staged_reads) > 0: - yield self.staged_reads.popleft() + # yield aln, tell + # when 'call' command was run only. generate coverage track here else: self.cpp_cov_track.set_max_cov(self.max_cov) @@ -167,7 +164,7 @@ cdef class GenomeScanner: cigar_l = aln._delegate.core.n_cigar if cigar_l == 0 or aln.flag & 1284: - tell = 0 if self.no_tell else self.input_bam.tell() + # tell = 0 if self.no_tell else self.input_bam.tell() continue if aln.rname != self.current_tid: @@ -203,18 +200,14 @@ cdef class GenomeScanner: index_start += length if aln.mapq < mq_thresh or not good_read or not self.cpp_cov_track.cov_val_good(self.current_tid, aln.rname, pos): - tell = 0 if self.no_tell else self.input_bam.tell() continue - - self._add_to_bin_buffer(aln, tell) + yield aln, tell tell = 0 if self.no_tell else self.input_bam.tell() - while len(self.staged_reads) > 0: - yield self.staged_reads.popleft() + # yield aln, tell + if self.current_tid != -1 and self.current_tid <= self.input_bam.nreferences: out_path = "{}/{}.dysgu_chrom.bin".format(self.cov_track_path, self.input_bam.get_reference_name(self.current_tid)).encode("ascii") self.cpp_cov_track.write_track(out_path) - if len(self.current_bin) > 0: - yield self.current_bin else: logging.info("Searching --regions and collecting reads from mate-pair locations") @@ -291,16 +284,13 @@ cdef class GenomeScanner: continue if not self.cpp_cov_track.cov_val_good(self.current_tid, aln.rname, pos): continue - self._add_to_bin_buffer(aln, tell) + yield aln, tell tell = 0 if self.no_tell else self.input_bam.tell() - while len(self.staged_reads) > 0: - yield self.staged_reads.popleft() + # yield aln, tell if self.current_tid != -1 and self.current_tid <= self.input_bam.nreferences: out_path = "{}/{}.dysgu_chrom.bin".format(self.cov_track_path, self.input_bam.get_reference_name(self.current_tid)).encode("ascii") self.cpp_cov_track.write_track(out_path) - if len(self.current_bin) > 0: - yield self.current_bin def get_read_properties(self, int max_tlen, int insert_median, int insert_stdev, int read_len, ibam=None, find_divergence=False): @@ -352,7 +342,8 @@ cdef class GenomeScanner: continue tell = 0 if self.no_tell else self.input_bam.tell() if self.no_tell: - self._add_to_bin_buffer(a, tell) + self.staged_reads.append((a, tell)) + # self._add_to_bin_buffer(a, tell) if a.rname != self.current_tid: if self.current_tid != -1 and self.current_tid <= self.input_bam.nreferences: out_path = "{}/{}.dysgu_chrom.bin".format(self.cov_track_path.outpath, self.input_bam.get_reference_name(self.current_tid)).encode("ascii") @@ -447,7 +438,7 @@ cdef class GenomeScanner: # Read the rest of the genome, reads are sent in blocks cdef int total_reads = 0 for staged in self._get_reads(): - total_reads += len(staged) + total_reads += 1 # len(staged) yield staged if total_reads == 0: logging.critical("No reads found, finishing") @@ -464,33 +455,6 @@ cdef class GenomeScanner: elif self.no_tell: raise BufferError("Read buffer has overflowed, increase --buffer-size") - def _add_to_bin_buffer(self, AlignedSegment a, tell): - # Calculates coverage information on fly, drops high coverage regions, buffers reads - cdef int flag = a.flag - cdef uint32_t cigar_l = a._delegate.core.n_cigar - if flag & 1540 or cigar_l == 0: # or a.seq is None: - return - cdef int rname = a.rname - cdef int apos = a.pos - cdef int bin_pos = int(apos / 100) - cdef int ref_length - cdef str reference_name = "" - cdef int aend = a.reference_end - cdef float current_coverage - if self.current_chrom != rname: - self.current_chrom = rname - # in_roi = False - # if self.overlap_regions: - # in_roi = intersecter(self.overlap_regions, a.rname, apos, apos+1) - if rname == self.current_chrom and bin_pos == self.current_pos: - self.current_bin.append((a, tell)) - else: - if len(self.current_bin) != 0: - self.staged_reads.append(self.current_bin) - self.current_chrom = rname - self.current_pos = bin_pos - self.current_bin = [(a, tell)] - cdef float add_coverage(int start, int end, DTYPE_t[:] chrom_depth) nogil: cdef float fs = start / 100 @@ -622,7 +586,7 @@ def get_raw_coverage_information(events, regions, regions_depth, infile, max_cov r.svlen = 1000000 r.mcov = max_depth - r.a_freq = r.a_freq / reads_10kb + r.a_freq = r.a_freq / (reads_10kb + 1e-5) r.a_freq = round(max(0, min(r.a_freq, 1.0)), 2) new_events.append(r) return new_events diff --git a/dysgu/graph.pyx b/dysgu/graph.pyx index cd0c984..2f0353d 100644 --- a/dysgu/graph.pyx +++ b/dysgu/graph.pyx @@ -827,7 +827,6 @@ cdef void add_to_graph(Py_SimpleGraph G, AlignedSegment r, PairedEndScoper_t pe_ pe_scope.find_other_nodes(node_name, chrom, event_pos, chrom2, pos2, read_enum, length_from_cigar, trust_ins_len) - # other_nodes = pe_scope.find_other_nodes(node_name, chrom, event_pos, chrom2, pos2, read_enum, length_from_cigar, trust_ins_len) if not dereference(found_nodes_exact).empty(): for other_node in dereference(found_nodes_exact): if node_to_name.same_template(node_name, other_node): @@ -1289,144 +1288,135 @@ cpdef tuple construct_graph(genome_scanner, infile, int max_dist, int clustering cdef int n_checked_for_hp_tag = 0 if no_phase: n_checked_for_hp_tag = 10_001 - for chunk in genome_scanner.iter_genome(): - for r, tell in chunk: - if r.mapq < mapq_thresh: + + for r, tell in genome_scanner.iter_genome(): + if r.mapq < mapq_thresh: + continue + pos2 = -1 + event_pos = r.pos + added = False + clipped = 0 + events_to_add.clear() + cigar_l = r._delegate.core.n_cigar + cigar_p = bam_get_cigar(r._delegate) + if not paired_end: + if n_checked_for_hp_tag < 10_000: + n_checked_for_hp_tag += 1 + if r.has_tag("HP"): + hp_tag_found = True + n_checked_for_hp_tag = 10_001 + if r.has_tag("NM") and edit_distance_too_high(cigar_p, cigar_l, max_divergence, r.get_tag("NM")): continue - pos2 = -1 - event_pos = r.pos - added = False - clipped = 0 - events_to_add.clear() - cigar_l = r._delegate.core.n_cigar - cigar_p = bam_get_cigar(r._delegate) - if not paired_end: - if n_checked_for_hp_tag < 10_000: - n_checked_for_hp_tag += 1 - if r.has_tag("HP"): - hp_tag_found = True - n_checked_for_hp_tag = 10_001 - if r.has_tag("NM") and edit_distance_too_high(cigar_p, cigar_l, max_divergence, r.get_tag("NM")): + + if cigar_l > 1: + if r.has_tag("SA"): + # Set cigar-index to -1 means it is unset, will be determined during SA parsing + cigar_index = -1 + process_alignment(G, r, clip_l, max_dist, gettid, + overlap_regions, clustering_dist, pe_scope, + cigar_index, event_pos, paired_end, tell, genome_scanner, + template_edges, node_to_name, pos2, mapq_thresh, clip_scope, ReadEnum_t.SPLIT, + bad_clip_counter, mm_only, site_adder, 0, trust_ins_len) + added = True + for cigar_index in range(cigar_l): + cigar_value = cigar_p[cigar_index] + opp = cigar_value & 15 + length = cigar_value >> 4 + + if opp == 0 or opp == 7 or opp == 8: + if find_n_aligned_bases: + n_aligned_bases += length + event_pos += length continue - if cigar_l > 1: - if r.has_tag("SA"): - # Set cigar-index to -1 means it is unset, will be determined during SA parsing - cigar_index = -1 + if opp == 1: + if length >= min_sv_size: + pos2 = event_pos + length + events_to_add.push_back(make_cigar_event(opp, cigar_index, event_pos, pos2, length, ReadEnum_t.INSERTION)) + added = True + elif opp == 2: + if length >= min_sv_size: + pos2 = event_pos + length + events_to_add.push_back(make_cigar_event(opp, cigar_index, event_pos, pos2, length, ReadEnum_t.DELETION)) + added = True + event_pos += length + elif opp == 3: + if length >= min_sv_size: + pos2 = event_pos + length + events_to_add.push_back(make_cigar_event(opp, cigar_index, event_pos, pos2, length, ReadEnum_t.SKIP)) + added = True + event_pos += length + else: + if opp != 4 and opp != 5: + event_pos += length + elif opp == 4 and length >= clip_l: + clipped = 1 + + if (paired_end and not added and contigs) or not paired_end: + # Whole alignment will be used, try infer position from soft-clip + cigar_index = -1 + pos2 = -1 + left_clip_size = 0 + right_clip_size = 0 + clip_sizes_hard(r, &left_clip_size, &right_clip_size) # soft and hard-clips + if r.flag & 8 and clipped: # paired by inference + # skip if both ends are clipped, usually means its a chunk of badly mapped sequence + # if not (left_clip_size and right_clip_size) and ((paired_end and good_quality_clip(r, 20)) or (not paired_end and) ): + if not (left_clip_size and right_clip_size) and good_quality_clip(r, 20): + # Mate is unmapped, insertion type. Only add if soft-clip is available process_alignment(G, r, clip_l, max_dist, gettid, overlap_regions, clustering_dist, pe_scope, cigar_index, event_pos, paired_end, tell, genome_scanner, - template_edges, node_to_name, pos2, mapq_thresh, clip_scope, ReadEnum_t.SPLIT, - bad_clip_counter, mm_only, site_adder, 0, trust_ins_len) - added = True - for cigar_index in range(cigar_l): - cigar_value = cigar_p[cigar_index] - opp = cigar_value & 15 - length = cigar_value >> 4 - - if opp == 0 or opp == 7 or opp == 8: - if find_n_aligned_bases: - n_aligned_bases += length - event_pos += length - continue - - if opp == 1: - if length >= min_sv_size: - # Insertion type - # if event_pos < 998000 or event_pos > 999000: - # continue - pos2 = event_pos + length - # if not events_to_add.empty() and event_pos - events_to_add.back().event_pos < elide_threshold and events_to_add.back().read_enum == ReadEnum_t.INSERTION: - # continue - # else: - events_to_add.push_back(make_cigar_event(opp, cigar_index, event_pos, pos2, length, ReadEnum_t.INSERTION)) - added = True - elif opp == 2: - if length >= min_sv_size: - pos2 = event_pos + length - # if not events_to_add.empty() and event_pos - events_to_add.back().event_pos < elide_threshold and events_to_add.back().read_enum == ReadEnum_t.DELETION: - # continue - # else: - events_to_add.push_back(make_cigar_event(opp, cigar_index, event_pos, pos2, length, ReadEnum_t.DELETION)) - added = True - event_pos += length - elif opp == 3: - if length >= min_sv_size: - pos2 = event_pos + length - events_to_add.push_back(make_cigar_event(opp, cigar_index, event_pos, pos2, length, ReadEnum_t.SKIP)) - added = True - event_pos += length - else: - if opp != 4 and opp != 5: - event_pos += length - elif opp == 4 and length >= clip_l: - clipped = 1 - - if (paired_end and not added and contigs) or not paired_end: - # Whole alignment will be used, try infer position from soft-clip - cigar_index = -1 - pos2 = -1 - left_clip_size = 0 - right_clip_size = 0 - clip_sizes_hard(r, &left_clip_size, &right_clip_size) # soft and hard-clips - if r.flag & 8 and clipped: # paired by inference - # skip if both ends are clipped, usually means its a chunk of badly mapped sequence - # if not (left_clip_size and right_clip_size) and ((paired_end and good_quality_clip(r, 20)) or (not paired_end and) ): - if not (left_clip_size and right_clip_size) and good_quality_clip(r, 20): - # Mate is unmapped, insertion type. Only add if soft-clip is available + template_edges, node_to_name, + pos2, mapq_thresh, clip_scope, ReadEnum_t.BREAKEND, bad_clip_counter, + mm_only, site_adder, 0, trust_ins_len) + else: + # Use whole read, could be normal or discordant + if not paired_end: + if max(left_clip_size, right_clip_size) > 250: + read_enum = ReadEnum_t.BREAKEND + if left_clip_size > right_clip_size: + event_pos = r.pos # else reference_end is used process_alignment(G, r, clip_l, max_dist, gettid, overlap_regions, clustering_dist, pe_scope, cigar_index, event_pos, paired_end, tell, genome_scanner, template_edges, node_to_name, - pos2, mapq_thresh, clip_scope, ReadEnum_t.BREAKEND, bad_clip_counter, + pos2, mapq_thresh, clip_scope, read_enum, bad_clip_counter, mm_only, site_adder, 0, trust_ins_len) - else: - # Use whole read, could be normal or discordant - if not paired_end: - if max(left_clip_size, right_clip_size) > 250: - read_enum = ReadEnum_t.BREAKEND - if left_clip_size > right_clip_size: - event_pos = r.pos # else reference_end is used - process_alignment(G, r, clip_l, max_dist, gettid, - overlap_regions, clustering_dist, pe_scope, - cigar_index, event_pos, paired_end, tell, genome_scanner, - template_edges, node_to_name, - pos2, mapq_thresh, clip_scope, read_enum, bad_clip_counter, - mm_only, site_adder, 0, trust_ins_len) + else: + if r.flag & 2 and abs(r.tlen) < max_dist and r.rname == r.rnext: + if not clipped: + continue + read_enum = ReadEnum_t.BREAKEND else: - if r.flag & 2 and abs(r.tlen) < max_dist and r.rname == r.rnext: - if not clipped: - continue - read_enum = ReadEnum_t.BREAKEND - else: - read_enum = ReadEnum_t.DISCORDANT + read_enum = ReadEnum_t.DISCORDANT - if left_clip_size or right_clip_size: - if left_clip_size > right_clip_size: - event_pos = r.pos # else reference_end is used - process_alignment(G, r, clip_l, max_dist, gettid, - overlap_regions, clustering_dist, pe_scope, - cigar_index, event_pos, paired_end, tell, genome_scanner, - template_edges, node_to_name, - pos2, mapq_thresh, clip_scope, read_enum, bad_clip_counter, - mm_only, site_adder, 0, trust_ins_len) - # process within-read events - if not events_to_add.empty(): - itr_events = events_to_add.begin() - while itr_events != events_to_add.end(): - v = dereference(itr_events) - if v.cigar_skip: - pos2 = v.pos2 - else: - pos2 = v.event_pos + v.length # fall back on original cigar event length + if left_clip_size or right_clip_size: + if left_clip_size > right_clip_size: + event_pos = r.pos # else reference_end is used process_alignment(G, r, clip_l, max_dist, gettid, overlap_regions, clustering_dist, pe_scope, - v.cigar_index, v.event_pos, paired_end, tell, genome_scanner, + cigar_index, event_pos, paired_end, tell, genome_scanner, template_edges, node_to_name, - v.pos2, mapq_thresh, clip_scope, v.read_enum, bad_clip_counter, - mm_only, site_adder, v.length, trust_ins_len) - preincrement(itr_events) + pos2, mapq_thresh, clip_scope, read_enum, bad_clip_counter, + mm_only, site_adder, 0, trust_ins_len) + # process within-read events + if not events_to_add.empty(): + itr_events = events_to_add.begin() + while itr_events != events_to_add.end(): + v = dereference(itr_events) + if v.cigar_skip: + pos2 = v.pos2 + else: + pos2 = v.event_pos + v.length # fall back on original cigar event length + process_alignment(G, r, clip_l, max_dist, gettid, + overlap_regions, clustering_dist, pe_scope, + v.cigar_index, v.event_pos, paired_end, tell, genome_scanner, + template_edges, node_to_name, + v.pos2, mapq_thresh, clip_scope, v.read_enum, bad_clip_counter, + mm_only, site_adder, v.length, trust_ins_len) + preincrement(itr_events) add_template_edges(G, template_edges) if site_adder: diff --git a/dysgu/post_call.py b/dysgu/post_call.py index 945c7f8..4ae45cb 100644 --- a/dysgu/post_call.py +++ b/dysgu/post_call.py @@ -2,7 +2,7 @@ import numpy as np from dysgu.map_set_utils import echo from dysgu import re_map -from dysgu.io_funcs import reverse_complement, intersecter #, iitree +from dysgu.io_funcs import reverse_complement, intersecter import zlib import math import pickle @@ -17,8 +17,11 @@ from collections import defaultdict, Counter import os import warnings -from sklearn.exceptions import InconsistentVersionWarning -warnings.filterwarnings(action='ignore', category=InconsistentVersionWarning) +try: + from sklearn.exceptions import InconsistentVersionWarning + warnings.filterwarnings(action='ignore', category=InconsistentVersionWarning) +except ImportError: + pass def get_badclip_metric(events, bad_clip_counter, bam, regions): diff --git a/pyproject.toml b/pyproject.toml index 91c0451..13eaf63 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,7 +10,7 @@ build-backend = "setuptools.build_meta" [project] name = "dysgu" -version = "1.8.0" +version = "1.8.1" description = "Structural variant calling" authors = [ { name = "Kez Cleal", email = "clealk@cardiff.ac.uk" } From 8713824fccb15ff44e8d943b8af257b8f4a3d3fb Mon Sep 17 00:00:00 2001 From: kcleal Date: Tue, 4 Mar 2025 15:26:16 +0000 Subject: [PATCH 10/14] #125 better calling of low support SVs --- .github/workflows/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index bf6a5fa..bdf396d 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -10,7 +10,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - os: [ubuntu-20.04, macOS-14] + os: [ubuntu-22.04, macOS-14] steps: - uses: actions/checkout@v4 - name: Set correct paths for Homebrew on macOS From 02d46b90cf8b7ac97bfa7efe5a58f0cd3083aa4f Mon Sep 17 00:00:00 2001 From: kcleal Date: Tue, 4 Mar 2025 16:06:57 +0000 Subject: [PATCH 11/14] Updated requirements.txt --- pyproject.toml | 2 +- requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 13eaf63..7c50a01 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,7 +28,7 @@ dependencies = [ "networkx >= 2.4", "scikit-learn >= 0.22", "sortedcontainers", - "superintervals", + "superintervals >= 0.2.10", "lightgbm" ] diff --git a/requirements.txt b/requirements.txt index 96ee1eb..9d57a6b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,5 +7,5 @@ pandas pysam>=0.22 networkx>=2.4 scikit-learn>=0.22 -superintervals +superintervals>=0.2.10 lightgbm From 27e28507bfbfdfddcd3e272c574b835abf865ac8 Mon Sep 17 00:00:00 2001 From: kcleal Date: Wed, 5 Mar 2025 13:35:20 +0000 Subject: [PATCH 12/14] Fixed large record output, switched to symbolic --- dysgu/call_component.pyx | 67 +++++++++++++++++++++++++++------------- dysgu/post_call.py | 59 ++++++++++++++--------------------- 2 files changed, 68 insertions(+), 58 deletions(-) diff --git a/dysgu/call_component.pyx b/dysgu/call_component.pyx index 40c259b..e18ceb6 100644 --- a/dysgu/call_component.pyx +++ b/dysgu/call_component.pyx @@ -2076,28 +2076,51 @@ cdef list multi(data, bam, int insert_size, int insert_stdev, float insert_ppf, events += res # Check for events within clustered nodes - # if data.s_within: - # for k, d in data.s_within: - # o_count = out_counts[k] - # i_counts = len(d) - # - # if i_counts > max_single_size: - # continue - # # if o_count > 0 and i_counts >= (2*min_support) and i_counts > o_count: - # # if o_count > 0 or i_counts > 0: - # rds = get_reads(bam, d, data.reads, data.n2n, 0, info) - # if len(rds) < lower_bound_support or (len(sites_info) != 0 and len(rds) == 0): - # continue - # echo('Calling single 1') - # res = single(rds, insert_size, insert_stdev, insert_ppf, clip_length, min_support, assemble_contigs, - # sites_info, paired_end, length_extend, divergence, hp_tag) - # for r in res: - # echo(r.posA, r.posB, r.svlen) - # if res: - # if isinstance(res, EventResult): - # events.append(res) - # else: - # events += res + # if paired_end and data.s_within: + # for item in events: + # echo(item.svlen) + # echo('s_within', data.s_within) + # echo('parts', data.parts) + # echo('s_between', data.s_between) + + # for k, d in data.s_within: + # o_count = out_counts[k] + # i_counts = len(d) + # + # if i_counts > max_single_size: + # continue + # # if o_count > 0 and i_counts >= (2*min_support) and i_counts > o_count: + # # if o_count > 0 or i_counts > 0: + # rds = get_reads(bam, d, data.reads, data.n2n, 0, info) + # if len(rds) < lower_bound_support or (len(sites_info) != 0 and len(rds) == 0): + # continue + # res = single(rds, insert_size, insert_stdev, insert_ppf, clip_length, min_support, assemble_contigs, + # sites_info, paired_end, length_extend, divergence, hp_tag) + # if res: + # if isinstance(res, EventResult): + # events.append(res) + # else: + # events += res + # for item in events: + # echo(item.svlen) + # echo() + if paired_end and data.s_within: + for k, d in data.s_within: + o_count = out_counts[k] + i_counts = len(d) + if i_counts > max_single_size: + continue + if o_count > 0 and i_counts > (2 * min_support) and i_counts > o_count: + rds = get_reads(bam, d, data.reads, data.n2n, 0, info) + if len(rds) < lower_bound_support or (len(sites_info) != 0 and len(rds) == 0): + continue + res = single(rds, insert_size, insert_stdev, insert_ppf, clip_length, min_support, assemble_contigs, + sites_info, paired_end, length_extend, divergence, hp_tag) + if res: + if isinstance(res, EventResult): + events.append(res) + else: + events += res return events diff --git a/dysgu/post_call.py b/dysgu/post_call.py index 4ae45cb..6f2eb9c 100644 --- a/dysgu/post_call.py +++ b/dysgu/post_call.py @@ -330,51 +330,38 @@ def median(arr, start, end): def get_ref_base(events, ref_genome, symbolic_sv_size): for e in events: if e.posA == 0: - e.posA = 1 # !! + e.posA = 1 if not e.ref_seq and not e.svtype == "BND": if symbolic_sv_size == -1 or e.svtype in ("INS", "TRA", "INV"): - if e.posA == 0: - e.posA = 1 try: base = ref_genome.fetch(e.chrA, e.posA - 1, e.posA).upper() e.ref_seq = base except: pass else: - bases = "" - if e.svlen < symbolic_sv_size: - start = e.posA + if e.svlen < symbolic_sv_size: # Fetch the variant sequence + start = e.posA - 1 # Adjust to 0-based and include preceding base end = e.posB - else: - start = e.posA - 1 - end - e.posA - try: - bases = ref_genome.fetch(e.chrA, start, end).upper() - except: - pass - if e.svtype == "DEL": - e.ref_seq = bases - e.variant_seq = bases[0] if bases else "" - else: - e.variant_seq = bases - e.ref_seq = bases[0] if bases else "" - - # - # - # - # try: - # bases = ref_genome.fetch(e.chrA, e.posA, e.posB).upper() - # - # except: - # pass - # e.ref_seq = bases - # e.variant_seq = bases[0] if len(bases) else "" - # else: - # try: - # bases = ref_genome.fetch(e.chrA, e.posA - 1, e.posA).upper() - # except: - # pass - # e.ref_seq = bases + try: + bases = ref_genome.fetch(e.chrA, start, end).upper() + except: + bases = "" + if e.svtype == "DEL": + e.ref_seq = bases + e.variant_seq = bases[0] if bases else "" + else: # For duplications, etc. + e.variant_seq = bases + e.ref_seq = bases[0] if bases else "" + else: # Use symbolic representation + try: + base = ref_genome.fetch(e.chrA, e.posA - 1, e.posA).upper() + e.ref_seq = base + e.variant_seq = base # For non-DEL SVs + if e.svtype == "DEL": + e.variant_seq = base + except: + pass + return events From 561d5914c622a02fa6511a5a1a52aeb3abb6b4a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ond=C5=99ej=20Sloup?= Date: Sun, 6 Apr 2025 11:52:42 +0200 Subject: [PATCH 13/14] fix: relax input BAM file validation (#131) --- dysgu/coverage.pyx | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/dysgu/coverage.pyx b/dysgu/coverage.pyx index 1df5430..7b30641 100644 --- a/dysgu/coverage.pyx +++ b/dysgu/coverage.pyx @@ -328,14 +328,8 @@ cdef class GenomeScanner: if "SO" in hd: if hd["SO"] == "unsorted": raise ValueError("Input bam file must be sorted") - - prev_alignment = None + for a in file_iter: - # double check with index - if prev_alignment and prev_alignment.reference_id > a.reference_id: - raise ValueError("Input bam file must be sorted") - prev_alignment = a - if ibam is None: cigar_l = a._delegate.core.n_cigar if a.flag & 1284 or a.mapq < self.mapq_threshold or cigar_l == 0: From d885e1cf54ff014abe488490f8afc1d029a43be7 Mon Sep 17 00:00:00 2001 From: kcleal Date: Mon, 7 Apr 2025 11:41:45 +0100 Subject: [PATCH 14/14] Zero based POS --- dysgu/post_call.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dysgu/post_call.py b/dysgu/post_call.py index 6f2eb9c..0f3c97a 100644 --- a/dysgu/post_call.py +++ b/dysgu/post_call.py @@ -339,8 +339,8 @@ def get_ref_base(events, ref_genome, symbolic_sv_size): except: pass else: - if e.svlen < symbolic_sv_size: # Fetch the variant sequence - start = e.posA - 1 # Adjust to 0-based and include preceding base + if e.svlen < symbolic_sv_size: # Fetch the variant sequence + start = e.posA end = e.posB try: bases = ref_genome.fetch(e.chrA, start, end).upper()