From dc0ff467ff128abc40f6e1b3090ac58f4c2d96d9 Mon Sep 17 00:00:00 2001
From: Kez Cleal <42997789+kcleal@users.noreply.github.com>
Date: Fri, 31 Jan 2025 10:59:43 +0000
Subject: [PATCH 01/14] Dysgu_dev <-- master (#121)

* Dysgu dev (#115)

* dysgu_dev <- master (#108)

* correct usage of update_filter_value in filter_normals

It seems like at some point the usage of update_filter_value changed, because in several spots, it is missing the sample_name argument. This breaks `dysgu filter --keep-all`.

* Update main.yml

* Update main.yml

* Update requirements.txt

* updated build process to pyproject.toml (#103)

* v1.6.6 updated README

* Update main.yml

* Added pyproject.toml

* Updated pyproject.toml and setup.py

* Update pyproject.toml

---------

Co-authored-by: Dr. K. D. Murray <1560490+kdm9@users.noreply.github.com>

* Better recall+precision for long-reads. Improved merging pipeline. Faster runtime. Code refactoring. WIP # [skip ci]

* Fixed issues with overriding presets. --mode option updated # [skip ci]

* Fixed issues with overriding presets. --mode option updated to support r10 and revio, new presets available. --no-gt flat removed # [skip ci]

* Fixed CLI issues for tests # [skip ci]

* Fixed API issue

* v1.7.0

* added deprecated warning for mode

* Added filter for high divergence reads (long-reads only) # [skip ci]

* Better read merging for long reads. New clustering methods for long reads. Improved runtime.

* Update main.yml

* Update main.yml

* Update osx-deps

* Update main.yml macos11

* Update main.yml

* Update main.yml

* Update main.yml

* Update osx-deps

* Update osx-deps

* Update osx-deps

* Update osx-deps

* Update main.yml

* Update osx-deps

* Update main.yml

* Updated workflow

* Updated workflow

* Updated workflow

* Update main.yml

* Updated workflow

---------

Co-authored-by: Dr. K. D. Murray <1560490+kdm9@users.noreply.github.com>

* Fixes compile error for clang

* Fixed merge None type error. Added wb3 as default compression level for long-reads (run-pipeline) (#116)

* Updated workflow

* Updated workflow

* Fixed compiler warning

---------

Co-authored-by: Dr. K. D. Murray <1560490+kdm9@users.noreply.github.com>
---
 .github/workflows/main.yml |  2 --
 dysgu/call_component.pyx   | 18 ++++++++----------
 dysgu/graph.pyx            |  4 ++--
 dysgu/map_set_utils.pxd    |  4 ++--
 dysgu/map_set_utils.pyx    | 12 ++++++------
 setup.py                   |  1 -
 6 files changed, 18 insertions(+), 23 deletions(-)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 33b9b6f..bf6a5fa 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -11,8 +11,6 @@ jobs:
     strategy:
       matrix:
         os: [ubuntu-20.04, macOS-14]
-        #os: [ubuntu-latest, macOS-11]
-
     steps:
       - uses: actions/checkout@v4
       - name: Set correct paths for Homebrew on macOS
diff --git a/dysgu/call_component.pyx b/dysgu/call_component.pyx
index 94a608d..71b3a84 100644
--- a/dysgu/call_component.pyx
+++ b/dysgu/call_component.pyx
@@ -105,7 +105,7 @@ cdef base_quals_aligned_clipped(AlignedSegment a):
     cdef float clipped_base_quals = 0
     cdef int left_clip = 0
     cdef int right_clip = 0
-    clip_sizes(a, left_clip, right_clip)
+    clip_sizes(a, &left_clip, &right_clip)
     clipped_bases = left_clip + right_clip
     cdef const unsigned char[:] quals = a.query_qualities
     cdef int i
@@ -154,7 +154,7 @@ cdef count_attributes2(reads1, reads2, spanning, float insert_ppf, generic_ins,
             er.n_unmapped_mates += 1
         left_clip = 0
         right_clip = 0
-        clip_sizes_hard(a, left_clip, right_clip)
+        clip_sizes_hard(a, &left_clip, &right_clip)
         if left_clip > 0 and right_clip > 0:
             er.double_clips += 1
         has_sa = a.has_tag("SA")
@@ -206,7 +206,7 @@ cdef count_attributes2(reads1, reads2, spanning, float insert_ppf, generic_ins,
 
         left_clip = 0
         right_clip = 0
-        clip_sizes(a, left_clip, right_clip)
+        clip_sizes(a, &left_clip, &right_clip)
         if left_clip or right_clip:
             er.sc += 1
         if a.flag & 1:  # paired read
@@ -222,7 +222,7 @@ cdef count_attributes2(reads1, reads2, spanning, float insert_ppf, generic_ins,
             er.NP += 1
         left_clip = 0
         right_clip = 0
-        clip_sizes_hard(a, left_clip, right_clip)
+        clip_sizes_hard(a, &left_clip, &right_clip)
         if left_clip > 0 and right_clip > 0:
             er.double_clips += 1
         if a.has_tag("SA"):
@@ -507,7 +507,7 @@ cdef make_generic_insertion_item(aln, int insert_size, int insert_std):
         v_item.svtype = "BND"
         left_clip = 0
         right_clip = 0
-        clip_sizes(aln, left_clip, right_clip)
+        clip_sizes(aln, &left_clip, &right_clip)
         clip_s = max(left_clip, right_clip)
         rand_insert_pos = 100 if not clip_s else clip_s
     v_item.inferred_sv_len = 0 if rand_insert_pos < 0 else rand_insert_pos
@@ -945,6 +945,7 @@ cdef linear_scan_clustering(spanning, bint hp_tag):
 
 def process_spanning(bint paired_end, spanning_alignments, float divergence, length_extend, informative,
                      generic_insertions, float insert_ppf, bint to_assemble, bint hp_tag):
+
     # echo("PROCESS SPANNING")
     cdef int min_found_support = 0
     cdef str svtype, jointype
@@ -996,6 +997,7 @@ def process_spanning(bint paired_end, spanning_alignments, float divergence, len
 
         # 1.7.0
         # svlen = int(np.median([sp.cigar_item.len for sp in spanning_alignments]))
+
         posA = spanning_alignments[best_index].pos
         posB = spanning_alignments[best_index].end
         er.preciseA = True
@@ -2020,8 +2022,7 @@ cdef list multi(data, bam, int insert_size, int insert_stdev, float insert_ppf,
         for (u, v), d in data.s_between: #.items():
             rd_u = get_reads(bam, d[0], data.reads, n2n, add_to_buffer, info)   # [(Nodeinfo, alignment)..]
             rd_v = get_reads(bam, d[1], data.reads, n2n, add_to_buffer, info)
-            # echo("rd_u", [rr[1].qname for rr in rd_u])
-            # echo("rd_v", [rr[1].qname for rr in rd_v])
+
             total_reads = len(rd_u) + len(rd_v)
             buffered_reads += total_reads
             if add_to_buffer and buffered_reads > 50000:
@@ -2106,9 +2107,6 @@ cpdef list call_from_block_model(bam, data, clip_length, insert_size, insert_std
     n_parts = len(data.parts) if data.parts else 0
     events = []
     info = data.info
-    # echo(data.parts)
-    # echo(data.s_between)
-    # echo(data.s_within)
     if data.reads is None:
         data.reads = {}
     # next deal with info - need to filter these into the partitions, then deal with them in single / one_edge
diff --git a/dysgu/graph.pyx b/dysgu/graph.pyx
index 78a0e6e..cd0c984 100644
--- a/dysgu/graph.pyx
+++ b/dysgu/graph.pyx
@@ -252,7 +252,7 @@ cdef class ClipScoper:
                unordered_set[int]& clustered_nodes):
         cdef int clip_left = 0
         cdef int clip_right = 0
-        clip_sizes(r, clip_left, clip_right)
+        clip_sizes(r, &clip_left, &clip_right)
         if chrom != self.current_chrom:
             self.scope_left.clear()
             self.scope_right.clear()
@@ -1368,7 +1368,7 @@ cpdef tuple construct_graph(genome_scanner, infile, int max_dist, int clustering
                 pos2 = -1
                 left_clip_size = 0
                 right_clip_size = 0
-                clip_sizes_hard(r, left_clip_size, right_clip_size)  # soft and hard-clips
+                clip_sizes_hard(r, &left_clip_size, &right_clip_size)  # soft and hard-clips
                 if r.flag & 8 and clipped:  # paired by inference
                     # skip if both ends are clipped, usually means its a chunk of badly mapped sequence
                     # if not (left_clip_size and right_clip_size) and ((paired_end and good_quality_clip(r, 20)) or (not paired_end and) ):
diff --git a/dysgu/map_set_utils.pxd b/dysgu/map_set_utils.pxd
index 2dcb72b..866a729 100644
--- a/dysgu/map_set_utils.pxd
+++ b/dysgu/map_set_utils.pxd
@@ -266,9 +266,9 @@ cdef extern from "<map>" namespace "std" nogil:
 
 cdef int cigar_exists(AlignedSegment r)
 
-cdef void clip_sizes(AlignedSegment r, int& left, int& right)
+cdef void clip_sizes(AlignedSegment r, int* left, int* right)
 
-cdef void clip_sizes_hard(AlignedSegment r, int& left, int& right)
+cdef void clip_sizes_hard(AlignedSegment r, int* left, int* right)
 
 cdef int cigar_clip(AlignedSegment r, int clip_length)
 
diff --git a/dysgu/map_set_utils.pyx b/dysgu/map_set_utils.pyx
index 4b69999..e6ab2e0 100644
--- a/dysgu/map_set_utils.pyx
+++ b/dysgu/map_set_utils.pyx
@@ -223,7 +223,7 @@ cdef int cigar_exists(AlignedSegment r):
     return 0
 
 
-cdef void clip_sizes(AlignedSegment r, int& left, int& right):
+cdef void clip_sizes(AlignedSegment r, int* left, int* right):
     cdef uint32_t cigar_value
     cdef uint32_t cigar_l
     cdef uint32_t *cigar_p
@@ -235,14 +235,14 @@ cdef void clip_sizes(AlignedSegment r, int& left, int& right):
     cigar_value = cigar_p[0]
     opp = <int> cigar_value & 15
     if opp == 4:
-        left = <int> cigar_value >> 4
+        left[0] = <int> cigar_value >> 4
     cigar_value = cigar_p[cigar_l - 1]
     opp = <int> cigar_value & 15
     if opp == 4:
-        right = <int> cigar_value >> 4
+        right[0] = <int> cigar_value >> 4
 
 
-cdef void clip_sizes_hard(AlignedSegment r, int& left, int& right):
+cdef void clip_sizes_hard(AlignedSegment r, int* left, int* right):
     cdef uint32_t cigar_value
     cdef uint32_t cigar_l
     cdef uint32_t *cigar_p
@@ -255,11 +255,11 @@ cdef void clip_sizes_hard(AlignedSegment r, int& left, int& right):
     cigar_value = cigar_p[0]
     opp = <int> cigar_value & 15
     if opp == 4 or opp == 5:
-        left = <int> cigar_value >> 4
+        left[0] = <int> cigar_value >> 4
     cigar_value = cigar_p[cigar_l - 1]
     opp = <int> cigar_value & 15
     if opp == 4 or opp == 5:
-        right = <int> cigar_value >> 4
+        right[0] = <int> cigar_value >> 4
 
 
 cdef int cigar_clip(AlignedSegment r, int clip_length):
diff --git a/setup.py b/setup.py
index ee20dde..4fdfa27 100644
--- a/setup.py
+++ b/setup.py
@@ -46,7 +46,6 @@ def get_extra_args():
 extras = get_extra_args() + ["-Wno-sign-compare", "-Wno-unused-function",
                              "-Wno-unused-result", '-Wno-ignored-qualifiers',
                              "-Wno-deprecated-declarations", "-fpermissive",
-                             "-Wno-unreachable-code-fallthrough",
                              ]
 
 def get_extension_modules():

From 5702658354c2554b6017be493561592799336ccc Mon Sep 17 00:00:00 2001
From: kcleal <clealk@cardiff.ac.uk>
Date: Fri, 31 Jan 2025 11:33:22 +0000
Subject: [PATCH 02/14] Fixed compile errors. Add logging for HP tag usage

---
 dysgu/call_component.pyx | 7 ++++---
 dysgu/cluster.pyx        | 3 +++
 dysgu/extra_metrics.pyx  | 2 +-
 dysgu/map_set_utils.pyx  | 2 +-
 4 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/dysgu/call_component.pyx b/dysgu/call_component.pyx
index 71b3a84..39a850c 100644
--- a/dysgu/call_component.pyx
+++ b/dysgu/call_component.pyx
@@ -1459,8 +1459,8 @@ cdef tuple mask_soft_clips(AlignedSegment a, AlignedSegment b):
     cdef int left_clipB = 0
     cdef int right_clipB = 0
 
-    clip_sizes_hard(a, left_clipA, right_clipA)
-    clip_sizes_hard(b, left_clipB, right_clipB)
+    clip_sizes_hard(a, &left_clipA, &right_clipA)
+    clip_sizes_hard(b, &left_clipB, &right_clipB)
 
     cdef int a_template_start = 0
     cdef int b_template_start = 0
@@ -1643,7 +1643,8 @@ cdef call_from_reads(u_reads_info, v_reads_info, int insert_size, int insert_std
 
     cdef AlignedSegment a, b
     cdef uint32_t cigar_l_a, cigar_l_b
-    cdef uint32_t *cigar_p_a, *cigar_p_b
+    cdef uint32_t *cigar_p_a
+    cdef uint32_t *cigar_p_b
 
 
     for qname in [k for k in grp_u if k in grp_v]:  # Qname found on both sides
diff --git a/dysgu/cluster.pyx b/dysgu/cluster.pyx
index ad3aea4..1b21d90 100644
--- a/dysgu/cluster.pyx
+++ b/dysgu/cluster.pyx
@@ -249,6 +249,9 @@ def pipe1(args, infile, kind, regions, ibam, ref_genome, sample_name, bam_iter=N
         sites_index = sites_adder.sites_index
     logging.info("Graph constructed")
 
+    if not args['no_phase'] and hp_tag_found:
+        logging.info("Using HP tag")
+
     auto_support = False
     if args["min_support"] != "auto":
         args["min_support"] = int(args["min_support"])
diff --git a/dysgu/extra_metrics.pyx b/dysgu/extra_metrics.pyx
index 80d7160..aabf72e 100644
--- a/dysgu/extra_metrics.pyx
+++ b/dysgu/extra_metrics.pyx
@@ -95,7 +95,7 @@ cdef float soft_clip_qual_corr(reads):
         quals = r.query_qualities
         left_clip = 0
         right_clip = 0
-        clip_sizes(r, left_clip, right_clip)
+        clip_sizes(r, &left_clip, &right_clip)
 
         if quals is None:
             continue
diff --git a/dysgu/map_set_utils.pyx b/dysgu/map_set_utils.pyx
index e6ab2e0..c882661 100644
--- a/dysgu/map_set_utils.pyx
+++ b/dysgu/map_set_utils.pyx
@@ -269,7 +269,7 @@ cdef int cigar_clip(AlignedSegment r, int clip_length):
         return 0
     cdef int left = 0
     cdef int right = 0
-    clip_sizes(r, left, right)
+    clip_sizes(r, &left, &right)
     if left >= clip_length or right >= clip_length:
         return 1
     return 0

From 634102f3bb75310cdc63a62714aa8f97e3161413 Mon Sep 17 00:00:00 2001
From: kcleal <clealk@cardiff.ac.uk>
Date: Fri, 31 Jan 2025 11:50:35 +0000
Subject: [PATCH 03/14] Added superintervals dependency

---
 pyproject.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pyproject.toml b/pyproject.toml
index 390e8d1..91c0451 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -28,6 +28,7 @@ dependencies = [
     "networkx >= 2.4",
     "scikit-learn >= 0.22",
     "sortedcontainers",
+    "superintervals",
     "lightgbm"
 ]
 

From c4e1484a985aa50d246e8037b23c3d7faf2ff4c8 Mon Sep 17 00:00:00 2001
From: kcleal <clealk@cardiff.ac.uk>
Date: Mon, 24 Feb 2025 13:05:56 +0000
Subject: [PATCH 04/14] Added PSET phaseset HP haplotype tag and AF allele
 frequency tags to output

---
 dysgu/call_component.pyx |  57 +++++---
 dysgu/cluster.pyx        |   2 -
 dysgu/coverage.pyx       |   3 +
 dysgu/io_funcs.pyx       |  47 +++----
 dysgu/map_set_utils.pxd  |   3 +-
 dysgu/map_set_utils.pyx  |  10 +-
 dysgu/post_call.py       | 273 ++++++++++++++++++++++-----------------
 dysgu/view.py            |   3 +
 8 files changed, 223 insertions(+), 175 deletions(-)

diff --git a/dysgu/call_component.pyx b/dysgu/call_component.pyx
index 39a850c..0d5ebcf 100644
--- a/dysgu/call_component.pyx
+++ b/dysgu/call_component.pyx
@@ -119,6 +119,32 @@ cdef base_quals_aligned_clipped(AlignedSegment a):
     return aligned_base_quals, aligned_bases, clipped_base_quals, clipped_bases
 
 
+cdef collect_phase_tags(bint get_hp_tag, AlignedSegment a, EventResult_t er):
+    if not get_hp_tag:
+        return
+    if not a.has_tag("HP"):
+        if er.haplotype is None or 'u' not in er.haplotype:
+            er.haplotype = {'u': 1}
+        else:
+            er.haplotype['u'] += 1
+        return
+    hp_tag = a.get_tag("HP")
+    if not er.haplotype:
+        er.haplotype = {hp_tag: 1}
+    elif hp_tag not in er.haplotype:
+        er.haplotype[hp_tag] = 1
+    else:
+        er.haplotype[hp_tag] += 1
+    if a.has_tag("PS"):
+        pg_tag = a.get_tag("PS")
+        if not er.phase_set:
+            er.phase_set = {pg_tag: 1}
+        elif pg_tag not in er.phase_set:
+            er.phase_set[pg_tag] = 1
+        else:
+            er.phase_set[pg_tag] += 1
+
+
 cdef count_attributes2(reads1, reads2, spanning, float insert_ppf, generic_ins,
                        EventResult_t er, bint paired_end_reads, bint get_hp_tag):
     cdef float NMpri = 0
@@ -145,6 +171,7 @@ cdef count_attributes2(reads1, reads2, spanning, float insert_ppf, generic_ins,
     cdef float a_bases, large_gaps, n_small_gaps
     cdef AlignedSegment a
     for index, a in enumerate(itertools.chain(reads1, reads2, [i.read_a for i in generic_ins])):
+        seen.add(a.qname)
         flag = a.flag
         if flag & 2:
             er.NP += 1
@@ -162,16 +189,11 @@ cdef count_attributes2(reads1, reads2, spanning, float insert_ppf, generic_ins,
             n_sa += a.get_tag("SA").count(";")
         if a.has_tag("XA"):
             n_xa += a.get_tag("XA").count(";")
-        if get_hp_tag and a.has_tag("HP"):
-            hp_tag = a.get_tag("HP")
-            if not er.haplotypes:
-                er.haplotypes = {hp_tag: 1}
-            elif hp_tag not in er.haplotypes:
-                er.haplotypes[hp_tag] = 1
-            else:
-                er.haplotypes[hp_tag] += 1
+
+        collect_phase_tags(get_hp_tag, a, er)
 
         a_bases, large_gaps, n_small_gaps = n_aligned_bases(a)
+
         if a_bases > 0:
             n_gaps += n_small_gaps / a_bases
         if flag & 2304:  # Supplementary (and not primary if -M if flagged using bwa)
@@ -190,7 +212,7 @@ cdef count_attributes2(reads1, reads2, spanning, float insert_ppf, generic_ins,
             total_pri += 1
             MAPQpri += a.mapq
             if paired_end_reads:
-                if index >= len(reads2) and a.qname in paired_end:
+                if index >= len(reads1) and a.qname in paired_end:
                     er.pe += 1
                 else:
                     paired_end.add(a.qname)
@@ -217,6 +239,7 @@ cdef count_attributes2(reads1, reads2, spanning, float insert_ppf, generic_ins,
             clipped_bases += cb
 
     for a in spanning:
+        seen.add(a.qname)
         flag = a.flag
         if flag & 2:
             er.NP += 1
@@ -229,16 +252,10 @@ cdef count_attributes2(reads1, reads2, spanning, float insert_ppf, generic_ins,
             n_sa += a.get_tag("SA").count(";")
         if a.has_tag("XA"):
             n_xa += a.get_tag("XA").count(";")
-        if get_hp_tag and a.has_tag("HP"):
-            hp_tag = a.get_tag("HP")
-            if not er.haplotypes:
-                er.haplotypes = {hp_tag: 1}
-            elif hp_tag not in er.haplotypes:
-                er.haplotypes[hp_tag] = 1
-            else:
-                er.haplotypes[hp_tag] += 1
 
+        collect_phase_tags(get_hp_tag, a, er)
         a_bases, large_gaps, n_small_gaps = n_aligned_bases(a)
+
         if a_bases > 0:
             n_gaps += n_small_gaps / a_bases
         if flag & 2304:  # Supplementary
@@ -257,7 +274,7 @@ cdef count_attributes2(reads1, reads2, spanning, float insert_ppf, generic_ins,
             total_pri += 1
             if paired_end_reads:
                 if a.qname in paired_end:  # If two primary reads from same pair
-                    er.pe += 2
+                    er.pe += 1
                 else:
                     paired_end.add(a.qname)
             if a.has_tag("NM"):
@@ -296,6 +313,8 @@ cdef count_attributes2(reads1, reads2, spanning, float insert_ppf, generic_ins,
     else:
         er.clip_qual_ratio = 0
 
+    er.a_freq += len(seen)
+
 
 cdef int within_read_end_position(int event_pos, CigarItem cigar_item):
     cdef int end
@@ -918,7 +937,7 @@ cdef linear_scan_clustering(spanning, bint hp_tag):
                 hp_count += 1
                 lengths = [s.cigar_item.len for s in hp_spanning]
                 cluster_lengths(clusters, lengths, hp_spanning, eps)
-            # Merge near-identical haplotypes
+            # Merge near-identical haplotype
             if len(clusters) > 1:
                 cl_srt = sorted([ [ np.mean([c.cigar_item.len for c in clst]), clst ] for clst in clusters], key=lambda x: x[0])
                 merged_haps = [cl_srt[0]]
diff --git a/dysgu/cluster.pyx b/dysgu/cluster.pyx
index 1b21d90..6ce56ad 100644
--- a/dysgu/cluster.pyx
+++ b/dysgu/cluster.pyx
@@ -511,8 +511,6 @@ def pipe1(args, infile, kind, regions, ibam, ref_genome, sample_name, bam_iter=N
 
     preliminaries = post_call.get_ref_base(preliminaries, ref_genome, args["symbolic_sv_size"])
 
-    # preliminaries = post_call.filter_microsatellite_non_diploid(preliminaries)
-
     preliminaries = extra_metrics.sample_level_density(preliminaries, regions)
     preliminaries = coverage_analyser.normalize_coverage_values(preliminaries)
 
diff --git a/dysgu/coverage.pyx b/dysgu/coverage.pyx
index 47e0f37..32ca3e5 100644
--- a/dysgu/coverage.pyx
+++ b/dysgu/coverage.pyx
@@ -621,5 +621,8 @@ def get_raw_coverage_information(events, regions, regions_depth, infile, max_cov
         if r.chrA != r.chrB:
             r.svlen = 1000000
         r.mcov = max_depth
+
+        r.a_freq = r.a_freq / reads_10kb
+        r.a_freq = round(max(0, min(r.a_freq, 1.0)), 2)
         new_events.append(r)
     return new_events
diff --git a/dysgu/io_funcs.pyx b/dysgu/io_funcs.pyx
index 4ac0bea..4d52d56 100644
--- a/dysgu/io_funcs.pyx
+++ b/dysgu/io_funcs.pyx
@@ -138,13 +138,13 @@ cpdef list col_names(small_output):
 
     if small_output:
         return ["chrA", "posA", "chrB", "posB", "sample", "event_id", "grp_id", "n_in_grp", "kind", "type", "svtype", "join_type", "cipos95A", "cipos95B", 'contigA', 'contigB', "svlen", "svlen_precise", "rep", "gc",
-          ["GT", "GQ", "MAPQpri", "MAPQsupp", "su", "spanning", "pe", "supp", "sc", "bnd", "NMbase",
+          ["GT", "GQ", "phase_set", "haplotype", "a_freq", "MAPQpri", "MAPQsupp", "su", "spanning", "pe", "supp", "sc", "bnd", "NMbase",
          "raw_reads_10kb", "neigh10kb", "plus",
                 "minus", "remap_score", "remap_ed", "bad_clip_count", "fcc", "inner_cn", "outer_cn", "prob"]
             ]
     else:
         return ["chrA", "posA", "chrB", "posB", "sample", "event_id", "grp_id", "n_in_grp", "kind", "type", "svtype", "join_type", "cipos95A", "cipos95B", 'contigA', 'contigB', "svlen", "svlen_precise",  "rep", "gc",
-          ["GT", "GQ", "NMpri", "NMsupp", "NMbase", "MAPQpri", "MAPQsupp", "NP",
+          ["GT", "GQ", "phase_set", "haplotype", "a_freq",  "NMpri", "NMsupp", "NMbase", "MAPQpri", "MAPQsupp", "NP",
           "maxASsupp",  "su", "spanning", "pe", "supp", "sc", "bnd", "sqc", "scw", "clip_qual_ratio", "block_edge",
          "raw_reads_10kb", "mcov",
           "linked", "neigh", "neigh10kb",  "ref_bases", "plus",
@@ -157,11 +157,6 @@ def make_main_record(r, dysgu_version, index, format_f, df_rows, add_kind, small
     rep, repsc, lenprec = 0, 0, 1
     mean_prob, max_prob = None, None
     debug = False
-    # if abs(r['posA'] - 39084726) < 10:
-    #     echo('found', dict(r))
-    #     echo(len(format_f) > 1)
-    #     echo([(int(v["su"]), v["posA"], v["event_id"], k) for k, v in df_rows.items()])
-    #     debug = True
     if len(format_f) > 1:
         best = sorted([(int(v["su"]), k) for k, v in df_rows.items()], reverse=True)[0][1]
         probs = [v["prob"] for k, v in df_rows.items()]
@@ -273,9 +268,9 @@ def make_main_record(r, dysgu_version, index, format_f, df_rows, add_kind, small
         info_extras += [f"MeanPROB={round(mean_prob, 3)}", f"MaxPROB={round(max_prob, 3)}"]
 
     if small_output:
-        fmt_keys = "GT:GQ:MAPQP:MAPQS:SU:WR:PE:SR:SC:BND:NMB:COV:NEIGH10:PS:MS:RMS:RED:BCC:FCC:ICN:OCN:PROB"
+        fmt_keys = "GT:GQ:PSET:HP:AF:MAPQP:MAPQS:SU:WR:PE:SR:SC:BND:NMB:COV:NEIGH10:PS:MS:RMS:RED:BCC:FCC:ICN:OCN:PROB"
     else:
-        fmt_keys = "GT:GQ:NMP:NMS:NMB:MAPQP:MAPQS:NP:MAS:SU:WR:PE:SR:SC:BND:SQC:SCW:SQR:BE:COV:MCOV:LNK:NEIGH:NEIGH10:RB:PS:MS:SBT:NG:NSA:NXA:NMU:NDC:RMS:RED:BCC:FCC:STL:RAS:FAS:ICN:OCN:CMP:RR:JIT:PROB"
+        fmt_keys = "GT:GQ:PSET:HP:AF:NMP:NMS:NMB:MAPQP:MAPQS:NP:MAS:SU:WR:PE:SR:SC:BND:SQC:SCW:SQR:BE:COV:MCOV:LNK:NEIGH:NEIGH10:RB:PS:MS:SBT:NG:NSA:NXA:NMU:NDC:RMS:RED:BCC:FCC:STL:RAS:FAS:ICN:OCN:CMP:RR:JIT:PROB"
 
     if "variant_seq" in r and isinstance(r["variant_seq"], str):
         if r['svtype'] == "INS" or r.variant_seq:
@@ -318,13 +313,13 @@ def make_main_record(r, dysgu_version, index, format_f, df_rows, add_kind, small
 
 def get_fmt(r, small_output):
     if small_output:
-        v = [r["GT"], r["GQ"], r['MAPQpri'], r['MAPQsupp'], r['su'], r['spanning'], r['pe'], r['supp'], r['sc'], r['bnd'], r['NMbase'],
+        v = [r["GT"], r["GQ"], r["phase_set"], r["haplotype"], r['a_freq'], r['MAPQpri'], r['MAPQsupp'], r['su'], r['spanning'], r['pe'], r['supp'], r['sc'], r['bnd'], r['NMbase'],
              r['raw_reads_10kb'], r['neigh10kb'], r["plus"], r["minus"], r["remap_score"], r["remap_ed"],
              r["bad_clip_count"], round(r["fcc"], 3), round(r["inner_cn"], 3), round(r["outer_cn"], 3), r['prob']
              ]
         return v
     else:
-        v = [r["GT"], r["GQ"], r['NMpri'], r['NMsupp'], r['NMbase'], r['MAPQpri'],
+        v = [r["GT"], r["GQ"],  r["phase_set"], r["haplotype"], r['a_freq'], r['NMpri'], r['NMsupp'], r['NMbase'], r['MAPQpri'],
              r['MAPQsupp'], r['NP'], r['maxASsupp'], r['su'], r['spanning'], r['pe'], r['supp'],
              r['sc'], r['bnd'], round(r['sqc'], 2), round(r['scw'], 1), round(r['clip_qual_ratio'], 3), r['block_edge'], r['raw_reads_10kb'], round(r['mcov'], 2), int(r['linked']), r['neigh'], r['neigh10kb'],
              r['ref_bases'], r["plus"], r["minus"], round(r["strand_binom_t"], 4), r['n_gaps'], round(r["n_sa"], 2),
@@ -407,6 +402,9 @@ def get_header(contig_names=""):
 ##ALT=<ID=TRA,Description="Translocation">
 ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
 ##FORMAT=<ID=GQ,Number=1,Type=Float,Description="Genotype quality phred scaled">
+##FORMAT=<ID=PSET,Number=1,Type=String,Description="Phase-set ID for phased SVs">
+##FORMAT=<ID=HP,Number=1,Type=String,Description="Phased read support HP1[|HP2|...HPn]_unphased. Leading underscore (e.g. _4) indicates all reads unphased. No underscore implies no unphased reads">
+##FORMAT=<ID=AF,Number=1,Type=Float,Description="Allele frequency">
 ##FORMAT=<ID=NMP,Number=1,Type=Float,Description="Mean edit distance for primary alignments supporting the variant">
 ##FORMAT=<ID=NMS,Number=1,Type=Float,Description="Mean edit distance for supplementary alignments supporting the variant">
 ##FORMAT=<ID=NMB,Number=1,Type=Float,Description="Mean basic, edit distance. Gaps >= 30 bp are ignored">
@@ -460,7 +458,6 @@ def get_header(contig_names=""):
 def to_vcf(df, args, names, outfile, show_names=True,  contig_names="", header=None,
            small_output_f=True, sort_output=True, n_fields=None):
 
-
     outfile.write(get_header(contig_names) + "\t" + "\t".join(names) + "\n")
 
     if show_names:
@@ -478,21 +475,19 @@ def to_vcf(df, args, names, outfile, show_names=True,  contig_names="", header=N
 
     count = 0
     recs = []
-    if args is not None:
-        add_kind = args["add_kind"] == "True"
-        if args["metrics"]:
-            small_output_f = False
-        if args["verbosity"] == '0':
-            df["contigA"] = [''] * len(df)
-            df["contigB"] = [''] * len(df)
-        elif args["verbosity"] == '1':
-            has_alt = [True if isinstance(i, str) and i[0] != '<' else False for i in df['variant_seq']]
-            df["contigA"] = ['' if a else c for c, a in zip(df['contigA'], has_alt)]
-            df["contigB"] = ['' if a else c for c, a in zip(df['contigB'], has_alt)]
-
-        n_fields = len(col_names(small_output_f)[-1])
-    else:
+
+    add_kind = args["add_kind"] == "True"
+    if args["metrics"]:
         small_output_f = False
+    if args["verbosity"] == '0':
+        df["contigA"] = [''] * len(df)
+        df["contigB"] = [''] * len(df)
+    elif args["verbosity"] == '1':
+        has_alt = [True if (isinstance(i, str) and len(i) >= 1 and i[0] != '<') else False for i in df['variant_seq']]
+        df["contigA"] = ['' if a else c for c, a in zip(df['contigA'], has_alt)]
+        df["contigB"] = ['' if a else c for c, a in zip(df['contigB'], has_alt)]
+
+    n_fields = len(col_names(small_output_f)[-1])
 
     for idx, r in df.iterrows():
         if idx in seen_idx:
diff --git a/dysgu/map_set_utils.pxd b/dysgu/map_set_utils.pxd
index 866a729..f27962a 100644
--- a/dysgu/map_set_utils.pxd
+++ b/dysgu/map_set_utils.pxd
@@ -297,4 +297,5 @@ cdef class EventResult:
     cdef public bint preciseA, preciseB, linked, modified, remapped
     cdef public int8_t svlen_precise
     cdef public object contig, contig2, contig_cigar, contig2_cigar, svtype, join_type, chrA, chrB, exp_seq, sample, type, \
-        partners, GQ, GT, kind, ref_seq, variant_seq, left_ins_seq, right_ins_seq, site_info, qnames, haplotypes
+        partners, GQ, GT, kind, ref_seq, variant_seq, left_ins_seq, right_ins_seq, site_info
+    cdef public object qnames, haplotype, phase_set, a_freq
diff --git a/dysgu/map_set_utils.pyx b/dysgu/map_set_utils.pyx
index c882661..fec01cc 100644
--- a/dysgu/map_set_utils.pyx
+++ b/dysgu/map_set_utils.pyx
@@ -436,16 +436,8 @@ cdef class EventResult:
         self.n_sa = 0
         self.n_gaps = 0
         self.compress = 0
+        self.a_freq = 0
         self.qnames = set([])
 
     def __repr__(self):
         return str(to_dict(self))
-        # return str(self.to_dict())
-
-    # def __getstate__(self):  # for pickling
-    #     return to_dict(self)
-    #     # return self.to_dict()
-    #
-    # def __setstate__(self, d):
-    #     for k, v in d.items():
-    #         self.__setattr__(k, v)
diff --git a/dysgu/post_call.py b/dysgu/post_call.py
index cdfa231..931389e 100644
--- a/dysgu/post_call.py
+++ b/dysgu/post_call.py
@@ -9,8 +9,8 @@
 import glob
 import gzip
 import pandas as pd
-# import networkx as nx
-# from itertools import combinations
+import networkx as nx
+from itertools import combinations
 pd.options.mode.chained_assignment = None
 from dysgu.scikitbio._ssw_wrapper import StripedSmithWaterman
 from dysgu.consensus import compute_rep
@@ -324,6 +324,57 @@ def median(arr, start, end):
     return -1
 
 
+def get_ref_base(events, ref_genome, symbolic_sv_size):
+    for e in events:
+        if e.posA == 0:
+            e.posA = 1  # !!
+        if not e.ref_seq and not e.svtype == "BND":
+            if symbolic_sv_size == -1 or e.svtype in ("INS", "TRA", "INV"):
+                if e.posA == 0:
+                    e.posA = 1
+                try:
+                    base = ref_genome.fetch(e.chrA, e.posA - 1, e.posA).upper()
+                    e.ref_seq = base
+                except:
+                    pass
+            else:
+                bases = ""
+                if e.svlen < symbolic_sv_size:
+                    start = e.posA
+                    end = e.posB
+                else:
+                    start = e.posA - 1
+                    end - e.posA
+                try:
+                    bases = ref_genome.fetch(e.chrA, start, end).upper()
+                except:
+                    pass
+                if e.svtype == "DEL":
+                    e.ref_seq = bases
+                    e.variant_seq = bases[0] if bases else ""
+                else:
+                    e.variant_seq = bases
+                    e.ref_seq = bases[0] if bases else ""
+
+                #
+                #
+                #
+                #     try:
+                #         bases = ref_genome.fetch(e.chrA, e.posA, e.posB).upper()
+                #
+                #     except:
+                #         pass
+                #     e.ref_seq = bases
+                #     e.variant_seq = bases[0] if len(bases) else ""
+                # else:
+                #     try:
+                #         bases = ref_genome.fetch(e.chrA, e.posA - 1, e.posA).upper()
+                #     except:
+                #         pass
+                #     e.ref_seq = bases
+    return events
+
+
 def nCk(n,k):
     f = math.factorial
     return f(n) // f(k) // f(n-k)
@@ -352,38 +403,6 @@ def strand_binom_t(events):
     return events
 
 
-def get_ref_base(events, ref_genome, symbolic_sv_size):
-    for e in events:
-        if not e.ref_seq and not e.svtype == "BND":
-            if symbolic_sv_size == -1 or e.svtype == "INS" or e.svtype == "TRA":
-                if e.posA == 0:
-                    e.posA = 1
-                try:
-                    base = ref_genome.fetch(e.chrA, e.posA - 1, e.posA).upper()
-                    e.ref_seq = base
-                except:
-                    pass
-            else:
-                if e.svlen < symbolic_sv_size:
-                    if e.posA == 0:
-                        e.posA = 1
-                    try:
-                        bases = ref_genome.fetch(e.chrA, e.posA, e.posB).upper()
-                        e.ref_seq = bases
-                        e.variant_seq = bases[0] if len(bases) else ""
-                    except:
-                        pass
-                else:
-                    if e.posA == 0:
-                        e.posA = 1
-                    try:
-                        base = ref_genome.fetch(e.chrA, e.posA - 1, e.posA).upper()
-                        e.ref_seq = base
-                    except:
-                        pass
-    return events
-
-
 # from svtyper with minor modifications
 # https://github.com/hall-lab/svtyper/blob/master/svtyper/singlesample.py
 # efficient combinatorial function to handle extremely large numbers
@@ -593,33 +612,102 @@ def is_reciprocal_overlapping(x1,x2, y1,y2):
                 e2.GT = "0/1"
 
 
-# def phase_haplotypes(events):
-#     qname_2_eventidx = defaultdict(set)
-#     for idx, e in enumerate(events):
-#         if e.qnames and e.GT.replace('/', '|') != '1|1' and e.GT[0] != '.':
-#             for qname in e.qnames:
-#                 qname_2_eventidx[qname].add(idx)
-#
-#     G = nx.Graph()
-#     for qname, idxs in qname_2_eventidx.items():
-#         for u, v in combinations(idxs, 2):
-#             if G.has_edge(u, v):
-#                 G[u][v]['weight'] += 1
-#             else:
-#                 G.add_edge(u, v, weight=1)
-#     bad_edges = []
-#     for e in G.edges(data=True):
-#         if e[2]['weight'] < 2:
-#             bad_edges.append(e)
-#     G.remove_edges_from(bad_edges)
-#     echo('G', len(G.edges()))
-#     for component in nx.connected_components(G):
-#         echo('component', component)
-#         gts = [events[idx].GT for idx in component]
-#         echo('gts', gts)
-#         echo(list(f'{e.chrA}:{e.posA}({e.svlen})' for e in [events[idx] for idx in component]))
-#         sub = G.subgraph(component)
-#         echo(sub.edges(data=True))
+def low_ps_support(r, support_fraction=0.1):
+    min_support = round(1.5 + support_fraction * r.su)
+    return min_support
+
+
+
+def join_phase_sets(events, ps_id):
+    # Join phase sets if support is greater than threshold
+    G = nx.Graph()
+    for idx, e in enumerate(events):
+        if not e.phase_set:
+            e.phase_set = ''
+            continue
+        if len(e.phase_set) > 1:
+            if e.GT not in '1|11/1':  # only heterozygous variants
+                threshold = low_ps_support(e)
+                passing_ps = [k for k, v in e.phase_set.items() if v >= threshold]
+                e.phase_set = passing_ps
+                if len(passing_ps) > 1:
+                    G.add_edges_from(combinations(passing_ps, 2))
+            else:
+                # Choose PS with maximum support
+                e.phase_set = str(max(e.phase_set, key=e.phase_set.get))
+        else:
+            e.phase_set = str(list(e.phase_set.keys())[0])
+
+    new_phase_set = {}
+    for sub in nx.connected_components(G):
+        for n in sub:
+            new_phase_set[n] = str(ps_id)
+        ps_id += 1
+
+    for e in events:
+        if not e.phase_set or isinstance(e.phase_set, str):
+            continue
+        assert isinstance(e.phase_set, list)
+        new_p = list(set([new_phase_set[n] for n in e.phase_set if n in new_phase_set]))[0]
+        if not new_p:
+            e.phase_set = ''
+        else:
+            e.phase_set = str(new_p)
+
+
+def get_hp_format(events):
+    max_ps = 0
+    any_phase_set = False
+    keys = set([])
+    for e in events:
+        if e.haplotype:
+            for k in e.haplotype:
+                if k != 'u':
+                    keys.add(k)
+    keys = sorted(list(keys))
+    for e in events:
+        if e.haplotype:
+            all_haps = e.haplotype
+            phased_haps = {k: v for k, v in e.haplotype.items() if k != 'u'}
+            n_haps = len(phased_haps)
+            n_phase_set = len(e.phase_set) if e.phase_set else 0
+            if n_phase_set:
+                ps = max(e.phase_set, key=e.phase_set.get)
+                if ps > max_ps:
+                    max_ps = ps
+                any_phase_set = True
+            else:
+                e.phase_set = ""
+            if n_haps >= 2:
+                if e.GT == "1/1":
+                    e.GT = "1|1"
+                else:
+                    best_hap = max(phased_haps, key=phased_haps.get)
+                    if best_hap == 1:
+                        e.GT = "1|0"
+                    else:
+                        e.GT = "0|1"
+            elif n_haps == 1:
+                if 1 in e.haplotype:
+                    e.GT = "1|0"
+                else:
+                    e.GT = "0|1"
+
+            hp_string = ''
+            if n_haps >= 1:
+                unphased = all_haps['u'] if 'u' in all_haps else 0
+                hp_string = '|'.join([str(phased_haps[k]) if k in phased_haps else '0' for k in keys])
+                if unphased:
+                    hp_string += f'_{unphased}'
+                if not hp_string:
+                    hp_string = "."
+
+            e.haplotype = hp_string
+        else:
+            e.haplotype = "."
+
+
+    return max_ps, any_phase_set
 
 
 def get_gt_metric2(events, mode, add_gt=True):
@@ -677,68 +765,17 @@ def get_gt_metric2(events, mode, add_gt=True):
 
     fix_inconsistent_gt(events)
 
-    # any_haps = False
-    for e in events:
-        if e.haplotypes:
-            # any_haps = True
-            n_haps = len(e.haplotypes)
-            if n_haps >= 2:
-                if e.GT == "1/1":
-                    e.GT = "1|1"
-                else:
-                    best_hap = max(e.haplotypes, key=e.haplotypes.get)
-                    if best_hap == 1:
-                        e.GT = "1|0"
-                    else:
-                        e.GT = "0|1"
-            elif n_haps == 1:
-                if 1 in e.haplotypes:
-                    e.GT = "1|0"
-                else:
-                    e.GT = "0|1"
+    max_ps, any_phase_set = get_hp_format(events)
 
-    # phase_haplotypes(events)
+    if any_phase_set:
+        join_phase_sets(events, max_ps + 1)
+    else:
+        for e in events:
+            e.phase_set = "."
 
     return events
 
 
-# def filter_microsatellite_non_diploid(potential):
-#     tmp_list = defaultdict(list)
-#     max_dist = 50
-#     half_d = (max_dist * 0.5)
-#     candidates = []
-#     for idx in range(len(potential)):
-#         ei = potential[idx]
-#         if (ei.svtype != "DEL" and ei.svtype != "INS") or not ei.variant_seq:
-#             continue
-#         rep = compute_rep(ei.variant_seq)
-#         if rep < 0.4:
-#             continue
-#         candidates.append(idx)
-#         tmp_list[ei.chrA].append((ei.posA - ei.cipos95A - max_dist, ei.posA + ei.cipos95A + max_dist, idx))
-#         if ei.chrA == ei.chrB and ei.svlen > half_d:
-#             tmp_list[ei.chrA].append((ei.posB - ei.cipos95B - max_dist, ei.posB + ei.cipos95B + max_dist, idx))
-#
-#     nc2 = {k: iitree(v, add_value=True) for k, v in tmp_list.items()}
-#     seen = set([])
-#     to_drop = set([])
-#     for idx in candidates:
-#         if idx in seen:
-#             continue
-#         ei = potential[idx]
-#         ols = set(nc2[ei.chrA].allOverlappingIntervals(ei.posA, ei.posA + 1))
-#         ols2 = set(nc2[ei.chrB].allOverlappingIntervals(ei.posB, ei.posB + 1))
-#         ols |= ols2
-#         ols.add(idx)
-#         if len(ols) <= 2:
-#             continue
-#         bad = sorted([(i, potential[i]) for i in ols], key=lambda x: x[1].su, reverse=True)[2:]
-#         for jdx, p in bad:
-#             to_drop.add(jdx)
-#         seen |= ols
-#     return [p for i, p in enumerate(potential) if i not in to_drop]
-
-
 def compressability(events):
     for e in events:
         c1 = []
diff --git a/dysgu/view.py b/dysgu/view.py
index 8fdf428..ca04718 100644
--- a/dysgu/view.py
+++ b/dysgu/view.py
@@ -362,6 +362,9 @@ def vcf_to_df(path):
                "JIT": ("jitter", float),
                "LEFT_SVINSSEQ": ("left_ins_seq", str),
                "RIGHT_SVINSSEQ": ("right_ins_seq", str),
+               "PSET": ("phase_set", str),
+               "HP": ("haplotype", str),
+               "AF": ("a_freq", float),
                }
     # df = df[df.posA == 110156314]
     df.rename(columns={k: v[0] for k, v in col_map.items()}, inplace=True)

From 672a90fd1be98273cd41742b3f118adbd2344023 Mon Sep 17 00:00:00 2001
From: kcleal <clealk@cardiff.ac.uk>
Date: Mon, 24 Feb 2025 14:20:02 +0000
Subject: [PATCH 05/14] Fixed index error

---
 dysgu/post_call.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/dysgu/post_call.py b/dysgu/post_call.py
index 931389e..c1119b9 100644
--- a/dysgu/post_call.py
+++ b/dysgu/post_call.py
@@ -648,11 +648,11 @@ def join_phase_sets(events, ps_id):
         if not e.phase_set or isinstance(e.phase_set, str):
             continue
         assert isinstance(e.phase_set, list)
-        new_p = list(set([new_phase_set[n] for n in e.phase_set if n in new_phase_set]))[0]
+        new_p = list(set([new_phase_set[n] for n in e.phase_set if n in new_phase_set]))
         if not new_p:
             e.phase_set = ''
         else:
-            e.phase_set = str(new_p)
+            e.phase_set = str(new_p[0])
 
 
 def get_hp_format(events):

From f86145b17a2eda41d2e6543503338ae643d8750a Mon Sep 17 00:00:00 2001
From: kcleal <clealk@cardiff.ac.uk>
Date: Wed, 26 Feb 2025 10:16:11 +0000
Subject: [PATCH 06/14] Changed empty PSET and HP to 0

---
 README.rst         | 30 ++++++++++++++++++++++--------
 dysgu/post_call.py |  6 +++---
 2 files changed, 25 insertions(+), 11 deletions(-)

diff --git a/README.rst b/README.rst
index 977a918..c79c873 100644
--- a/README.rst
+++ b/README.rst
@@ -42,7 +42,7 @@ for calling structural variants using paired-end or long read sequencing data. S
 ⚙️ Installation
 ---------------
 
-Dysgu can be installed via pip using Python 3.10 - 3.12 and has been tested on linux and MacOS.
+Dysgu can be installed via pip using Python >=3.10 and has been tested on linux and MacOS.
 The list of python packages needed can be found in requirements.txt.
 To install::
 
@@ -79,7 +79,8 @@ For help use::
     dysgu --help
     dysgu command --help
 
-To use the python-API see the `documentation <https://kcleal.github.io/dysgu/API.html>`_, or `jupyter notebook <https://github.com/kcleal/dysgu/blob/master/dysgu_api_demo.ipynb>`_,
+To use the python-API see the `documentation <https://kcleal.github.io/dysgu/API.html>`_,
+or `jupyter notebook <https://github.com/kcleal/dysgu/blob/master/dysgu_api_demo.ipynb>`_,
 
 
 🎯 Calling SVs
@@ -87,13 +88,16 @@ To use the python-API see the `documentation <https://kcleal.github.io/dysgu/API
 
 Paired-end reads
 ****************
-Dysgu was developed to work with paired-end reads aligned using bwa mem, although other aligners will work. To call SVs, a sorted and indexed .bam/cram is needed plus an indexed reference genome (fasta format). Also a working directory must
-be provided to store temporary files. There are a few ways to run dysgu depending on the type of data you have.
+Dysgu was developed to work with paired-end reads aligned using bwa mem, although other aligners will work.
+To call SVs, a sorted and indexed .bam/cram is needed plus an indexed reference genome (fasta format).
+Also a working directory must be provided to store temporary files.
+There are a few ways to run dysgu depending on the type of data you have.
 For paired-end data the `run` command is recommended which wraps `fetch` and `call`::
 
     dysgu run reference.fa temp_dir input.bam > svs.vcf
 
-This will first run the `fetch` command that will create a temporary bam file plus other analysis files in the directory `temp_dir`. These temporary files are then analysed using the `call` program.
+This will first run the `fetch` command that will create a temporary bam file plus other analysis files
+in the directory `temp_dir`. These temporary files are then analysed using the `call` program.
 To make use of multiprocessing, set the "-p" parameter::
 
     dysgu run -p4 reference.fa temp_dir input.bam > svs.vcf
@@ -104,14 +108,24 @@ Dysgu also accepts reads from stdin. In this example, the --clean flag will remo
 
 Long reads
 **********
-Dysgy is designed to work with long reads aligned using minimap2 or ngmlr. Use the 'call' pipeline if starting with a bam file, or 'run' if starting with a cram::
+Dysgy is designed to work with long reads aligned using minimap2 or ngmlr. Use the 'call' pipeline if
+starting with a bam file, or 'run' if starting with a cram::
 
     dysgu call --mode pacbio-revio reference.fa temp_dir input.bam > svs.vcf
 
     dysgu call --mode nanopore-r10 reference.fa temp_dir input.bam > svs.vcf
 
-Presets are available using the `--mode` option for PacBio `pacbio-sequel2 | pacbio-revio`, and ONT `nanopore-r9 | nanopore-r10`
-If you are using using reads with higher error rates, or are unsure of the read-accuracy, it is recommended to set '--divergence auto', to infer a more conservative sequence divergence. The default is set at 0.02 which can be too stringent for lower accuracy reads and will result in more reads being filtered and lower sensitivity::
+Presets are available using the `--mode` option for PacBio `pacbio-sequel2 | pacbio-revio`,
+and ONT `nanopore-r9 | nanopore-r10`.
+
+Since v1.8, higher calling accuracy can be achieved by adding "haplotags" to your input bam/cram file
+(using hiphase / whatshap / longshot etc). Dysgu reads HP and PS tags automatically and produces
+phased output calls.
+
+If you are using using reads with higher error rates, or are unsure of the read-accuracy,
+it is recommended to set '--divergence auto', to infer a more conservative sequence divergence.
+The default is set at 0.02 which can be too stringent for lower accuracy reads and will result in
+more reads being filtered and lower sensitivity::
 
     dysgu call --divergence auto --mode nanopore reference.fa temp_dir input.bam > svs.vcf
 
diff --git a/dysgu/post_call.py b/dysgu/post_call.py
index c1119b9..945c7f8 100644
--- a/dysgu/post_call.py
+++ b/dysgu/post_call.py
@@ -700,11 +700,11 @@ def get_hp_format(events):
                 if unphased:
                     hp_string += f'_{unphased}'
                 if not hp_string:
-                    hp_string = "."
+                    hp_string = "0"
 
             e.haplotype = hp_string
         else:
-            e.haplotype = "."
+            e.haplotype = "0"
 
 
     return max_ps, any_phase_set
@@ -771,7 +771,7 @@ def get_gt_metric2(events, mode, add_gt=True):
         join_phase_sets(events, max_ps + 1)
     else:
         for e in events:
-            e.phase_set = "."
+            e.phase_set = "-1"
 
     return events
 

From 47d642cd6d9935541f8d7390f20a92e5722c4788 Mon Sep 17 00:00:00 2001
From: kcleal <clealk@cardiff.ac.uk>
Date: Wed, 26 Feb 2025 10:20:37 +0000
Subject: [PATCH 07/14] PSET and HP default to empty string

---
 dysgu/post_call.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/dysgu/post_call.py b/dysgu/post_call.py
index 945c7f8..d400d10 100644
--- a/dysgu/post_call.py
+++ b/dysgu/post_call.py
@@ -700,11 +700,11 @@ def get_hp_format(events):
                 if unphased:
                     hp_string += f'_{unphased}'
                 if not hp_string:
-                    hp_string = "0"
+                    hp_string = ""
 
             e.haplotype = hp_string
         else:
-            e.haplotype = "0"
+            e.haplotype = ""
 
 
     return max_ps, any_phase_set
@@ -771,7 +771,7 @@ def get_gt_metric2(events, mode, add_gt=True):
         join_phase_sets(events, max_ps + 1)
     else:
         for e in events:
-            e.phase_set = "-1"
+            e.phase_set = ""
 
     return events
 

From de5512f1527ffb55660205d8d2efbad936664ff4 Mon Sep 17 00:00:00 2001
From: kcleal <clealk@cardiff.ac.uk>
Date: Wed, 26 Feb 2025 10:22:08 +0000
Subject: [PATCH 08/14] PSET and HP default to '-1' and '0'

---
 dysgu/post_call.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/dysgu/post_call.py b/dysgu/post_call.py
index d400d10..945c7f8 100644
--- a/dysgu/post_call.py
+++ b/dysgu/post_call.py
@@ -700,11 +700,11 @@ def get_hp_format(events):
                 if unphased:
                     hp_string += f'_{unphased}'
                 if not hp_string:
-                    hp_string = ""
+                    hp_string = "0"
 
             e.haplotype = hp_string
         else:
-            e.haplotype = ""
+            e.haplotype = "0"
 
 
     return max_ps, any_phase_set
@@ -771,7 +771,7 @@ def get_gt_metric2(events, mode, add_gt=True):
         join_phase_sets(events, max_ps + 1)
     else:
         for e in events:
-            e.phase_set = ""
+            e.phase_set = "-1"
 
     return events
 

From f296b8452c7b20dbffd69c02aef1a55291614c8e Mon Sep 17 00:00:00 2001
From: kcleal <clealk@cardiff.ac.uk>
Date: Tue, 4 Mar 2025 15:18:14 +0000
Subject: [PATCH 09/14] #125 better calling of low support SVs

---
 dysgu/call_component.pyx |  64 ++++-------
 dysgu/cluster.pyx        |   2 +-
 dysgu/coverage.pyx       |  62 +++--------
 dysgu/graph.pyx          | 234 +++++++++++++++++++--------------------
 dysgu/post_call.py       |   9 +-
 pyproject.toml           |   2 +-
 6 files changed, 155 insertions(+), 218 deletions(-)

diff --git a/dysgu/call_component.pyx b/dysgu/call_component.pyx
index 0d5ebcf..40c259b 100644
--- a/dysgu/call_component.pyx
+++ b/dysgu/call_component.pyx
@@ -672,7 +672,6 @@ cdef make_single_call(sub_informative, insert_size, insert_stdev, insert_ppf, mi
     as2 = None
     ref_bases = 0
     if to_assemble or len(spanning_alignments) > 0:
-        # echo('MAKE SINGLE CALL')
         if er.preciseA:
             as1 = consensus.base_assemble(u_reads, er.posA, 500)
             if as1 and (er.svtype != "TRA" or (as1['contig'] and (as1['contig'][0].islower() or as1['contig'][-1].islower()))):
@@ -965,7 +964,6 @@ cdef linear_scan_clustering(spanning, bint hp_tag):
 def process_spanning(bint paired_end, spanning_alignments, float divergence, length_extend, informative,
                      generic_insertions, float insert_ppf, bint to_assemble, bint hp_tag):
 
-    # echo("PROCESS SPANNING")
     cdef int min_found_support = 0
     cdef str svtype, jointype
     cdef bint passed
@@ -2055,33 +2053,10 @@ cdef list multi(data, bam, int insert_size, int insert_stdev, float insert_ppf,
                 seen.remove(u)
             if v in seen:
                 seen.remove(v)
-
             events += one_edge(rd_u, rd_v, clip_length, insert_size, insert_stdev, insert_ppf, min_support, 1,
                                assemble_contigs,
                                sites_info, paired_end, hp_tag)
 
-            # finds reads that should be a single partition
-            # u_reads, v_reads, u_single, v_single = filter_single_partitions(rd_u, rd_v)
-            # if len(u_reads) > 0 and len(v_reads) > 0:
-            #     events += one_edge(rd_u, rd_v, clip_length, insert_size, insert_stdev, insert_ppf, min_support, 1, assemble_contigs,
-            #                        sites_info, paired_end)
-            # if u_single:
-            #     res = single(u_single, insert_size, insert_stdev, insert_ppf, clip_length, min_support, assemble_contigs,
-            #                  sites_info, paired_end, length_extend, divergence)
-            #     if res:
-            #         if isinstance(res, EventResult):
-            #             events.append(res)
-            #         else:
-            #             events += res
-            # if v_single:
-            #     res = single(v_single, insert_size, insert_stdev, insert_ppf, clip_length, min_support, assemble_contigs,
-            #                  sites_info, paired_end, length_extend, divergence)
-            #     if res:
-            #         if isinstance(res, EventResult):
-            #             events.append(res)
-            #         else:
-            #             events += res
-
     # Process any singles / unconnected blocks
     if seen:
         for part_key in seen:
@@ -2101,23 +2076,28 @@ cdef list multi(data, bam, int insert_size, int insert_stdev, float insert_ppf,
                         events += res
 
     # Check for events within clustered nodes
-    if data.s_within:
-        for k, d in data.s_within:  #.items():
-            o_count = out_counts[k]
-            i_counts = len(d)
-            if i_counts > max_single_size:
-                continue
-            if o_count > 0 and i_counts > (2*min_support) and i_counts > o_count:
-                rds = get_reads(bam, d, data.reads, data.n2n, 0, info)
-                if len(rds) < lower_bound_support or (len(sites_info) != 0 and len(rds) == 0):
-                        continue
-                res = single(rds, insert_size, insert_stdev, insert_ppf, clip_length, min_support, assemble_contigs,
-                             sites_info, paired_end, length_extend, divergence, hp_tag)
-                if res:
-                    if isinstance(res, EventResult):
-                        events.append(res)
-                    else:
-                        events += res
+    # if data.s_within:
+    #     for k, d in data.s_within:
+    #         o_count = out_counts[k]
+    #         i_counts = len(d)
+    #
+    #         if i_counts > max_single_size:
+    #             continue
+    #         # if o_count > 0 and i_counts >= (2*min_support) and i_counts > o_count:
+    #         # if o_count > 0 or i_counts > 0:
+    #         rds = get_reads(bam, d, data.reads, data.n2n, 0, info)
+    #         if len(rds) < lower_bound_support or (len(sites_info) != 0 and len(rds) == 0):
+    #             continue
+    #         echo('Calling single 1')
+    #         res = single(rds, insert_size, insert_stdev, insert_ppf, clip_length, min_support, assemble_contigs,
+    #                      sites_info, paired_end, length_extend, divergence, hp_tag)
+    #         for r in res:
+    #             echo(r.posA, r.posB, r.svlen)
+    #         if res:
+    #             if isinstance(res, EventResult):
+    #                 events.append(res)
+    #             else:
+    #                 events += res
     return events
 
 
diff --git a/dysgu/cluster.pyx b/dysgu/cluster.pyx
index 6ce56ad..dd9b56c 100644
--- a/dysgu/cluster.pyx
+++ b/dysgu/cluster.pyx
@@ -273,7 +273,7 @@ def pipe1(args, infile, kind, regions, ibam, ref_genome, sample_name, bam_iter=N
     if args["pl"] == "pe":  # reads with internal SVs can be detected at lower support
         lower_bound_support = min_support - 1 if min_support - 1 > 1 else 1
     else:
-        lower_bound_support = 2
+        lower_bound_support = min_support
 
     component_path = f"{tdir}/components.bin"
     cdef bytes cmp_file = component_path.encode("ascii")  # write components to file if low-mem used
diff --git a/dysgu/coverage.pyx b/dysgu/coverage.pyx
index 32ca3e5..1df5430 100644
--- a/dysgu/coverage.pyx
+++ b/dysgu/coverage.pyx
@@ -112,9 +112,6 @@ cdef class GenomeScanner:
         self.procs = read_threads
         self.buff_size = buffer_size
         self.current_bin = []
-        self.current_cov = 0
-        self.current_chrom = 0
-        self.current_pos = 0
         self.current_cov_array = None
         self.reads_dropped = 0
         self.depth_d = {}
@@ -152,10 +149,10 @@ cdef class GenomeScanner:
 
                     if aln.flag & 1284 or aln.mapq < mq_thresh or cigar_l == 0:  # not primary, duplicate or unmapped?
                         continue
-                    self._add_to_bin_buffer(aln, tell)
+                    yield aln, tell
                     tell = 0 if self.no_tell else self.input_bam.tell()
-                    while len(self.staged_reads) > 0:
-                        yield self.staged_reads.popleft()
+                    # yield aln, tell
+
             # when 'call' command was run only. generate coverage track here
             else:
                 self.cpp_cov_track.set_max_cov(self.max_cov)
@@ -167,7 +164,7 @@ cdef class GenomeScanner:
 
                     cigar_l = aln._delegate.core.n_cigar
                     if cigar_l == 0 or aln.flag & 1284:
-                        tell = 0 if self.no_tell else self.input_bam.tell()
+                        # tell = 0 if self.no_tell else self.input_bam.tell()
                         continue
 
                     if aln.rname != self.current_tid:
@@ -203,18 +200,14 @@ cdef class GenomeScanner:
                             index_start += length
 
                     if aln.mapq < mq_thresh or not good_read or not self.cpp_cov_track.cov_val_good(self.current_tid, aln.rname, pos):
-                        tell = 0 if self.no_tell else self.input_bam.tell()
                         continue
-
-                    self._add_to_bin_buffer(aln, tell)
+                    yield aln, tell
                     tell = 0 if self.no_tell else self.input_bam.tell()
-                    while len(self.staged_reads) > 0:
-                        yield self.staged_reads.popleft()
+                    # yield aln, tell
+
                 if self.current_tid != -1 and self.current_tid <= self.input_bam.nreferences:
                     out_path = "{}/{}.dysgu_chrom.bin".format(self.cov_track_path, self.input_bam.get_reference_name(self.current_tid)).encode("ascii")
                     self.cpp_cov_track.write_track(out_path)
-            if len(self.current_bin) > 0:
-                yield self.current_bin
 
         else:
             logging.info("Searching --regions and collecting reads from mate-pair locations")
@@ -291,16 +284,13 @@ cdef class GenomeScanner:
                         continue
                     if not self.cpp_cov_track.cov_val_good(self.current_tid, aln.rname, pos):
                         continue
-                    self._add_to_bin_buffer(aln, tell)
+                    yield aln, tell
                     tell = 0 if self.no_tell else self.input_bam.tell()
-                    while len(self.staged_reads) > 0:
-                        yield self.staged_reads.popleft()
+                    # yield aln, tell
 
                 if self.current_tid != -1 and self.current_tid <= self.input_bam.nreferences:
                     out_path = "{}/{}.dysgu_chrom.bin".format(self.cov_track_path, self.input_bam.get_reference_name(self.current_tid)).encode("ascii")
                     self.cpp_cov_track.write_track(out_path)
-                if len(self.current_bin) > 0:
-                    yield self.current_bin
 
 
     def get_read_properties(self, int max_tlen, int insert_median, int insert_stdev, int read_len, ibam=None, find_divergence=False):
@@ -352,7 +342,8 @@ cdef class GenomeScanner:
                     continue
                 tell = 0 if self.no_tell else self.input_bam.tell()
                 if self.no_tell:
-                    self._add_to_bin_buffer(a, tell)
+                    self.staged_reads.append((a, tell))
+                    # self._add_to_bin_buffer(a, tell)
                     if a.rname != self.current_tid:
                         if self.current_tid != -1 and self.current_tid <= self.input_bam.nreferences:
                             out_path = "{}/{}.dysgu_chrom.bin".format(self.cov_track_path.outpath, self.input_bam.get_reference_name(self.current_tid)).encode("ascii")
@@ -447,7 +438,7 @@ cdef class GenomeScanner:
         # Read the rest of the genome, reads are sent in blocks
         cdef int total_reads = 0
         for staged in self._get_reads():
-            total_reads += len(staged)
+            total_reads += 1  # len(staged)
             yield staged
         if total_reads == 0:
             logging.critical("No reads found, finishing")
@@ -464,33 +455,6 @@ cdef class GenomeScanner:
         elif self.no_tell:
             raise BufferError("Read buffer has overflowed, increase --buffer-size")
 
-    def _add_to_bin_buffer(self, AlignedSegment a, tell):
-        # Calculates coverage information on fly, drops high coverage regions, buffers reads
-        cdef int flag = a.flag
-        cdef uint32_t cigar_l = a._delegate.core.n_cigar
-        if flag & 1540 or cigar_l == 0: # or a.seq is None:
-            return
-        cdef int rname = a.rname
-        cdef int apos = a.pos
-        cdef int bin_pos = int(apos / 100)
-        cdef int ref_length
-        cdef str reference_name = ""
-        cdef int aend = a.reference_end
-        cdef float current_coverage
-        if self.current_chrom != rname:
-            self.current_chrom = rname
-        # in_roi = False
-        # if self.overlap_regions:
-        #     in_roi = intersecter(self.overlap_regions, a.rname, apos, apos+1)
-        if rname == self.current_chrom and bin_pos == self.current_pos:
-            self.current_bin.append((a, tell))
-        else:
-            if len(self.current_bin) != 0:
-                self.staged_reads.append(self.current_bin)
-            self.current_chrom = rname
-            self.current_pos = bin_pos
-            self.current_bin = [(a, tell)]
-
 
 cdef float add_coverage(int start, int end, DTYPE_t[:] chrom_depth) nogil:
     cdef float fs = start / 100
@@ -622,7 +586,7 @@ def get_raw_coverage_information(events, regions, regions_depth, infile, max_cov
             r.svlen = 1000000
         r.mcov = max_depth
 
-        r.a_freq = r.a_freq / reads_10kb
+        r.a_freq = r.a_freq / (reads_10kb + 1e-5)
         r.a_freq = round(max(0, min(r.a_freq, 1.0)), 2)
         new_events.append(r)
     return new_events
diff --git a/dysgu/graph.pyx b/dysgu/graph.pyx
index cd0c984..2f0353d 100644
--- a/dysgu/graph.pyx
+++ b/dysgu/graph.pyx
@@ -827,7 +827,6 @@ cdef void add_to_graph(Py_SimpleGraph G, AlignedSegment r, PairedEndScoper_t pe_
         pe_scope.find_other_nodes(node_name, chrom, event_pos, chrom2, pos2, read_enum, length_from_cigar,
                                   trust_ins_len)
 
-        # other_nodes = pe_scope.find_other_nodes(node_name, chrom, event_pos, chrom2, pos2, read_enum, length_from_cigar, trust_ins_len)
         if not dereference(found_nodes_exact).empty():
             for other_node in dereference(found_nodes_exact):
                 if node_to_name.same_template(node_name, other_node):
@@ -1289,144 +1288,135 @@ cpdef tuple construct_graph(genome_scanner, infile, int max_dist, int clustering
     cdef int n_checked_for_hp_tag = 0
     if no_phase:
         n_checked_for_hp_tag = 10_001
-    for chunk in genome_scanner.iter_genome():
-        for r, tell in chunk:
-            if r.mapq < mapq_thresh:
+
+    for r, tell in genome_scanner.iter_genome():
+        if r.mapq < mapq_thresh:
+            continue
+        pos2 = -1
+        event_pos = r.pos
+        added = False
+        clipped = 0
+        events_to_add.clear()
+        cigar_l = r._delegate.core.n_cigar
+        cigar_p = bam_get_cigar(r._delegate)
+        if not paired_end:
+            if n_checked_for_hp_tag < 10_000:
+                n_checked_for_hp_tag += 1
+                if r.has_tag("HP"):
+                    hp_tag_found = <bint>True
+                    n_checked_for_hp_tag = 10_001
+            if r.has_tag("NM") and edit_distance_too_high(cigar_p, cigar_l, max_divergence, r.get_tag("NM")):
                 continue
-            pos2 = -1
-            event_pos = r.pos
-            added = False
-            clipped = 0
-            events_to_add.clear()
-            cigar_l = r._delegate.core.n_cigar
-            cigar_p = bam_get_cigar(r._delegate)
-            if not paired_end:
-                if n_checked_for_hp_tag < 10_000:
-                    n_checked_for_hp_tag += 1
-                    if r.has_tag("HP"):
-                        hp_tag_found = <bint>True
-                        n_checked_for_hp_tag = 10_001
-                if r.has_tag("NM") and edit_distance_too_high(cigar_p, cigar_l, max_divergence, r.get_tag("NM")):
+
+        if cigar_l > 1:
+            if r.has_tag("SA"):
+                # Set cigar-index to -1 means it is unset, will be determined during SA parsing
+                cigar_index = -1
+                process_alignment(G, r, clip_l, max_dist, gettid,
+                                  overlap_regions, clustering_dist, pe_scope,
+                                  cigar_index, event_pos, paired_end, tell, genome_scanner,
+                                  template_edges, node_to_name, pos2, mapq_thresh, clip_scope, ReadEnum_t.SPLIT,
+                                  bad_clip_counter, mm_only, site_adder, 0, trust_ins_len)
+                added = True
+            for cigar_index in range(cigar_l):
+                cigar_value = cigar_p[cigar_index]
+                opp = <int> cigar_value & 15
+                length = <int> cigar_value >> 4
+
+                if opp == 0 or opp == 7 or opp == 8:
+                    if find_n_aligned_bases:
+                        n_aligned_bases += length
+                    event_pos += length
                     continue
 
-            if cigar_l > 1:
-                if r.has_tag("SA"):
-                    # Set cigar-index to -1 means it is unset, will be determined during SA parsing
-                    cigar_index = -1
+                if opp == 1:
+                    if length >= min_sv_size:
+                        pos2 = event_pos + length
+                        events_to_add.push_back(make_cigar_event(opp, cigar_index, event_pos, pos2, length, ReadEnum_t.INSERTION))
+                        added = True
+                elif opp == 2:
+                    if length >= min_sv_size:
+                        pos2 = event_pos + length
+                        events_to_add.push_back(make_cigar_event(opp, cigar_index, event_pos, pos2, length, ReadEnum_t.DELETION))
+                        added = True
+                    event_pos += length
+                elif opp == 3:
+                    if length >= min_sv_size:
+                        pos2 = event_pos + length
+                        events_to_add.push_back(make_cigar_event(opp, cigar_index, event_pos, pos2, length, ReadEnum_t.SKIP))
+                        added = True
+                    event_pos += length
+                else:
+                    if opp != 4 and opp != 5:
+                        event_pos += length
+                    elif opp == 4 and length >= clip_l:
+                        clipped = 1
+
+        if (paired_end and not added and contigs) or not paired_end:
+            # Whole alignment will be used, try infer position from soft-clip
+            cigar_index = -1
+            pos2 = -1
+            left_clip_size = 0
+            right_clip_size = 0
+            clip_sizes_hard(r, &left_clip_size, &right_clip_size)  # soft and hard-clips
+            if r.flag & 8 and clipped:  # paired by inference
+                # skip if both ends are clipped, usually means its a chunk of badly mapped sequence
+                # if not (left_clip_size and right_clip_size) and ((paired_end and good_quality_clip(r, 20)) or (not paired_end and) ):
+                if not (left_clip_size and right_clip_size) and good_quality_clip(r, 20):
+                    # Mate is unmapped, insertion type. Only add if soft-clip is available
                     process_alignment(G, r, clip_l, max_dist, gettid,
                                       overlap_regions, clustering_dist, pe_scope,
                                       cigar_index, event_pos, paired_end, tell, genome_scanner,
-                                      template_edges, node_to_name, pos2, mapq_thresh, clip_scope, ReadEnum_t.SPLIT,
-                                      bad_clip_counter, mm_only, site_adder, 0, trust_ins_len)
-                    added = True
-                for cigar_index in range(cigar_l):
-                    cigar_value = cigar_p[cigar_index]
-                    opp = <int> cigar_value & 15
-                    length = <int> cigar_value >> 4
-
-                    if opp == 0 or opp == 7 or opp == 8:
-                        if find_n_aligned_bases:
-                            n_aligned_bases += length
-                        event_pos += length
-                        continue
-
-                    if opp == 1:
-                        if length >= min_sv_size:
-                            # Insertion type
-                            # if event_pos < 998000 or event_pos > 999000:
-                            #     continue
-                            pos2 = event_pos + length
-                            # if not events_to_add.empty() and event_pos - events_to_add.back().event_pos < elide_threshold and events_to_add.back().read_enum == ReadEnum_t.INSERTION:
-                            #     continue
-                            # else:
-                            events_to_add.push_back(make_cigar_event(opp, cigar_index, event_pos, pos2, length, ReadEnum_t.INSERTION))
-                            added = True
-                    elif opp == 2:
-                        if length >= min_sv_size:
-                            pos2 = event_pos + length
-                            # if not events_to_add.empty() and event_pos - events_to_add.back().event_pos < elide_threshold and events_to_add.back().read_enum == ReadEnum_t.DELETION:
-                            #     continue
-                            # else:
-                            events_to_add.push_back(make_cigar_event(opp, cigar_index, event_pos, pos2, length, ReadEnum_t.DELETION))
-                            added = True
-                        event_pos += length
-                    elif opp == 3:
-                        if length >= min_sv_size:
-                            pos2 = event_pos + length
-                            events_to_add.push_back(make_cigar_event(opp, cigar_index, event_pos, pos2, length, ReadEnum_t.SKIP))
-                            added = True
-                        event_pos += length
-                    else:
-                        if opp != 4 and opp != 5:
-                            event_pos += length
-                        elif opp == 4 and length >= clip_l:
-                            clipped = 1
-
-            if (paired_end and not added and contigs) or not paired_end:
-                # Whole alignment will be used, try infer position from soft-clip
-                cigar_index = -1
-                pos2 = -1
-                left_clip_size = 0
-                right_clip_size = 0
-                clip_sizes_hard(r, &left_clip_size, &right_clip_size)  # soft and hard-clips
-                if r.flag & 8 and clipped:  # paired by inference
-                    # skip if both ends are clipped, usually means its a chunk of badly mapped sequence
-                    # if not (left_clip_size and right_clip_size) and ((paired_end and good_quality_clip(r, 20)) or (not paired_end and) ):
-                    if not (left_clip_size and right_clip_size) and good_quality_clip(r, 20):
-                        # Mate is unmapped, insertion type. Only add if soft-clip is available
+                                      template_edges, node_to_name,
+                                      pos2, mapq_thresh, clip_scope, ReadEnum_t.BREAKEND, bad_clip_counter,
+                                      mm_only, site_adder, 0, trust_ins_len)
+            else:
+                # Use whole read, could be normal or discordant
+                if not paired_end:
+                    if max(left_clip_size, right_clip_size) > 250:
+                        read_enum = ReadEnum_t.BREAKEND
+                        if left_clip_size > right_clip_size:
+                            event_pos = r.pos  # else reference_end is used
                         process_alignment(G, r, clip_l, max_dist, gettid,
                                           overlap_regions, clustering_dist, pe_scope,
                                           cigar_index, event_pos, paired_end, tell, genome_scanner,
                                           template_edges, node_to_name,
-                                          pos2, mapq_thresh, clip_scope, ReadEnum_t.BREAKEND, bad_clip_counter,
+                                          pos2, mapq_thresh, clip_scope, read_enum, bad_clip_counter,
                                           mm_only, site_adder, 0, trust_ins_len)
-                else:
-                    # Use whole read, could be normal or discordant
-                    if not paired_end:
-                        if max(left_clip_size, right_clip_size) > 250:
-                            read_enum = ReadEnum_t.BREAKEND
-                            if left_clip_size > right_clip_size:
-                                event_pos = r.pos  # else reference_end is used
-                            process_alignment(G, r, clip_l, max_dist, gettid,
-                                              overlap_regions, clustering_dist, pe_scope,
-                                              cigar_index, event_pos, paired_end, tell, genome_scanner,
-                                              template_edges, node_to_name,
-                                              pos2, mapq_thresh, clip_scope, read_enum, bad_clip_counter,
-                                              mm_only, site_adder, 0, trust_ins_len)
 
+                else:
+                    if r.flag & 2 and abs(r.tlen) < max_dist and r.rname == r.rnext:
+                        if not clipped:
+                            continue
+                        read_enum = ReadEnum_t.BREAKEND
                     else:
-                        if r.flag & 2 and abs(r.tlen) < max_dist and r.rname == r.rnext:
-                            if not clipped:
-                                continue
-                            read_enum = ReadEnum_t.BREAKEND
-                        else:
-                            read_enum = ReadEnum_t.DISCORDANT
+                        read_enum = ReadEnum_t.DISCORDANT
 
-                        if left_clip_size or right_clip_size:
-                            if left_clip_size > right_clip_size:
-                                event_pos = r.pos  # else reference_end is used
-                        process_alignment(G, r, clip_l, max_dist, gettid,
-                                          overlap_regions, clustering_dist, pe_scope,
-                                          cigar_index, event_pos, paired_end, tell, genome_scanner,
-                                          template_edges, node_to_name,
-                                          pos2, mapq_thresh, clip_scope, read_enum, bad_clip_counter,
-                                          mm_only, site_adder, 0, trust_ins_len)
-            # process within-read events
-            if not events_to_add.empty():
-                itr_events = events_to_add.begin()
-                while itr_events != events_to_add.end():
-                    v = dereference(itr_events)
-                    if v.cigar_skip:
-                        pos2 = v.pos2
-                    else:
-                        pos2 = v.event_pos + v.length  # fall back on original cigar event length
+                    if left_clip_size or right_clip_size:
+                        if left_clip_size > right_clip_size:
+                            event_pos = r.pos  # else reference_end is used
                     process_alignment(G, r, clip_l, max_dist, gettid,
                                       overlap_regions, clustering_dist, pe_scope,
-                                      v.cigar_index, v.event_pos, paired_end, tell, genome_scanner,
+                                      cigar_index, event_pos, paired_end, tell, genome_scanner,
                                       template_edges, node_to_name,
-                                      v.pos2, mapq_thresh, clip_scope, v.read_enum, bad_clip_counter,
-                                      mm_only, site_adder, v.length, trust_ins_len)
-                    preincrement(itr_events)
+                                      pos2, mapq_thresh, clip_scope, read_enum, bad_clip_counter,
+                                      mm_only, site_adder, 0, trust_ins_len)
+        # process within-read events
+        if not events_to_add.empty():
+            itr_events = events_to_add.begin()
+            while itr_events != events_to_add.end():
+                v = dereference(itr_events)
+                if v.cigar_skip:
+                    pos2 = v.pos2
+                else:
+                    pos2 = v.event_pos + v.length  # fall back on original cigar event length
+                process_alignment(G, r, clip_l, max_dist, gettid,
+                                  overlap_regions, clustering_dist, pe_scope,
+                                  v.cigar_index, v.event_pos, paired_end, tell, genome_scanner,
+                                  template_edges, node_to_name,
+                                  v.pos2, mapq_thresh, clip_scope, v.read_enum, bad_clip_counter,
+                                  mm_only, site_adder, v.length, trust_ins_len)
+                preincrement(itr_events)
 
     add_template_edges(G, template_edges)
     if site_adder:
diff --git a/dysgu/post_call.py b/dysgu/post_call.py
index 945c7f8..4ae45cb 100644
--- a/dysgu/post_call.py
+++ b/dysgu/post_call.py
@@ -2,7 +2,7 @@
 import numpy as np
 from dysgu.map_set_utils import echo
 from dysgu import re_map
-from dysgu.io_funcs import reverse_complement, intersecter #, iitree
+from dysgu.io_funcs import reverse_complement, intersecter
 import zlib
 import math
 import pickle
@@ -17,8 +17,11 @@
 from collections import defaultdict, Counter
 import os
 import warnings
-from sklearn.exceptions import InconsistentVersionWarning
-warnings.filterwarnings(action='ignore', category=InconsistentVersionWarning)
+try:
+    from sklearn.exceptions import InconsistentVersionWarning
+    warnings.filterwarnings(action='ignore', category=InconsistentVersionWarning)
+except ImportError:
+    pass
 
 
 def get_badclip_metric(events, bad_clip_counter, bam, regions):
diff --git a/pyproject.toml b/pyproject.toml
index 91c0451..13eaf63 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -10,7 +10,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "dysgu"
-version = "1.8.0"
+version = "1.8.1"
 description = "Structural variant calling"
 authors = [
     { name = "Kez Cleal", email = "clealk@cardiff.ac.uk" }

From 8713824fccb15ff44e8d943b8af257b8f4a3d3fb Mon Sep 17 00:00:00 2001
From: kcleal <clealk@cardiff.ac.uk>
Date: Tue, 4 Mar 2025 15:26:16 +0000
Subject: [PATCH 10/14] #125 better calling of low support SVs

---
 .github/workflows/main.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index bf6a5fa..bdf396d 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -10,7 +10,7 @@ jobs:
     runs-on: ${{ matrix.os }}
     strategy:
       matrix:
-        os: [ubuntu-20.04, macOS-14]
+        os: [ubuntu-22.04, macOS-14]
     steps:
       - uses: actions/checkout@v4
       - name: Set correct paths for Homebrew on macOS

From 02d46b90cf8b7ac97bfa7efe5a58f0cd3083aa4f Mon Sep 17 00:00:00 2001
From: kcleal <clealk@cardiff.ac.uk>
Date: Tue, 4 Mar 2025 16:06:57 +0000
Subject: [PATCH 11/14] Updated requirements.txt

---
 pyproject.toml   | 2 +-
 requirements.txt | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 13eaf63..7c50a01 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -28,7 +28,7 @@ dependencies = [
     "networkx >= 2.4",
     "scikit-learn >= 0.22",
     "sortedcontainers",
-    "superintervals",
+    "superintervals >= 0.2.10",
     "lightgbm"
 ]
 
diff --git a/requirements.txt b/requirements.txt
index 96ee1eb..9d57a6b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -7,5 +7,5 @@ pandas
 pysam>=0.22
 networkx>=2.4
 scikit-learn>=0.22
-superintervals
+superintervals>=0.2.10
 lightgbm

From 27e28507bfbfdfddcd3e272c574b835abf865ac8 Mon Sep 17 00:00:00 2001
From: kcleal <clealk@cardiff.ac.uk>
Date: Wed, 5 Mar 2025 13:35:20 +0000
Subject: [PATCH 12/14] Fixed large record output, switched to symbolic

---
 dysgu/call_component.pyx | 67 +++++++++++++++++++++++++++-------------
 dysgu/post_call.py       | 59 ++++++++++++++---------------------
 2 files changed, 68 insertions(+), 58 deletions(-)

diff --git a/dysgu/call_component.pyx b/dysgu/call_component.pyx
index 40c259b..e18ceb6 100644
--- a/dysgu/call_component.pyx
+++ b/dysgu/call_component.pyx
@@ -2076,28 +2076,51 @@ cdef list multi(data, bam, int insert_size, int insert_stdev, float insert_ppf,
                         events += res
 
     # Check for events within clustered nodes
-    # if data.s_within:
-    #     for k, d in data.s_within:
-    #         o_count = out_counts[k]
-    #         i_counts = len(d)
-    #
-    #         if i_counts > max_single_size:
-    #             continue
-    #         # if o_count > 0 and i_counts >= (2*min_support) and i_counts > o_count:
-    #         # if o_count > 0 or i_counts > 0:
-    #         rds = get_reads(bam, d, data.reads, data.n2n, 0, info)
-    #         if len(rds) < lower_bound_support or (len(sites_info) != 0 and len(rds) == 0):
-    #             continue
-    #         echo('Calling single 1')
-    #         res = single(rds, insert_size, insert_stdev, insert_ppf, clip_length, min_support, assemble_contigs,
-    #                      sites_info, paired_end, length_extend, divergence, hp_tag)
-    #         for r in res:
-    #             echo(r.posA, r.posB, r.svlen)
-    #         if res:
-    #             if isinstance(res, EventResult):
-    #                 events.append(res)
-    #             else:
-    #                 events += res
+    # if paired_end and data.s_within:
+        # for item in events:
+        #     echo(item.svlen)
+        # echo('s_within', data.s_within)
+        # echo('parts', data.parts)
+        # echo('s_between', data.s_between)
+
+        # for k, d in data.s_within:
+        #     o_count = out_counts[k]
+        #     i_counts = len(d)
+        #
+        #     if i_counts > max_single_size:
+        #         continue
+        #     # if o_count > 0 and i_counts >= (2*min_support) and i_counts > o_count:
+        #     # if o_count > 0 or i_counts > 0:
+        #     rds = get_reads(bam, d, data.reads, data.n2n, 0, info)
+        #     if len(rds) < lower_bound_support or (len(sites_info) != 0 and len(rds) == 0):
+        #         continue
+        #     res = single(rds, insert_size, insert_stdev, insert_ppf, clip_length, min_support, assemble_contigs,
+        #                  sites_info, paired_end, length_extend, divergence, hp_tag)
+        #     if res:
+        #         if isinstance(res, EventResult):
+        #             events.append(res)
+        #         else:
+        #             events += res
+        #     for item in events:
+        #         echo(item.svlen)
+        #     echo()
+    if paired_end and data.s_within:
+        for k, d in data.s_within:
+            o_count = out_counts[k]
+            i_counts = len(d)
+            if i_counts > max_single_size:
+                continue
+            if o_count > 0 and i_counts > (2 * min_support) and i_counts > o_count:
+                rds = get_reads(bam, d, data.reads, data.n2n, 0, info)
+                if len(rds) < lower_bound_support or (len(sites_info) != 0 and len(rds) == 0):
+                    continue
+                res = single(rds, insert_size, insert_stdev, insert_ppf, clip_length, min_support, assemble_contigs,
+                             sites_info, paired_end, length_extend, divergence, hp_tag)
+                if res:
+                    if isinstance(res, EventResult):
+                        events.append(res)
+                    else:
+                        events += res
     return events
 
 
diff --git a/dysgu/post_call.py b/dysgu/post_call.py
index 4ae45cb..6f2eb9c 100644
--- a/dysgu/post_call.py
+++ b/dysgu/post_call.py
@@ -330,51 +330,38 @@ def median(arr, start, end):
 def get_ref_base(events, ref_genome, symbolic_sv_size):
     for e in events:
         if e.posA == 0:
-            e.posA = 1  # !!
+            e.posA = 1
         if not e.ref_seq and not e.svtype == "BND":
             if symbolic_sv_size == -1 or e.svtype in ("INS", "TRA", "INV"):
-                if e.posA == 0:
-                    e.posA = 1
                 try:
                     base = ref_genome.fetch(e.chrA, e.posA - 1, e.posA).upper()
                     e.ref_seq = base
                 except:
                     pass
             else:
-                bases = ""
-                if e.svlen < symbolic_sv_size:
-                    start = e.posA
+                if e.svlen < symbolic_sv_size: # Fetch the variant sequence
+                    start = e.posA - 1  # Adjust to 0-based and include preceding base
                     end = e.posB
-                else:
-                    start = e.posA - 1
-                    end - e.posA
-                try:
-                    bases = ref_genome.fetch(e.chrA, start, end).upper()
-                except:
-                    pass
-                if e.svtype == "DEL":
-                    e.ref_seq = bases
-                    e.variant_seq = bases[0] if bases else ""
-                else:
-                    e.variant_seq = bases
-                    e.ref_seq = bases[0] if bases else ""
-
-                #
-                #
-                #
-                #     try:
-                #         bases = ref_genome.fetch(e.chrA, e.posA, e.posB).upper()
-                #
-                #     except:
-                #         pass
-                #     e.ref_seq = bases
-                #     e.variant_seq = bases[0] if len(bases) else ""
-                # else:
-                #     try:
-                #         bases = ref_genome.fetch(e.chrA, e.posA - 1, e.posA).upper()
-                #     except:
-                #         pass
-                #     e.ref_seq = bases
+                    try:
+                        bases = ref_genome.fetch(e.chrA, start, end).upper()
+                    except:
+                        bases = ""
+                    if e.svtype == "DEL":
+                        e.ref_seq = bases
+                        e.variant_seq = bases[0] if bases else ""
+                    else:  # For duplications, etc.
+                        e.variant_seq = bases
+                        e.ref_seq = bases[0] if bases else ""
+                else: # Use symbolic representation
+                    try:
+                        base = ref_genome.fetch(e.chrA, e.posA - 1, e.posA).upper()
+                        e.ref_seq = base
+                        e.variant_seq = base  # For non-DEL SVs
+                        if e.svtype == "DEL":
+                            e.variant_seq = base
+                    except:
+                        pass
+
     return events
 
 

From 561d5914c622a02fa6511a5a1a52aeb3abb6b4a8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ond=C5=99ej=20Sloup?= <dev@lupphes.com>
Date: Sun, 6 Apr 2025 11:52:42 +0200
Subject: [PATCH 13/14] fix: relax input BAM file validation (#131)

---
 dysgu/coverage.pyx | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/dysgu/coverage.pyx b/dysgu/coverage.pyx
index 1df5430..7b30641 100644
--- a/dysgu/coverage.pyx
+++ b/dysgu/coverage.pyx
@@ -328,14 +328,8 @@ cdef class GenomeScanner:
             if "SO" in hd:
                 if hd["SO"] == "unsorted":
                     raise ValueError("Input bam file must be sorted")
-                
-        prev_alignment = None
+
         for a in file_iter:
-            # double check with index
-            if prev_alignment and prev_alignment.reference_id > a.reference_id:
-                raise ValueError("Input bam file must be sorted")
-            prev_alignment = a
-            
             if ibam is None:
                 cigar_l = a._delegate.core.n_cigar
                 if a.flag & 1284 or a.mapq < self.mapq_threshold or cigar_l == 0:

From d885e1cf54ff014abe488490f8afc1d029a43be7 Mon Sep 17 00:00:00 2001
From: kcleal <clealk@cardiff.ac.uk>
Date: Mon, 7 Apr 2025 11:41:45 +0100
Subject: [PATCH 14/14] Zero based POS

---
 dysgu/post_call.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/dysgu/post_call.py b/dysgu/post_call.py
index 6f2eb9c..0f3c97a 100644
--- a/dysgu/post_call.py
+++ b/dysgu/post_call.py
@@ -339,8 +339,8 @@ def get_ref_base(events, ref_genome, symbolic_sv_size):
                 except:
                     pass
             else:
-                if e.svlen < symbolic_sv_size: # Fetch the variant sequence
-                    start = e.posA - 1  # Adjust to 0-based and include preceding base
+                if e.svlen < symbolic_sv_size:  # Fetch the variant sequence
+                    start = e.posA
                     end = e.posB
                     try:
                         bases = ref_genome.fetch(e.chrA, start, end).upper()