From 5c6ecdb2bfb9106dddad329556dec7abe35a12ff Mon Sep 17 00:00:00 2001 From: Yan Wong Date: Wed, 6 Nov 2024 16:17:48 +0000 Subject: [PATCH 01/25] Fix SVG pack_untracked_polytomies bug Fixes #3049 --- python/tests/test_drawing.py | 13 +++++++++++++ python/tskit/drawing.py | 29 ++++++++++++++++------------- 2 files changed, 29 insertions(+), 13 deletions(-) diff --git a/python/tests/test_drawing.py b/python/tests/test_drawing.py index 8da436dab3..0333141837 100644 --- a/python/tests/test_drawing.py +++ b/python/tests/test_drawing.py @@ -2602,6 +2602,19 @@ def test_only_subset_nodes_in_rank(self, caplog): ) assert "Ticks {10: '10'} lie outside the plotted axis" in caplog.text + def test_polytomy_collapsing(self): + tree = tskit.Tree.generate_balanced( + 20, arity=4, tracked_samples=np.arange(2, 8) + ) + svg = tree.draw_svg(pack_untracked_polytomies=True) + # Should have one collapsed node (untracked samples 8 and 9) + # and two "polytomy lines" (from nodes 21 and 28 (the root)) + assert svg.count('class="polytomy"') == 2 # poolytomy lines + collapsed_symbol = re.search("]*>", svg) + assert collapsed_symbol is not None + assert collapsed_symbol.group(0).count("sym") == 1 + assert collapsed_symbol.group(0).count("multi") == 1 + class TestDrawKnownSvg(TestDrawSvgBase): """ diff --git a/python/tskit/drawing.py b/python/tskit/drawing.py index a1c0e5cfee..7c6c591bfa 100644 --- a/python/tskit/drawing.py +++ b/python/tskit/drawing.py @@ -1827,23 +1827,26 @@ def assign_x_coordinates(self): prev = tree.virtual_root for u in self.postorder_nodes: parent = tree.parent(u) + omit = self.pack_untracked_polytomies and tree.num_tracked_samples(u) == 0 if parent == prev: raise ValueError("Nodes must be passed in postorder to Tree.draw_svg()") is_tip = tree.parent(prev) != u if is_tip: - if self.pack_untracked_polytomies and tree.num_tracked_samples(u) == 0: - untracked_children[parent].append(u) - else: + if not omit: leaf_x += 1 node_xpos[u] = leaf_x - else: - # Concatenate all the 
untracked children - num_untracked_children = len(untracked_children[u]) + elif not omit: + # Untracked children are available for packing into a polytomy summary + untracked_children = [] + if self.pack_untracked_polytomies: + untracked_children += [ + c for c in tree.children(u) if tree.num_tracked_samples(c) == 0 + ] child_x = [node_xpos[c] for c in tree.children(u) if c in node_xpos] - if num_untracked_children > 0: - if num_untracked_children <= 1: - # If only a single non-focal lineage, we might as well show it - for child in untracked_children[u]: + if len(untracked_children) > 0: + if len(untracked_children) <= 1: + # If only a single non-focal lineage, treat it as a condensed tip + for child in untracked_children: leaf_x += 1 node_xpos[child] = leaf_x child_x.append(leaf_x) @@ -1851,9 +1854,9 @@ def assign_x_coordinates(self): # Otherwise show a horizontal line with the number of lineages # Extra length of line is equal to log of the polytomy size self.extra_line[u] = self.PolytomyLine( - num_untracked_children, - sum(tree.num_samples(v) for v in untracked_children[u]), - [leaf_x, leaf_x + 1 + np.log(num_untracked_children)], + len(untracked_children), + sum(tree.num_samples(v) for v in untracked_children), + [leaf_x, leaf_x + 1 + np.log(len(untracked_children))], ) child_x.append(leaf_x + 1) leaf_x = self.extra_line[u].line_pos[1] From d61a8b68160d7f5f2c3bcd3a829e238d1d5c07e1 Mon Sep 17 00:00:00 2001 From: Peter Ralph Date: Sat, 2 Nov 2024 09:35:16 -0700 Subject: [PATCH 02/25] minor wording in error message --- c/tskit/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/c/tskit/core.c b/c/tskit/core.c index 4be44f6242..a63cfad287 100644 --- a/c/tskit/core.c +++ b/c/tskit/core.c @@ -232,10 +232,10 @@ tsk_strerror_internal(int err) /* Edge errors */ case TSK_ERR_NULL_PARENT: - ret = "Edge in parent is null. (TSK_ERR_NULL_PARENT)"; + ret = "Edge parent is null. 
(TSK_ERR_NULL_PARENT)"; break; case TSK_ERR_NULL_CHILD: - ret = "Edge in parent is null. (TSK_ERR_NULL_CHILD)"; + ret = "Edge child is null. (TSK_ERR_NULL_CHILD)"; break; case TSK_ERR_EDGES_NOT_SORTED_PARENT_TIME: ret = "Edges must be listed in (time[parent], child, left) order;" From 2c0509f66733271724cad2df5e1a41d217edbca6 Mon Sep 17 00:00:00 2001 From: Nate Pope Date: Tue, 19 Nov 2024 10:49:54 -0800 Subject: [PATCH 03/25] Fix wrong output dim in pair_coalescence_counts with extra time windows --- python/tests/test_coalrate.py | 38 +++++++++++++++++++++++++++++++++++ python/tskit/trees.py | 28 +++++++++++++++++--------- 2 files changed, 56 insertions(+), 10 deletions(-) diff --git a/python/tests/test_coalrate.py b/python/tests/test_coalrate.py index f21bcea627..04e8ccb9e6 100644 --- a/python/tests/test_coalrate.py +++ b/python/tests/test_coalrate.py @@ -1509,6 +1509,25 @@ def test_output_dim(self): ) assert implm.shape == (2, ts.num_nodes) + def test_extra_time_windows(self): + """ + test that output dimensions match number of time windows + and windows without nodes have zero counts + """ + ts = self.example_ts() + ss = [[0, 1, 2], [3, 4, 5]] + max_time = ts.nodes_time.max() + time_windows = np.linspace(0, max_time * 2, 10) + implm = ts.pair_coalescence_counts( + sample_sets=ss, + windows=None, + indexes=None, + time_windows=time_windows, + ) + assert implm.shape == (time_windows.size - 1,) + max_idx = np.searchsorted(time_windows, max_time, side="right") + np.testing.assert_allclose(implm[max_idx:], 0.0) + class TestPairCoalescenceQuantiles: """ @@ -1747,3 +1766,22 @@ def test_long_sequence(self): np.testing.assert_allclose( rates.flatten(), np.repeat(ck_rates, windows.size - 1) ) + + def test_extra_time_windows(self): + """ + test that output dimensions match number of time windows + and windows without nodes have NaN rates + """ + ts = self.example_ts() + ss = [[0, 1, 2], [3, 4, 5]] + max_time = ts.nodes_time.max() + time_windows = np.append(np.linspace(0, 
max_time * 2, 10), np.inf) + implm = ts.pair_coalescence_rates( + time_windows, + sample_sets=ss, + windows=None, + indexes=None, + ) + assert implm.shape == (time_windows.size - 1,) + max_idx = np.searchsorted(time_windows, max_time, side="right") + assert np.all(np.isnan(implm[max_idx:])) diff --git a/python/tskit/trees.py b/python/tskit/trees.py index a4f72393b7..d9770636fc 100644 --- a/python/tskit/trees.py +++ b/python/tskit/trees.py @@ -9739,32 +9739,40 @@ def pair_coalescence_counts( raise ValueError( "Must specify indexes if there are more than two sample sets" ) + num_indexes = len(indexes) drop_left_dimension = False if windows is None: drop_left_dimension = True windows = np.array([0.0, self.sequence_length]) + num_windows = len(windows) - 1 if isinstance(time_windows, str) and time_windows == "nodes": - node_bin_map = np.arange(self.num_nodes, dtype=np.int32) + num_time_windows = self.num_nodes + node_bin_map = np.arange(num_time_windows, dtype=np.int32) else: if self.time_units == tskit.TIME_UNITS_UNCALIBRATED: raise ValueError("Time windows require calibrated node times") + num_time_windows = len(time_windows) - 1 node_bin_map = np.digitize(self.nodes_time, time_windows) - 1 - node_bin_map[node_bin_map == time_windows.size - 1] = tskit.NULL + node_bin_map[node_bin_map == num_time_windows] = tskit.NULL node_bin_map = node_bin_map.astype(np.int32) + num_bins = node_bin_map.max() + 1 sample_set_sizes = np.array([len(s) for s in sample_sets], dtype=np.uint32) sample_sets = util.safe_np_int_cast(np.hstack(sample_sets), np.int32) - coalescing_pairs = self.ll_tree_sequence.pair_coalescence_counts( - sample_sets=sample_sets, - sample_set_sizes=sample_set_sizes, - windows=windows, - indexes=indexes, - node_bin_map=node_bin_map, - span_normalise=span_normalise, - pair_normalise=pair_normalise, + coalescing_pairs = np.zeros((num_windows, num_indexes, num_time_windows)) + coalescing_pairs[..., :num_bins] = ( + self.ll_tree_sequence.pair_coalescence_counts( + 
sample_sets=sample_sets, + sample_set_sizes=sample_set_sizes, + windows=windows, + indexes=indexes, + node_bin_map=node_bin_map, + span_normalise=span_normalise, + pair_normalise=pair_normalise, + ) ) if drop_middle_dimension: From b99dfe7e3e40eae365dbb8b353421bbe754995d3 Mon Sep 17 00:00:00 2001 From: Nate Pope Date: Tue, 19 Nov 2024 11:32:02 -0800 Subject: [PATCH 04/25] Update changelog --- python/CHANGELOG.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/python/CHANGELOG.rst b/python/CHANGELOG.rst index af0a1de8fd..6afd37ef6a 100644 --- a/python/CHANGELOG.rst +++ b/python/CHANGELOG.rst @@ -2,6 +2,12 @@ [0.6.1] - 2024-XX-XX -------------------- +**Bufixes** + +- Fix to ``TreeSequence.pair_coalescence_counts`` output dimension when + provided with time windows containing no nodes (:user:`nspope`, + :issue:`3046`, :pr:`3058`) + -------------------- [0.6.0] - 2024-10-16 From f6b27ed6356888e77f5ac07941cc50facbacbf48 Mon Sep 17 00:00:00 2001 From: Mergify <37929162+mergify[bot]@users.noreply.github.com> Date: Wed, 20 Nov 2024 09:22:44 +0000 Subject: [PATCH 05/25] ci(mergify): upgrade configuration to current format --- .mergify.yml | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/.mergify.yml b/.mergify.yml index d51c1a62a7..57f546a418 100644 --- a/.mergify.yml +++ b/.mergify.yml @@ -1,8 +1,11 @@ queue_rules: - name: default - conditions: + queue_conditions: + - "-merged" - "#approved-reviews-by>=1" - "#changes-requested-reviews-by=0" + - base=main + - label=AUTOMERGE-REQUESTED - status-success=Docs - status-success=Lint - status-success=Python (3.9, macos-latest) @@ -12,14 +15,9 @@ queue_rules: - status-success=Python (3.9, windows-latest) - status-success=Python (3.12, windows-latest) - "status-success=ci/circleci: build" -pull_request_rules: - - name: Automatic rebase, CI and merge - conditions: - - "-merged" + merge_conditions: - "#approved-reviews-by>=1" - "#changes-requested-reviews-by=0" - - base=main - 
- label=AUTOMERGE-REQUESTED - status-success=Docs - status-success=Lint - status-success=Python (3.9, macos-latest) @@ -29,16 +27,10 @@ pull_request_rules: - status-success=Python (3.9, windows-latest) - status-success=Python (3.12, windows-latest) - "status-success=ci/circleci: build" - #- status-success=codecov/patch - #- status-success=codecov/project/c-tests - #- status-success=codecov/project/python-c-tests - #- status-success=codecov/project/python-tests - actions: - queue: - name: default - method: rebase - update_method: rebase + merge_method: rebase + update_method: rebase +pull_request_rules: - name: Remove label after merge conditions: - merged @@ -48,3 +40,7 @@ pull_request_rules: remove: - AUTOMERGE-REQUESTED + - name: refactored queue action rule + conditions: [] + actions: + queue: From 8cd494a8812e7649341ff43f42d0011481310738 Mon Sep 17 00:00:00 2001 From: Nate Pope Date: Tue, 19 Nov 2024 11:29:53 -0800 Subject: [PATCH 06/25] Normalise pair coalescence counts by nonmissing span per window Update changelog Update changelog Docstring fix, more tests --- c/tests/test_stats.c | 23 +++++-- c/tests/testlib.c | 27 ++++++++ c/tests/testlib.h | 3 + c/tskit/trees.c | 26 ++++++-- python/CHANGELOG.rst | 9 ++- python/tests/test_coalrate.py | 112 +++++++++++++++++++++++++++++++++- python/tskit/trees.py | 6 +- 7 files changed, 193 insertions(+), 13 deletions(-) diff --git a/c/tests/test_stats.c b/c/tests/test_stats.c index a40817e7a9..f55cdcee50 100644 --- a/c/tests/test_stats.c +++ b/c/tests/test_stats.c @@ -316,12 +316,15 @@ verify_pair_coalescence_counts(tsk_treeseq_t *ts, tsk_flags_t options) const double *breakpoints = tsk_treeseq_get_breakpoints(ts); const tsk_size_t P = 2; const tsk_size_t I = P * (P + 1) / 2; + const tsk_size_t B = 8; tsk_id_t sample_sets[n]; tsk_size_t sample_set_sizes[P]; tsk_id_t index_tuples[2 * I]; tsk_id_t node_bin_map[N]; tsk_size_t dim = T * N * I; double C[dim]; + double C_B[T * B * I]; + double C_Nh[T * (N / 2) * I]; tsk_size_t 
i, j, k; for (i = 0; i < n; i++) { @@ -332,7 +335,7 @@ verify_pair_coalescence_counts(tsk_treeseq_t *ts, tsk_flags_t options) sample_set_sizes[i] = 0; } for (j = 0; j < n; j++) { - i = j / (n / P); + i = j / ((n + P - 1) / P); sample_set_sizes[i]++; } @@ -345,17 +348,17 @@ verify_pair_coalescence_counts(tsk_treeseq_t *ts, tsk_flags_t options) /* test various bin assignments */ for (i = 0; i < N; i++) { - node_bin_map[i] = ((tsk_id_t) i) % 8; + node_bin_map[i] = ((tsk_id_t) (i % B)); } ret = tsk_treeseq_pair_coalescence_counts(ts, P, sample_set_sizes, sample_sets, I, - index_tuples, T, breakpoints, 8, node_bin_map, options, C); + index_tuples, T, breakpoints, B, node_bin_map, options, C_B); CU_ASSERT_EQUAL_FATAL(ret, 0); for (i = 0; i < N; i++) { node_bin_map[i] = i < N / 2 ? ((tsk_id_t) i) : TSK_NULL; } ret = tsk_treeseq_pair_coalescence_counts(ts, P, sample_set_sizes, sample_sets, I, - index_tuples, T, breakpoints, N / 2, node_bin_map, options, C); + index_tuples, T, breakpoints, N / 2, node_bin_map, options, C_Nh); CU_ASSERT_EQUAL_FATAL(ret, 0); for (i = 0; i < N; i++) { @@ -3687,6 +3690,17 @@ test_pair_coalescence_counts(void) tsk_treeseq_free(&ts); } +static void +test_pair_coalescence_counts_missing(void) +{ + tsk_treeseq_t ts; + tsk_treeseq_from_text(&ts, 5, missing_ex_nodes, missing_ex_edges, NULL, + NULL, NULL, NULL, NULL, 0); + verify_pair_coalescence_counts(&ts, 0); + verify_pair_coalescence_counts(&ts, TSK_STAT_SPAN_NORMALISE); + tsk_treeseq_free(&ts); +} + static void test_pair_coalescence_quantiles(void) { @@ -3816,6 +3830,7 @@ main(int argc, char **argv) { "test_multiroot_divergence_matrix", test_multiroot_divergence_matrix }, { "test_pair_coalescence_counts", test_pair_coalescence_counts }, + { "test_pair_coalescence_counts_missing", test_pair_coalescence_counts_missing }, { "test_pair_coalescence_quantiles", test_pair_coalescence_quantiles }, { "test_pair_coalescence_rates", test_pair_coalescence_rates }, diff --git a/c/tests/testlib.c 
b/c/tests/testlib.c index 87537ff052..1f05178627 100644 --- a/c/tests/testlib.c +++ b/c/tests/testlib.c @@ -372,6 +372,33 @@ const char *empty_ex_nodes = "1 0.0 0 -1\n" "1 0.0 0 -1\n"; const char *empty_ex_edges = ""; +/*** An example of a tree sequence with missing marginal trees. ***/ +/* + | 4 | | 4 | + | / \ | | / \ | + | 3 \ | | / 3 | + | / \ \ | | / / \ | + | 0 1 2 | | 0 1 2 | + |-|-----------|-|-----------|-| + 0 1 2 3 4 5 +*/ +const char *missing_ex_nodes = + "1 0.0 0 -1\n" + "1 0.0 0 -1\n" + "1 0.0 0 -1\n" + "0 1.0 0 -1\n" + "0 2.0 0 -1\n"; + +const char *missing_ex_edges = + "1.0 2.0 3 0\n" + "1.0 2.0 3 1\n" + "3.0 4.0 3 1\n" + "3.0 4.0 3 2\n" + "3.0 4.0 4 0\n" + "1.0 2.0 4 2\n" + "1.0 2.0 4 3\n" + "3.0 4.0 4 3\n"; + /* Simple utilities to parse text so we can write declaritive * tests. This is not intended as a robust general input mechanism. */ diff --git a/c/tests/testlib.h b/c/tests/testlib.h index 2b4d5cdbf1..30f0013036 100644 --- a/c/tests/testlib.h +++ b/c/tests/testlib.h @@ -127,4 +127,7 @@ extern const char *paper_ex_sites; extern const char *paper_ex_mutations; extern const char *paper_ex_individuals; +extern const char *missing_ex_nodes; +extern const char *missing_ex_edges; + #endif diff --git a/c/tskit/trees.c b/c/tskit/trees.c index fd15aa3ab7..e65612c648 100644 --- a/c/tskit/trees.c +++ b/c/tskit/trees.c @@ -9334,9 +9334,9 @@ tsk_treeseq_pair_coalescence_stat(const tsk_treeseq_t *self, tsk_size_t num_samp void *summary_func_args, tsk_flags_t options, double *result) { int ret = 0; - double left, right, remaining_span, window_span, denominator, x, t; + double left, right, remaining_span, missing_span, window_span, denominator, x, t; tsk_id_t e, p, c, u, v, w, i, j; - tsk_size_t num_samples; + tsk_size_t num_samples, num_edges; tsk_tree_position_t tree_pos; const tsk_table_collection_t *tables = self->tables; const tsk_size_t num_nodes = tables->nodes.num_rows; @@ -9449,6 +9449,8 @@ tsk_treeseq_pair_coalescence_stat(const tsk_treeseq_t *self, 
tsk_size_t num_samp goto out; } + num_edges = 0; + missing_span = 0.0; w = 0; while (true) { tsk_tree_position_next(&tree_pos); @@ -9494,6 +9496,7 @@ tsk_treeseq_pair_coalescence_stat(const tsk_treeseq_t *self, tsk_size_t num_samp } p = nodes_parent[p]; } + num_edges -= 1; } for (u = tree_pos.in.start; u != tree_pos.in.stop; u++) { @@ -9530,6 +9533,11 @@ tsk_treeseq_pair_coalescence_stat(const tsk_treeseq_t *self, tsk_size_t num_samp c = p; p = nodes_parent[c]; } + num_edges += 1; + } + + if (num_edges == 0) { + missing_span += right - left; } /* flush windows */ @@ -9589,7 +9597,13 @@ tsk_treeseq_pair_coalescence_stat(const tsk_treeseq_t *self, tsk_size_t num_samp } /* normalise weights */ if (options & (TSK_STAT_SPAN_NORMALISE | TSK_STAT_PAIR_NORMALISE)) { - window_span = windows[w + 1] - windows[w]; + window_span = windows[w + 1] - windows[w] - missing_span; + missing_span = 0.0; + if (num_edges == 0) { + remaining_span = right - windows[w + 1]; + window_span -= remaining_span; + missing_span += remaining_span; + } for (i = 0; i < (tsk_id_t) num_set_indexes; i++) { denominator = 1.0; if (options & TSK_STAT_SPAN_NORMALISE) { @@ -9600,7 +9614,7 @@ tsk_treeseq_pair_coalescence_stat(const tsk_treeseq_t *self, tsk_size_t num_samp } weight = GET_2D_ROW(bin_weight, num_bins, i); for (v = 0; v < (tsk_id_t) num_bins; v++) { - weight[v] /= denominator; + weight[v] *= denominator == 0.0 ? 0.0 : 1 / denominator; } } } @@ -9668,6 +9682,9 @@ pair_coalescence_quantiles(tsk_size_t input_dim, const double *weight, j = 0; coalesced = 0.0; timepoint = TSK_UNKNOWN_TIME; + for (i = 0; i < output_dim; i++) { + output[i] = NAN; + } for (i = 0; i < input_dim; i++) { if (weight[i] > 0) { coalesced += weight[i]; @@ -9806,6 +9823,7 @@ pair_coalescence_rates(tsk_size_t input_dim, const double *weight, const double } else { rate = log(1 - weight[i] / (1 - coalesced)) / (a - b); } + // avoid tiny negative values from fp error output[i] = rate > 0 ? 
rate : 0; coalesced += weight[i]; } diff --git a/python/CHANGELOG.rst b/python/CHANGELOG.rst index 6afd37ef6a..eb5eefee45 100644 --- a/python/CHANGELOG.rst +++ b/python/CHANGELOG.rst @@ -2,12 +2,19 @@ [0.6.1] - 2024-XX-XX -------------------- -**Bufixes** +**Bugfixes** - Fix to ``TreeSequence.pair_coalescence_counts`` output dimension when provided with time windows containing no nodes (:user:`nspope`, :issue:`3046`, :pr:`3058`) +- Fix to ``TreeSequence.pair_coalescence_counts`` to normalise by non-missing + span if ``span_normalise=True``. This resolves a bug where + ``TreeSequence.pair_coalescence_rates`` would return incorrect values for + intervals with missing trees. (:user:`natep`, :issue:`3053`, :pr:`3059`) + +- Fix to ``TreeSequence.pair_coalescence_rates`` causing an + assertion to be triggered by floating point error, when all coalescence events are inside a single time window (:user:`natep`, :issue:`3035`, :pr:`3038`) -------------------- [0.6.0] - 2024-10-16 diff --git a/python/tests/test_coalrate.py b/python/tests/test_coalrate.py index 04e8ccb9e6..017a5f70bb 100644 --- a/python/tests/test_coalrate.py +++ b/python/tests/test_coalrate.py @@ -61,6 +61,36 @@ def _single_tree_example(L, T): # --- prototype --- # +def _nonmissing_window_span(ts, windows): + num_windows = windows.size - 1 + sequence_length = ts.sequence_length + missing_span = np.zeros(num_windows) + missing = 0.0 + num_edges = 0 + w = 0 + position = tsutil.TreePosition(ts) + while position.interval.right < sequence_length: + position.next() + left, right = position.interval.left, position.interval.right + out_range, in_range = position.out_range, position.in_range + for _ in range(out_range.start, out_range.stop): # edges_out + num_edges -= 1 + for _ in range(in_range.start, in_range.stop): # edges_out + num_edges += 1 + if num_edges == 0: + missing += right - left + while w < num_windows and windows[w + 1] <= right: # flush window + missing_span[w] = missing + missing = 0.0 + if num_edges 
== 0: + x = max(0, right - windows[w + 1]) + missing_span[w] -= x + missing += x + w += 1 + window_span = np.diff(windows) - missing_span + return window_span + + def _pair_coalescence_weights( coalescing_pairs, nodes_time, @@ -234,6 +264,9 @@ def _pair_coalescence_stat( else: total_pairs[i] = sizes[j] * sizes[k] + if span_normalise: + window_span = _nonmissing_window_span(ts, windows) + for i, s in enumerate(sample_sets): # initialize nodes_sample[s, i] = 1 sample_counts = nodes_sample.copy() @@ -327,7 +360,7 @@ def _pair_coalescence_stat( nodes_values[nonzero, i] /= nodes_weight[nonzero, i] nodes_values[~nonzero, i] = np.nan if span_normalise: - nodes_weight /= windows[w + 1] - windows[w] + nodes_weight /= window_span[w] if pair_normalise: nodes_weight /= total_pairs[np.newaxis, :] for i in range(num_indexes): # apply function to empirical distribution @@ -1261,6 +1294,40 @@ def test_span_normalise(self): proto = proto_pair_coalescence_counts(ts, windows=windows, span_normalise=False) np.testing.assert_allclose(proto, check) + def test_span_normalise_with_missing(self): + """ + test case where span is normalised and there are intervals without trees + """ + ts = self.example_ts() + missing = np.array([[0.0, 0.1], [0.8, 1.0]]) * ts.sequence_length + ts = ts.delete_intervals(missing) + windows = np.array([0.0, 0.33, 1.0]) * ts.sequence_length + window_size = np.diff(windows) - np.diff(missing, axis=1).flatten() + check = ( + ts.pair_coalescence_counts(windows=windows, span_normalise=False) + / window_size[:, np.newaxis] + ) + implm = ts.pair_coalescence_counts(windows=windows, span_normalise=True) + np.testing.assert_allclose(implm, check) + # TODO: remove with prototype + proto = proto_pair_coalescence_counts(ts, windows=windows, span_normalise=True) + np.testing.assert_allclose(proto, check) + + def test_empty_windows(self): + """ + test that windows without nodes contain zeros + """ + ts = self.example_ts() + missing = np.array([[0.0, 0.1], [0.8, 1.0]]) * 
ts.sequence_length + ts = ts.delete_intervals(missing) + windows = np.concatenate(missing) + check = ts.pair_coalescence_counts(windows=windows, span_normalise=False) + implm = ts.pair_coalescence_counts(windows=windows, span_normalise=True) + np.testing.assert_allclose(check[0], 0.0) + np.testing.assert_allclose(check[2], 0.0) + np.testing.assert_allclose(implm[0], 0.0) + np.testing.assert_allclose(implm[2], 0.0) + def test_pair_normalise(self): ts = self.example_ts() windows = np.array([0.0, 0.33, 1.0]) * ts.sequence_length @@ -1638,6 +1705,19 @@ def test_long_sequence(self): ) np.testing.assert_allclose(quants, np.tile(ck_quants, (windows.size - 1, 1))) + def test_empty_windows(self): + """ + test case where a window has no nodes + """ + ts = self.example_ts() + missing = np.array([[0.0, 0.1], [0.8, 1.0]]) * ts.sequence_length + ts = ts.delete_intervals(missing) + windows = np.concatenate(missing) + quantiles = np.linspace(0, 1, 10) + check = ts.pair_coalescence_quantiles(windows=windows, quantiles=quantiles) + assert np.all(np.isnan(check[0])) + assert np.all(np.isnan(check[2])) + class TestPairCoalescenceRates: """ @@ -1785,3 +1865,33 @@ def test_extra_time_windows(self): assert implm.shape == (time_windows.size - 1,) max_idx = np.searchsorted(time_windows, max_time, side="right") assert np.all(np.isnan(implm[max_idx:])) + + def test_missing_sequence(self): + """ + test that missing intervals are ignored when calculating rates + """ + ts = self.example_ts() + missing = np.array([[0.0, 0.1], [0.9, 1.0]]) * ts.sequence_length + ts = ts.delete_intervals(missing) + windows = np.array([0.0, 0.5, 1.0]) * ts.sequence_length + ts_trim = ts.trim() + windows_trim = np.array([0.0, 0.5, 1.0]) * ts_trim.sequence_length + time_windows = np.linspace(0, ts.nodes_time.max() * 2, 10) + time_windows[-1] = np.inf + implm = ts.pair_coalescence_rates(time_windows, windows=windows) + check = ts_trim.pair_coalescence_rates(time_windows, windows=windows_trim) + 
np.testing.assert_allclose(implm, check) + + def test_empty_windows(self): + """ + test case where a window has no nodes + """ + ts = self.example_ts() + missing = np.array([[0.0, 0.1], [0.8, 1.0]]) * ts.sequence_length + ts = ts.delete_intervals(missing) + windows = np.concatenate(missing) + time_windows = np.linspace(0, ts.nodes_time.max() * 2, 10) + time_windows[-1] = np.inf + check = ts.pair_coalescence_rates(time_windows, windows=windows) + assert np.all(np.isnan(check[0])) + assert np.all(np.isnan(check[2])) diff --git a/python/tskit/trees.py b/python/tskit/trees.py index d9770636fc..74abf59d84 100644 --- a/python/tskit/trees.py +++ b/python/tskit/trees.py @@ -9718,7 +9718,7 @@ def pair_coalescence_counts( :param list windows: An increasing list of breakpoints between the sequence windows to compute the statistic in, or None. :param bool span_normalise: Whether to divide the result by the span of - the window (defaults to True). + non-missing sequence in the window (defaults to True). :param bool pair_normalise: Whether to divide the result by the total number of pairs for a given index (defaults to False). :param time_windows: Either a string "nodes" or an increasing @@ -9893,8 +9893,8 @@ def pair_coalescence_rates( The first breakpoint in `time_windows` must start at the age of the samples, and the last must end at infinity. In the output array, any - time windows that do not contain coalescence events will have `NaN` - values. + time windows where all pairs have coalesced by start of the window will + contain `NaN` values. Pair coalescence rates may be calculated within or between the non-overlapping lists of samples contained in `sample_sets`. 
In the From aea14d5f17d106df998cfb5efc102e71616616f4 Mon Sep 17 00:00:00 2001 From: Ben Jeffery Date: Fri, 22 Nov 2024 15:32:58 +0000 Subject: [PATCH 07/25] Better edge metadata error --- c/tskit/core.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/c/tskit/core.c b/c/tskit/core.c index a63cfad287..8176ac8fcb 100644 --- a/c/tskit/core.c +++ b/c/tskit/core.c @@ -277,7 +277,10 @@ tsk_strerror_internal(int err) break; case TSK_ERR_CANT_PROCESS_EDGES_WITH_METADATA: ret = "Can't squash, flush, simplify or link ancestors with edges that have " - "non-empty metadata. (TSK_ERR_CANT_PROCESS_EDGES_WITH_METADATA)"; + "non-empty metadata. Removing the metadata from the edges will allow " + "these operations to proceed. For example using " + "tables.edges.drop_metadata() in the tskit Python API. " + "(TSK_ERR_CANT_PROCESS_EDGES_WITH_METADATA)"; break; /* Site errors */ From 17b1b85dfa5dc1ac589348c9849c264c0a3efef1 Mon Sep 17 00:00:00 2001 From: Yan Wong Date: Mon, 2 Dec 2024 15:47:19 +0000 Subject: [PATCH 08/25] Correct link to fwdpp --- docs/introduction.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/introduction.md b/docs/introduction.md index 604f1e06de..9e838eb834 100644 --- a/docs/introduction.md +++ b/docs/introduction.md @@ -28,7 +28,7 @@ formulation, and tskit is a therefore a powerful platform for processing ARGs. The tree sequence format is output by a number of external software libraries and programs (such as [msprime](https://github.com/tskit-dev/msprime), [SLiM](https://github.com/MesserLab/SLiM), -[fwdpp](http://molpopgen.github.io/fwdpp/), and +[fwdpp](https://fwdpp.readthedocs.io/en/), and [tsinfer](https://tsinfer.readthedocs.io/en/latest/)) that either simulate or infer the evolutionary history of genetic sequences. 
This library provides the underlying functionality that such software uses to load, examine, and From 846158d49da67cbcec1fc61ac9c44498c235c654 Mon Sep 17 00:00:00 2001 From: Yan Wong Date: Mon, 2 Dec 2024 15:49:37 +0000 Subject: [PATCH 09/25] Nicer links --- docs/introduction.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/introduction.md b/docs/introduction.md index 9e838eb834..2399fa25b1 100644 --- a/docs/introduction.md +++ b/docs/introduction.md @@ -26,10 +26,10 @@ Recombination Graph (ARG); succinct tree sequences are fully compatible with thi formulation, and tskit is a therefore a powerful platform for processing ARGs. The tree sequence format is output by a number of external software libraries -and programs (such as [msprime](https://github.com/tskit-dev/msprime), +and programs (such as [msprime](https://tskit.dev/msprime/docs), [SLiM](https://github.com/MesserLab/SLiM), [fwdpp](https://fwdpp.readthedocs.io/en/), and -[tsinfer](https://tsinfer.readthedocs.io/en/latest/)) that either simulate or +[tsinfer](https://tskit.dev/tsinfer/docs/)) that either simulate or infer the evolutionary history of genetic sequences. This library provides the underlying functionality that such software uses to load, examine, and manipulate tree sequences, including efficient methods for calculating @@ -42,5 +42,5 @@ tutorial material to introduce you to the key concepts behind succinct tree sequ :::{note} This documentation is under active development and may be incomplete in some areas. If you would like to help improve it, please open an issue or -pull request at [GitHub](https://github.com/tskit-dev/tskit). +pull request on [GitHub](https://github.com/tskit-dev/tskit). 
::: From d1d5bb759257eb017753a325305e5123d788b1d8 Mon Sep 17 00:00:00 2001 From: Yan Wong Date: Tue, 3 Dec 2024 16:46:10 +0000 Subject: [PATCH 10/25] Add note re deleting only the mutations --- python/tskit/trees.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/python/tskit/trees.py b/python/tskit/trees.py index 74abf59d84..19ee9551a1 100644 --- a/python/tskit/trees.py +++ b/python/tskit/trees.py @@ -6821,6 +6821,10 @@ def delete_sites(self, site_ids, record_provenance=True): effect (i.e., calling ``tree_sequence.delete_sites([0, 1, 1])`` has the same effect as calling ``tree_sequence.delete_sites([0, 1])``. + .. note:: + To remove only the mutations associated with a site, but keep the site + itself, use the :meth:`MutationTable.keep_rows` method. + :param list[int] site_ids: A list of site IDs specifying the sites to remove. :param bool record_provenance: If ``True``, add details of this operation to the provenance information of the returned tree sequence. (Default: ``True``). 
From 6c5f497bbc9f3bb9704d8a09ee8159afbdb93023 Mon Sep 17 00:00:00 2001 From: Ben Jeffery Date: Wed, 11 Dec 2024 11:39:09 +0000 Subject: [PATCH 11/25] Update GitHub Actions to latest major versions --- .github/workflows/docs.yml | 6 +++--- .github/workflows/release.yml | 8 ++++---- .github/workflows/tests.yml | 24 +++++++++++------------ .github/workflows/wheels.yml | 36 +++++++++++++++++------------------ 4 files changed, 37 insertions(+), 37 deletions(-) diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 719fe8f2cd..7861d0fde4 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -27,16 +27,16 @@ jobs: with: access_token: ${{ github.token }} - - uses: actions/checkout@v4 + - uses: actions/checkout@v4.2.2 - name: Setup Conda - uses: conda-incubator/setup-miniconda@v3 + uses: conda-incubator/setup-miniconda@v3.1.0 with: miniforge-version: latest activate-environment: tskit-docs-env - name: Cache Conda env - uses: actions/cache@v4 + uses: actions/cache@v4.2.0 with: path: ${{ env.CONDA }}/envs key: conda-${{ runner.os }}--${{ runner.arch }}--${{ hashFiles(env.REQUIREMENTS) }}-${{ env.CACHE_NUMBER }} diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index b798338b5f..98c54a2ba3 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -10,9 +10,9 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout - uses: actions/checkout@v4 + uses: actions/checkout@v4.2.2 - name: Set up Python - uses: actions/setup-python@v4 + uses: actions/setup-python@v5.3.0 with: python-version: '3.12' - name: Install dependencies and set up venv @@ -39,7 +39,7 @@ jobs: run: echo ::set-output name=VERSION::$(echo $GITHUB_REF | cut -d / -f 3) - name: C Release - uses: softprops/action-gh-release@v1 + uses: softprops/action-gh-release@v2.2.0 if: startsWith(github.ref, 'refs/tags/') && contains(github.event.ref, 'C_') with: name: C API ${{ steps.get_version.outputs.VERSION }} @@ -48,7 +48,7 @@ jobs: 
fail_on_unmatched_files: True files: build-gcc/meson-dist/* - name: Python Release - uses: softprops/action-gh-release@v1 + uses: softprops/action-gh-release@v2.2.0 if: startsWith(github.ref, 'refs/tags/') && !contains(github.event.ref, 'C_') with: name: Python ${{ steps.get_version.outputs.VERSION }} diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index c5d105377c..ae53eaab0f 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -14,8 +14,8 @@ jobs: uses: styfle/cancel-workflow-action@0.12.1 with: access_token: ${{ github.token }} - - uses: actions/checkout@v4 - - uses: actions/setup-python@v5 + - uses: actions/checkout@v4.2.2 + - uses: actions/setup-python@v5.3.0 with: python-version: '3.10' - name: install clang-format @@ -24,7 +24,7 @@ jobs: python -m venv env source env/bin/activate pip install clang-format==6.0.1 - - uses: pre-commit/action@v3.0.0 + - uses: pre-commit/action@v3.0.1 benchmark: name: Benchmark @@ -32,8 +32,8 @@ jobs: steps: - name: Cancel Previous Runs uses: styfle/cancel-workflow-action@0.12.1 - - uses: actions/checkout@v4 - - uses: actions/setup-python@v5 + - uses: actions/checkout@v4.2.2 + - uses: actions/setup-python@v5.3.0 with: python-version: '3.11' cache: 'pip' @@ -50,7 +50,7 @@ jobs: pip uninstall -y tskit python run.py - name: Upload Results - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v4.4.3 with: name: benchmark-results path: python/benchmark @@ -73,7 +73,7 @@ jobs: access_token: ${{ github.token }} - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4.2.2 - name: Install OSX libs if: matrix.os == 'macos-latest' @@ -82,13 +82,13 @@ jobs: - name: Cache conda and dependencies id: cache - uses: actions/cache@v4 + uses: actions/cache@v4.2.0 with: path: ${{ env.CONDA }}/envs key: ${{ runner.os }}-${{ runner.arch }}-${{ matrix.python}}-conda-v1-${{ hashFiles('python/requirements/CI-tests-conda/requirements.txt') }}-${{ 
hashFiles('python/requirements/CI-tests-pip/requirements.txt') }} - name: Install Conda - uses: conda-incubator/setup-miniconda@v3 + uses: conda-incubator/setup-miniconda@v3.1.0 if: steps.cache.outputs.cache-hit != 'true' with: activate-environment: anaconda-client-env @@ -141,7 +141,7 @@ jobs: python -m pytest -x --cov=tskit --cov-report=xml --cov-branch -n2 --durations=20 tests - name: Upload coverage to Codecov - uses: codecov/codecov-action@v4 + uses: codecov/codecov-action@v5.1.1 with: token: ${{ secrets.CODECOV_TOKEN }} working-directory: python @@ -168,10 +168,10 @@ jobs: access_token: ${{ github.token }} - name: 'Checkout' - uses: actions/checkout@v4 + uses: actions/checkout@v4.2.2 - name: Setup MSYS2 ${{matrix.sys}} - uses: msys2/setup-msys2@v2 + uses: msys2/setup-msys2@v2.26.0 with: msystem: ${{matrix.sys}} update: true diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index 50f430619e..abf1601aa1 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -18,9 +18,9 @@ jobs: python: [3.9, "3.10", 3.11, 3.12] steps: - name: Checkout - uses: actions/checkout@v4 + uses: actions/checkout@v4.2.2 - name: Set up Python ${{ matrix.python }} - uses: actions/setup-python@v5 + uses: actions/setup-python@v5.3.0 with: python-version: ${{ matrix.python }} - name: Install deps @@ -36,7 +36,7 @@ jobs: pip install delocate delocate-wheel -v dist/*.whl - name: Upload Wheels - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v4.4.3 with: name: osx-wheel-${{ matrix.python }} path: python/dist @@ -49,7 +49,7 @@ jobs: wordsize: [64] steps: - name: Checkout - uses: actions/checkout@v4 + uses: actions/checkout@v4.2.2 - name: Install deps env: PYTHON: "py -${{ matrix.python }}-${{ matrix.wordsize }}" @@ -74,7 +74,7 @@ jobs: cp ../c/tskit.h lib/. 
${PYTHON} -m build --wheel - name: Upload Wheels - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v4.4.3 with: name: win-wheel-${{ matrix.python }}-${{ matrix.wordsize }} path: python/dist @@ -83,10 +83,10 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout - uses: actions/checkout@v4 + uses: actions/checkout@v4.2.2 - name: Set up Python 3.9 - uses: actions/setup-python@v5 + uses: actions/setup-python@v5.3.0 with: python-version: 3.9 @@ -98,7 +98,7 @@ jobs: python -m build --sdist - name: Upload sdist - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v4.4.3 with: name: sdist path: python/dist @@ -109,7 +109,7 @@ jobs: docker run --rm -v `pwd`:/project -w /project quay.io/pypa/manylinux2014_x86_64 bash .github/workflows/docker/buildwheel.sh - name: Upload Wheels - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v4.4.3 with: name: linux-wheels path: python/dist/wheelhouse @@ -122,11 +122,11 @@ jobs: python: [3.9, "3.10", 3.11, 3.12] steps: - name: Download wheels - uses: actions/download-artifact@v4 + uses: actions/download-artifact@v4.1.8 with: name: osx-wheel-${{ matrix.python }} - name: Set up Python ${{ matrix.python }} - uses: actions/setup-python@v5 + uses: actions/setup-python@v5.3.0 with: python-version: ${{ matrix.python }} - name: Install wheel and test @@ -145,11 +145,11 @@ jobs: wordsize: [64] steps: - name: Download wheels - uses: actions/download-artifact@v4 + uses: actions/download-artifact@v4.1.8 with: name: win-wheel-${{ matrix.python }}-${{ matrix.wordsize }} - name: Set up Python ${{ matrix.python }} - uses: actions/setup-python@v5 + uses: actions/setup-python@v5.3.0 with: python-version: ${{ matrix.python }} - name: Install wheel and test @@ -178,11 +178,11 @@ jobs: wheel: cp312 steps: - name: Download wheels - uses: actions/download-artifact@v4 + uses: actions/download-artifact@v4.1.8 with: name: linux-wheels - name: Set up Python - uses: actions/setup-python@v4 + uses: 
actions/setup-python@v5.3.0 with: python-version: ${{ matrix.python }} - name: Install wheel and test @@ -201,16 +201,16 @@ jobs: id-token: write steps: - name: Download all - uses: actions/download-artifact@v4 + uses: actions/download-artifact@v4.1.8 - name: Move to dist run: | mkdir dist cp */*.{whl,gz} dist/. - name: Publish distribution to Test PyPI if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags') && !contains(github.event.ref, 'C_') - uses: pypa/gh-action-pypi-publish@release/v1 + uses: pypa/gh-action-pypi-publish@v1.12.3 with: repository_url: https://test.pypi.org/legacy/ - name: Publish distribution to PRODUCTION PyPI if: github.event_name == 'release' && !startsWith(github.event.release.tag_name, 'C_') - uses: pypa/gh-action-pypi-publish@release/v1 + uses: pypa/gh-action-pypi-publish@v1.12.3 From be5351d6c7deb384639cbb0f33d74f937d0245e8 Mon Sep 17 00:00:00 2001 From: Ben Jeffery Date: Thu, 12 Dec 2024 01:01:25 +0000 Subject: [PATCH 12/25] Update actions ubuntu version --- .github/workflows/docs.yml | 2 +- .github/workflows/release.yml | 2 +- .github/workflows/tests.yml | 6 +++--- .github/workflows/wheels.yml | 6 +++--- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 7861d0fde4..4379702963 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -17,7 +17,7 @@ env: jobs: build-deploy-docs: name: Docs - runs-on: ubuntu-latest + runs-on: ubuntu-24.04 defaults: run: shell: bash -l {0} diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 98c54a2ba3..c490e7ab95 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -7,7 +7,7 @@ on: jobs: build: - runs-on: ubuntu-latest + runs-on: ubuntu-24.04 steps: - name: Checkout uses: actions/checkout@v4.2.2 diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index ae53eaab0f..e58ff32297 100644 --- a/.github/workflows/tests.yml +++ 
b/.github/workflows/tests.yml @@ -8,7 +8,7 @@ on: jobs: pre-commit: name: Lint - runs-on: ubuntu-latest + runs-on: ubuntu-24.04 steps: - name: Cancel Previous Runs uses: styfle/cancel-workflow-action@0.12.1 @@ -28,7 +28,7 @@ jobs: benchmark: name: Benchmark - runs-on: ubuntu-latest + runs-on: ubuntu-24.04 steps: - name: Cancel Previous Runs uses: styfle/cancel-workflow-action@0.12.1 @@ -62,7 +62,7 @@ jobs: fail-fast: false matrix: python: [ 3.9, 3.12 ] - os: [ macos-latest, ubuntu-latest, windows-latest ] + os: [ macos-latest, ubuntu-24.04, windows-latest ] defaults: run: shell: bash diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index abf1601aa1..0e5bc39cf8 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -80,7 +80,7 @@ jobs: path: python/dist manylinux: - runs-on: ubuntu-latest + runs-on: ubuntu-24.04 steps: - name: Checkout uses: actions/checkout@v4.2.2 @@ -161,7 +161,7 @@ jobs: python -c "import tskit" manylinux-test: - runs-on: ubuntu-latest + runs-on: ubuntu-24.04 needs: ['manylinux'] strategy: matrix: @@ -194,7 +194,7 @@ jobs: PyPI_Upload: - runs-on: ubuntu-latest + runs-on: ubuntu-24.04 environment: release needs: ['windows-test', 'OSX-test', 'manylinux-test'] permissions: From 26f9c4aaf1cfa56a3eb57f372812a9aa4fcdc98b Mon Sep 17 00:00:00 2001 From: Ben Jeffery Date: Wed, 8 Jan 2025 11:26:05 +0000 Subject: [PATCH 13/25] Update actions ubuntu version --- .mergify.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.mergify.yml b/.mergify.yml index 57f546a418..13e858d3c6 100644 --- a/.mergify.yml +++ b/.mergify.yml @@ -10,8 +10,8 @@ queue_rules: - status-success=Lint - status-success=Python (3.9, macos-latest) - status-success=Python (3.12, macos-latest) - - status-success=Python (3.9, ubuntu-latest) - - status-success=Python (3.12, ubuntu-latest) + - status-success=Python (3.9, ubuntu-24.04) + - status-success=Python (3.12, ubuntu-24.04) - status-success=Python (3.9, 
windows-latest) - status-success=Python (3.12, windows-latest) - "status-success=ci/circleci: build" @@ -22,8 +22,8 @@ queue_rules: - status-success=Lint - status-success=Python (3.9, macos-latest) - status-success=Python (3.12, macos-latest) - - status-success=Python (3.9, ubuntu-latest) - - status-success=Python (3.12, ubuntu-latest) + - status-success=Python (3.9, ubuntu-24.04) + - status-success=Python (3.12, ubuntu-24.04) - status-success=Python (3.9, windows-latest) - status-success=Python (3.12, windows-latest) - "status-success=ci/circleci: build" From ffecae9ad821b04c837c9afda75619a534e96def Mon Sep 17 00:00:00 2001 From: Yan Wong Date: Thu, 14 Nov 2024 10:14:59 +0000 Subject: [PATCH 14/25] Add textual mouseover descriptions of collapsed nodes I realised that the numbers can be somewhat cryptic, and it's trivial to add helpful text that appears on mouseover, e.g. "This polytomy has 3 additional branches, leading to a total of 14 descendant samples" --- python/tests/data/svg/tree_poly_tracked.svg | 20 ++++++++-- .../data/svg/tree_poly_tracked_collapse.svg | 25 ++++++++++--- .../tests/data/svg/tree_simple_collapsed.svg | 10 ++++- .../data/svg/tree_subtrees_with_collapsed.svg | 5 ++- python/tskit/drawing.py | 37 ++++++++++++------- 5 files changed, 71 insertions(+), 26 deletions(-) diff --git a/python/tests/data/svg/tree_poly_tracked.svg b/python/tests/data/svg/tree_poly_tracked.svg index 517d508e21..fdf6b8712c 100644 --- a/python/tests/data/svg/tree_poly_tracked.svg +++ b/python/tests/data/svg/tree_poly_tracked.svg @@ -15,7 +15,10 @@ - +3/𝟑 + + +3/𝟑 + This polytomy has 3 additional branches, leading to a total of 3 descendant samples + @@ -23,7 +26,10 @@ - +3/𝟑 + + +3/𝟑 + This polytomy has 3 additional branches, leading to a total of 3 descendant samples + @@ -34,7 +40,10 @@ - +2 + + +2 + A collapsed non-sample node with 2 descendant samples in this tree + 36 @@ -93,7 +102,10 @@ - +14/𝟐 + + +14/𝟐 + This polytomy has 2 additional branches, leading to a total 
of 14 descendant samples + 41 diff --git a/python/tests/data/svg/tree_poly_tracked_collapse.svg b/python/tests/data/svg/tree_poly_tracked_collapse.svg index 437c2c355b..895daa2ac1 100644 --- a/python/tests/data/svg/tree_poly_tracked_collapse.svg +++ b/python/tests/data/svg/tree_poly_tracked_collapse.svg @@ -15,7 +15,10 @@ - +3/𝟑 + + +3/𝟑 + This polytomy has 3 additional branches, leading to a total of 3 descendant samples + @@ -23,7 +26,10 @@ - +3/𝟑 + + +3/𝟑 + This polytomy has 3 additional branches, leading to a total of 3 descendant samples + @@ -34,7 +40,10 @@ - +2 + + +2 + A collapsed non-sample node with 2 descendant samples in this tree + 36 @@ -71,7 +80,10 @@ - +3 + + +3 + A collapsed non-sample node with 3 descendant samples in this tree + 39 @@ -80,7 +92,10 @@ - +14/𝟐 + + +14/𝟐 + This polytomy has 2 additional branches, leading to a total of 14 descendant samples + 41 diff --git a/python/tests/data/svg/tree_simple_collapsed.svg b/python/tests/data/svg/tree_simple_collapsed.svg index 67446bb6dd..7bc5d31f7f 100644 --- a/python/tests/data/svg/tree_simple_collapsed.svg +++ b/python/tests/data/svg/tree_simple_collapsed.svg @@ -11,7 +11,10 @@ - +2 + + +2 + A collapsed non-sample node with 2 descendant samples in this tree + 8 @@ -37,7 +40,10 @@ - +4 + + +4 + A collapsed non-sample node with 4 descendant samples in this tree + 13 diff --git a/python/tests/data/svg/tree_subtrees_with_collapsed.svg b/python/tests/data/svg/tree_subtrees_with_collapsed.svg index 5bd9d76b74..afcae706b7 100644 --- a/python/tests/data/svg/tree_subtrees_with_collapsed.svg +++ b/python/tests/data/svg/tree_subtrees_with_collapsed.svg @@ -11,7 +11,10 @@ - +2 + + +2 + A collapsed non-sample node with 2 descendant samples in this tree + 16 diff --git a/python/tskit/drawing.py b/python/tskit/drawing.py index 7c6c591bfa..381029161a 100644 --- a/python/tskit/drawing.py +++ b/python/tskit/drawing.py @@ -1990,15 +1990,20 @@ def draw_tree(self): end=(x2, 0), ) ) - poly.add( - dwg.text( - 
f"+{info.num_samples}/{bold_integer(info.num_branches)}", - font_style="italic", - x=[rnd(x2)], - dy=[rnd(-self.text_height / 10)], # make the plus sign line up - text_anchor="end", + label = dwg.text( + f"+{info.num_samples}/{bold_integer(info.num_branches)}", + font_style="italic", + x=[rnd(x2)], + dy=[rnd(-self.text_height / 10)], # make the plus sign line up + text_anchor="end", + ) + label.set_desc( + title=( + f"This polytomy has {info.num_branches} additional branches, " + f"leading to a total of {info.num_samples} descendant samples" ) ) + poly.add(label) curr_svg_group.add(poly) # Add edge above node first => on layer underneath anything else @@ -2092,14 +2097,18 @@ def draw_tree(self): node_lab_attr["transform"] = self.text_transform("above") else: if multi_samples is not None: - curr_svg_group.add( - dwg.text( - text=f"+{multi_samples}", - transform=self.text_transform("below", dy=1), - font_style="italic", - class_="lab summary", - ) + label = dwg.text( + text=f"+{multi_samples}", + transform=self.text_transform("below", dy=1), + font_style="italic", + class_="lab summary", + ) + title = ( + f"A collapsed {'sample' if tree.is_sample(u) else 'non-sample'} " + f"node with {multi_samples} descendant samples in this tree" ) + label.set_desc(title=title) + curr_svg_group.add(label) if u == left_child[tree.parent(u)]: add_class(node_lab_attr, "lft") node_lab_attr["transform"] = self.text_transform("above_left") From 04272392fdd323bf6c3efe39f7a44138a1ff856f Mon Sep 17 00:00:00 2001 From: Hanbin Lee Date: Wed, 8 Jan 2025 14:38:55 -0500 Subject: [PATCH 15/25] add GIL release for statistics --- python/_tskitmodule.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/_tskitmodule.c b/python/_tskitmodule.c index 8663bd8695..65036e2df1 100644 --- a/python/_tskitmodule.c +++ b/python/_tskitmodule.c @@ -9715,9 +9715,11 @@ TreeSequence_weighted_stat_vector_method( if (result_array == NULL) { goto out; } + Py_BEGIN_ALLOW_THREADS err = method(self->tree_sequence, 
w_shape[1], PyArray_DATA(weights_array), num_windows, PyArray_DATA(windows_array), num_focal_nodes, PyArray_DATA(focal_nodes_array), PyArray_DATA(result_array), options); + Py_END_ALLOW_THREADS if (err != 0) { handle_library_error(err); goto out; From c807c1e69c31818a37c7fec8f398fd5d0e513514 Mon Sep 17 00:00:00 2001 From: Ben Jeffery Date: Wed, 15 Jan 2025 16:04:01 +0000 Subject: [PATCH 16/25] Pin zarr --- python/requirements/CI-tests-conda/requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/python/requirements/CI-tests-conda/requirements.txt b/python/requirements/CI-tests-conda/requirements.txt index 1b50515ce0..86b23b1b33 100644 --- a/python/requirements/CI-tests-conda/requirements.txt +++ b/python/requirements/CI-tests-conda/requirements.txt @@ -1,3 +1,4 @@ msprime==1.3.2 tszip==0.2.3 h5py==3.11.0 +zarr<3 From 9fb8a528180586e775d6d420b76400344214114d Mon Sep 17 00:00:00 2001 From: Ben Jeffery Date: Fri, 7 Feb 2025 20:23:32 +0000 Subject: [PATCH 17/25] Fix doxygen version --- .github/workflows/docs.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 4379702963..262036994a 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -46,7 +46,7 @@ jobs: - name: Update environment run: | - mamba install -y python=3.10 doxygen pip + mamba install -y python=3.10 doxygen=1.12.0 pip pip install -r ${{ env.REQUIREMENTS }} if: steps.cache.outputs.cache-hit != 'true' @@ -55,7 +55,9 @@ jobs: run: make $MAKE_TARGET - name: Build Docs - run: make -C docs + run: | + doxygen -v + make -C docs - name: Trigger docs site rebuild if: github.ref == 'refs/heads/main' From 25a9994a64a1713c325eb6c0312efa081a90d6cb Mon Sep 17 00:00:00 2001 From: Ben Jeffery Date: Fri, 7 Feb 2025 19:41:19 +0000 Subject: [PATCH 18/25] Update conda dev env --- python/requirements/development.txt | 3 --- python/requirements/development.yml | 15 ++++++++++----- 2 files changed, 10 
insertions(+), 8 deletions(-) diff --git a/python/requirements/development.txt b/python/requirements/development.txt index 79412f0688..a01fc2c063 100644 --- a/python/requirements/development.txt +++ b/python/requirements/development.txt @@ -32,11 +32,8 @@ sphinx==7.4.7 sphinx-argparse sphinx-autodoc-typehints>=1.18.3 sphinx-issues -sphinx-jupyterbook-latex -sphinxcontrib-prettyspecialmethods tqdm sphinx-book-theme tszip -pydata_sphinx_theme>=0.7.2 svgwrite>=1.1.10 xmlunittest diff --git a/python/requirements/development.yml b/python/requirements/development.yml index c71ed30fbb..1b3b52ffc3 100644 --- a/python/requirements/development.yml +++ b/python/requirements/development.yml @@ -3,7 +3,7 @@ channels: - bioconda - conda-forge dependencies: - - autodocsumm==0.1.13 #Pinned as errors on 0.2.0 + - autodocsumm>=0.2.7 - biopython - black - breathe @@ -15,14 +15,15 @@ dependencies: - flake8 - h5py>=2.6.0 - jsonschema>=3.0.0 + - jupyter-book>=0.12.1 - kastore + - matplotlib - meson>=0.61.0 - msprime>=1.0.0 - networkx - ninja - numpy<2 - packaging - - pip - portion - pre-commit - pyparsing @@ -30,13 +31,17 @@ dependencies: - pytest - pytest-cov - pytest-xdist - - PyVCF - - sphinx==4.5.0 + - si-prefix + - sphinx==7.4.7 - sphinx-argparse + - sphinx-autodoc-typehints>=1.18.3 - sphinx-issues - - sphinx_rtd_theme + - sphinx-book-theme - svgwrite>=1.1.10 + - tqdm + - tszip - pip: + - lshmm>=0.0.8 - newick - xmlunittest - msgpack>=1.0.0 From af33ed1067be2ee681522640c5b944aae73db542 Mon Sep 17 00:00:00 2001 From: Ben Jeffery Date: Fri, 28 Feb 2025 14:23:06 +0000 Subject: [PATCH 19/25] Add fixed-length arrays in metadata --- docs/metadata.md | 11 ++ python/CHANGELOG.rst | 5 + python/requirements/development.yml | 5 +- python/tests/test_metadata.py | 229 +++++++++++++++++++++++++++- python/tskit/metadata.py | 97 ++++++++++-- 5 files changed, 329 insertions(+), 18 deletions(-) diff --git a/docs/metadata.md b/docs/metadata.md index fe95546a2e..4b417e66fb 100644 --- 
a/docs/metadata.md +++ b/docs/metadata.md @@ -532,6 +532,17 @@ types above. `L` is the default. As an example: Will result in an array of 2 byte integers, prepended by a single-byte array-length. +For arrays with a known fixed size, you can specify the `length` property instead: +``` +{"type": "array", "length": 3, "items": {"type":"number", "binaryFormat":"i"}} +``` +This creates a fixed-length array of exactly 3 integers, without storing the array length in the encoded data. +Fixed-length arrays are more space-efficient since they don't need to store the length prefix. + +When using fixed-length arrays: +1. The `arrayLengthFormat` property should not be specified +2. Arrays provided for encoding must match the specified length exactly + For dealing with legacy encodings that do not store the length of the array, setting `noLengthEncodingExhaustBuffer` to `true` will read elements of the array until the metadata buffer is exhausted. As such an array diff --git a/python/CHANGELOG.rst b/python/CHANGELOG.rst index eb5eefee45..fa69d1eb45 100644 --- a/python/CHANGELOG.rst +++ b/python/CHANGELOG.rst @@ -16,6 +16,11 @@ - Fix to ``TreeSequence.pair_coalescence_rates`` causing an assertion to be triggered by floating point error, when all coalescence events are inside a single time window (:user:`natep`, :issue:`3035`, :pr:`3038`) +**Features** + +- Add support for fixed-length arrays in metadata struct codec using the ``length`` property. 
+ (:user:`benjeffery`, :issue:`3088`,:pr:`3090`) + -------------------- [0.6.0] - 2024-10-16 -------------------- diff --git a/python/requirements/development.yml b/python/requirements/development.yml index 1b3b52ffc3..42a1c1facd 100644 --- a/python/requirements/development.yml +++ b/python/requirements/development.yml @@ -17,6 +17,7 @@ dependencies: - jsonschema>=3.0.0 - jupyter-book>=0.12.1 - kastore + - lshmm>=0.0.8 - matplotlib - meson>=0.61.0 - msprime>=1.0.0 @@ -36,12 +37,14 @@ dependencies: - sphinx-argparse - sphinx-autodoc-typehints>=1.18.3 - sphinx-issues + - sphinx-jupyterbook-latex + - sphinxcontrib-prettyspecialmethods - sphinx-book-theme + - pydata_sphinx_theme>=0.7.2 - svgwrite>=1.1.10 - tqdm - tszip - pip: - - lshmm>=0.0.8 - newick - xmlunittest - msgpack>=1.0.0 diff --git a/python/tests/test_metadata.py b/python/tests/test_metadata.py index 7387f73fad..cee353b01b 100644 --- a/python/tests/test_metadata.py +++ b/python/tests/test_metadata.py @@ -1,6 +1,6 @@ # MIT License # -# Copyright (c) 2018-2024 Tskit Developers +# Copyright (c) 2018-2025 Tskit Developers # Copyright (c) 2017 University of Oxford # # Permission is hereby granted, free of charge, to any person obtaining a copy @@ -1415,6 +1415,154 @@ def test_ordering_of_fields(self): assert ms.validate_and_encode_row(row_data) == index_order_encoded assert ms.decode_row(index_order_encoded) == row_data + def test_fixed_length_array(self): + schema = { + "codec": "struct", + "type": "object", + "properties": { + "array": { + "type": "array", + "length": 3, + "items": {"type": "number", "binaryFormat": "i"}, + } + }, + } + self.round_trip(schema, {"array": [1, 2, 3]}) + + # Test with complex fixed-length arrays + schema = { + "codec": "struct", + "type": "object", + "properties": { + "array": { + "type": "array", + "length": 2, + "items": { + "type": "object", + "properties": { + "int": {"type": "number", "binaryFormat": "i"}, + "float": {"type": "number", "binaryFormat": "d"}, + }, + }, + } + }, 
+ } + self.round_trip( + schema, {"array": [{"int": 1, "float": 1.1}, {"int": 2, "float": 2.2}]} + ) + + # Test fixed-length nested arrays + schema = { + "codec": "struct", + "type": "object", + "properties": { + "array": { + "type": "array", + "length": 2, + "items": { + "type": "array", + "length": 3, + "items": {"type": "number", "binaryFormat": "d"}, + }, + } + }, + } + self.round_trip(schema, {"array": [[1.1, 1.2, 1.3], [2.1, 2.2, 2.3]]}) + + def test_mixed_fixed_and_variable_arrays(self): + schema = { + "codec": "struct", + "type": "object", + "properties": { + "fixed_array": { + "type": "array", + "length": 3, + "items": {"type": "number", "binaryFormat": "i"}, + }, + "variable_array": { + "type": "array", + "items": {"type": "number", "binaryFormat": "i"}, + }, + }, + } + self.round_trip( + schema, {"fixed_array": [1, 2, 3], "variable_array": [4, 5, 6, 7]} + ) + self.round_trip(schema, {"fixed_array": [1, 2, 3], "variable_array": []}) + + # Nested case - array of objects where each object has + # both fixed and variable-length arrays + schema = { + "codec": "struct", + "type": "object", + "properties": { + "objects": { + "type": "array", + "items": { + "type": "object", + "properties": { + "fixed": { + "type": "array", + "length": 2, + "items": {"type": "number", "binaryFormat": "d"}, + }, + "variable": { + "type": "array", + "items": {"type": "number", "binaryFormat": "i"}, + }, + }, + }, + } + }, + } + self.round_trip( + schema, + { + "objects": [ + {"fixed": [1.1, 2.2], "variable": [1, 2, 3]}, + {"fixed": [3.3, 4.4], "variable": [4]}, + {"fixed": [5.5, 6.6], "variable": []}, + ] + }, + ) + + def test_edge_case_zero_length_array(self): + schema = { + "codec": "struct", + "type": "object", + "properties": { + "empty_fixed": { + "type": "array", + "length": 0, + "items": {"type": "number", "binaryFormat": "i"}, + } + }, + } + self.round_trip(schema, {"empty_fixed": []}) + + # Can't provide non-empty array when length=0 + ms = metadata.MetadataSchema(schema) 
+ with pytest.raises( + ValueError, match="Array length 1 does not match schema fixed length 0" + ): + ms.validate_and_encode_row({"empty_fixed": [1]}) + + # Complex object with zero-length array + schema = { + "codec": "struct", + "type": "object", + "properties": { + "name": {"type": "string", "binaryFormat": "10p"}, + "empty_fixed": { + "type": "array", + "length": 0, + "items": {"type": "number", "binaryFormat": "i"}, + }, + "value": {"type": "number", "binaryFormat": "d"}, + }, + } + self.round_trip(schema, {"name": "test", "empty_fixed": [], "value": 42.0}) + class TestStructCodecErrors: def encode(self, schema, row_data): @@ -1644,6 +1792,85 @@ def test_no_default_implies_required(self): ): self.encode(schema, {}) + def test_fixed_length_array_wrong_length(self): + schema = { + "codec": "struct", + "type": "object", + "properties": { + "array": { + "type": "array", + "length": 3, + "items": {"type": "number", "binaryFormat": "i"}, + }, + }, + } + ms = metadata.MetadataSchema(schema) + + with pytest.raises( + ValueError, match="Array length 2 does not match schema fixed length 3" + ): + ms.validate_and_encode_row({"array": [1, 2]}) + + with pytest.raises( + ValueError, match="Array length 4 does not match schema fixed length 3" + ): + ms.validate_and_encode_row({"array": [1, 2, 3, 4]}) + + def test_fixed_length_array_conflicts(self): + schema = { + "codec": "struct", + "type": "object", + "properties": { + "test": { + "type": "array", + "length": 3, + "noLengthEncodingExhaustBuffer": True, + "items": {"type": "number", "binaryFormat": "i"}, + }, + }, + } + with pytest.raises( + exceptions.MetadataSchemaValidationError, + match="test array cannot have both 'length' and " + "'noLengthEncodingExhaustBuffer' set", + ): + metadata.MetadataSchema(schema) + + def test_fixed_length_with_length_format(self): + schema = { + "codec": "struct", + "type": "object", + "properties": { + "array": { + "type": "array", + "length": 3, + "arrayLengthFormat": "B", + "items": 
{"type": "number", "binaryFormat": "i"}, + }, + }, + } + with pytest.raises( + exceptions.MetadataSchemaValidationError, + match="fixed-length array should not specify 'arrayLengthFormat'", + ): + metadata.MetadataSchema(schema) + + def test_negative_fixed_length(self): + """Test that negative fixed-length values are rejected.""" + schema = { + "codec": "struct", + "type": "object", + "properties": { + "array": { + "type": "array", + "length": -5, + "items": {"type": "number", "binaryFormat": "i"}, + }, + }, + } + with pytest.raises(exceptions.MetadataSchemaValidationError): + metadata.MetadataSchema(schema) + class TestSLiMDecoding: """ diff --git a/python/tskit/metadata.py b/python/tskit/metadata.py index 81e23d0ecd..d46ea8207f 100644 --- a/python/tskit/metadata.py +++ b/python/tskit/metadata.py @@ -1,6 +1,6 @@ # MIT License # -# Copyright (c) 2020-2024 Tskit Developers +# Copyright (c) 2020-2025 Tskit Developers # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -224,6 +224,40 @@ def binary_format_validator(validator, types, instance, schema): ) +def array_length_validator(validator, types, instance, schema): + # Validate that array schema doesn't have both length and + # noLengthEncodingExhaustBuffer set. Also ensure that arrayLengthFormat + # is not set when length is set. 
+ + # Call the normal properties validator first + try: + yield from jsonschema._validators.properties(validator, types, instance, schema) + except AttributeError: + # Needed since jsonschema==4.19.1 + yield from jsonschema._keywords.properties(validator, types, instance, schema) + for prop, sub_schema in instance["properties"].items(): + if sub_schema.get("type") == "array": + has_length = "length" in sub_schema + has_exhaust = sub_schema.get("noLengthEncodingExhaustBuffer", False) + + if has_length and has_exhaust: + yield jsonschema.ValidationError( + f"{prop} array cannot have both 'length' and " + "'noLengthEncodingExhaustBuffer' set" + ) + + if has_length and "arrayLengthFormat" in sub_schema: + yield jsonschema.ValidationError( + f"{prop} fixed-length array should not specify 'arrayLengthFormat'" + ) + + if has_length and sub_schema["length"] < 0: + yield jsonschema.ValidationError( + f"{prop} array length must be non-negative, got " + f"{sub_schema['length']}" + ) + + def required_validator(validator, required, instance, schema): # Do the normal validation try: @@ -244,7 +278,11 @@ def required_validator(validator, required, instance, schema): StructCodecSchemaValidator = jsonschema.validators.extend( TSKITMetadataSchemaValidator, - {"type": binary_format_validator, "required": required_validator}, + { + "type": binary_format_validator, + "required": required_validator, + "properties": array_length_validator, + }, ) struct_meta_schema: Mapping[str, Any] = copy.deepcopy( StructCodecSchemaValidator.META_SCHEMA @@ -298,6 +336,13 @@ def required_validator(validator, required, instance, schema): struct_meta_schema["definitions"]["root"]["properties"][ "noLengthEncodingExhaustBuffer" ] = struct_meta_schema["properties"]["noLengthEncodingExhaustBuffer"] + +# length is numeric (for fixed-length arrays) +struct_meta_schema["properties"]["length"] = {"type": "integer"} +struct_meta_schema["definitions"]["root"]["properties"]["length"] = struct_meta_schema[ + 
"properties" +]["length"] + StructCodecSchemaValidator.META_SCHEMA = struct_meta_schema @@ -351,6 +396,7 @@ def make_decode(cls, sub_schema): @classmethod def make_array_decode(cls, sub_schema): element_decoder = StructCodec.make_decode(sub_schema["items"]) + fixed_length = sub_schema.get("length") array_length_f = "<" + sub_schema.get("arrayLengthFormat", "L") array_length_size = struct.calcsize(array_length_f) exhaust_buffer = sub_schema.get("noLengthEncodingExhaustBuffer", False) @@ -373,7 +419,12 @@ def array_decode_exhaust(buffer): raise e return ret - if exhaust_buffer: + def array_decode_fixed_length(buffer): + return [element_decoder(buffer) for _ in range(fixed_length)] + + if fixed_length is not None: + return array_decode_fixed_length + elif exhaust_buffer: return array_decode_exhaust else: return array_decode @@ -470,23 +521,37 @@ def make_encode(cls, sub_schema): @classmethod def make_array_encode(cls, sub_schema): - array_length_f = "<" + sub_schema.get("arrayLengthFormat", "L") element_encoder = StructCodec.make_encode(sub_schema["items"]) + fixed_length = sub_schema.get("length") + array_length_f = "<" + sub_schema.get("arrayLengthFormat", "L") exhaust_buffer = sub_schema.get("noLengthEncodingExhaustBuffer", False) - if exhaust_buffer: - return lambda array: b"".join(element_encoder(ele) for ele in array) - else: - def array_encode_with_length(array): - try: - packed_length = struct.pack(array_length_f, len(array)) - except struct.error: - raise ValueError( - "Couldn't pack array size - it is likely too long" - " for the specified arrayLengthFormat" - ) - return packed_length + b"".join(element_encoder(ele) for ele in array) + def array_encode_fixed_length(array): + if len(array) != fixed_length: + raise ValueError( + f"Array length {len(array)} does not match schema" + f" fixed length {fixed_length}" + ) + return b"".join(element_encoder(ele) for ele in array) + def array_encode_exhaust(array): + return b"".join(element_encoder(ele) for ele in 
array) + + def array_encode_with_length(array): + try: + packed_length = struct.pack(array_length_f, len(array)) + except struct.error: + raise ValueError( + "Couldn't pack array size - it is likely too long" + " for the specified arrayLengthFormat" + ) + return packed_length + b"".join(element_encoder(ele) for ele in array) + + if fixed_length is not None: + return array_encode_fixed_length + elif exhaust_buffer: + return array_encode_exhaust + else: return array_encode_with_length @classmethod From a26bafad8099c79114a8f5a872cb43f73e666c4e Mon Sep 17 00:00:00 2001 From: lkirk Date: Wed, 6 Nov 2024 03:18:10 -0600 Subject: [PATCH 20/25] get items from bit arrays; add a few tests --- c/tests/test_core.c | 30 ++++++++++++++++++++++++++++++ c/tests/testlib.h | 10 ++++++++++ c/tskit/core.c | 29 +++++++++++++++++++++++++++++ c/tskit/core.h | 2 ++ 4 files changed, 71 insertions(+) diff --git a/c/tests/test_core.c b/c/tests/test_core.c index ce01d80d66..d80ba788e2 100644 --- a/c/tests/test_core.c +++ b/c/tests/test_core.c @@ -532,9 +532,14 @@ test_bit_arrays(void) // NB: This test is only valid for the 32 bit implementation of bit arrays. 
If we // were to change the chunk size of a bit array, we'd need to update these tests tsk_bit_array_t arr; + tsk_id_t items_truth[64], items[64]; + tsk_size_t n_items = 0, n_items_truth = 0; + tsk_bit_array_init(&arr, 90, 1); for (tsk_bit_array_value_t i = 0; i < 20; i++) { tsk_bit_array_add_bit(&arr, i); + items_truth[n_items_truth] = (tsk_id_t) i; + n_items_truth++; } tsk_bit_array_add_bit(&arr, 63); tsk_bit_array_add_bit(&arr, 65); @@ -547,6 +552,12 @@ test_bit_arrays(void) // verify our assumptions about bit array counting CU_ASSERT_EQUAL_FATAL(tsk_bit_array_count(&arr), 22); + tsk_bit_array_get_items(&arr, items, &n_items); + assert_arrays_equal(n_items_truth, items, items_truth); + + tsk_memset(items, 0, 64); + tsk_memset(items_truth, 0, 64); + n_items = n_items_truth = 0; tsk_bit_array_free(&arr); // create a length-2 array with 64 bit capacity @@ -560,12 +571,31 @@ test_bit_arrays(void) // fill the first 50 bits of the first row for (tsk_bit_array_value_t i = 0; i < 50; i++) { tsk_bit_array_add_bit(&arr_row1, i); + items_truth[n_items_truth] = (tsk_id_t) i; + n_items_truth++; } + + tsk_bit_array_get_items(&arr_row1, items, &n_items); + assert_arrays_equal(n_items_truth, items, items_truth); + + tsk_memset(items, 0, 64); + tsk_memset(items_truth, 0, 64); + n_items = n_items_truth = 0; + // fill bits 20-40 of the second row for (tsk_bit_array_value_t i = 20; i < 40; i++) { tsk_bit_array_add_bit(&arr_row2, i); + items_truth[n_items_truth] = (tsk_id_t) i; + n_items_truth++; } + tsk_bit_array_get_items(&arr_row2, items, &n_items); + assert_arrays_equal(n_items_truth, items, items_truth); + + tsk_memset(items, 0, 64); + tsk_memset(items_truth, 0, 64); + n_items = n_items_truth = 0; + // verify our assumptions about row selection CU_ASSERT_EQUAL_FATAL(arr.data[0], 4294967295); CU_ASSERT_EQUAL_FATAL(arr.data[1], 262143); diff --git a/c/tests/testlib.h b/c/tests/testlib.h index 30f0013036..4c1ff722de 100644 --- a/c/tests/testlib.h +++ b/c/tests/testlib.h @@ -65,6 
+65,16 @@ void unsort_edges(tsk_edge_table_t *edges, size_t start); } while (0); \ } +#define assert_arrays_equal(len, a, b) \ + { \ + do { \ + tsk_size_t _j; \ + for (_j = 0; _j < len; _j++) { \ + CU_ASSERT_EQUAL(a[_j], b[_j]); \ + } \ + } while (0); \ + } + /* Array equality if the arrays contain NaN values NB: the float cast for NaNs is for mingw, which complains without */ #define assert_arrays_almost_equal_nan(len, a, b) \ diff --git a/c/tskit/core.c b/c/tskit/core.c index 8176ac8fcb..ece3fe1261 100644 --- a/c/tskit/core.c +++ b/c/tskit/core.c @@ -1336,6 +1336,35 @@ tsk_bit_array_count(const tsk_bit_array_t *self) return count; } +void +tsk_bit_array_get_items( + const tsk_bit_array_t *self, tsk_id_t *items, tsk_size_t *n_items) +{ + // Get the items stored in the row of a bitset. + // Uses a de Bruijn sequence lookup table to determine the lowest bit set. See the + // wikipedia article for more info: https://w.wiki/BYiF + + tsk_size_t i, n, off; + tsk_bit_array_value_t v, lsb; // least significant bit + static const tsk_id_t lookup[32] = { 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, + 17, 4, 8, 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9 }; + + n = 0; + for (i = 0; i < self->size; i++) { + v = self->data[i]; + off = i * ((tsk_size_t) TSK_BIT_ARRAY_NUM_BITS); + if (v == 0) { + continue; + } + while ((lsb = v & -v)) { + items[n] = lookup[(lsb * 0x077cb531U) >> 27] + (tsk_id_t) off; + n++; + v ^= lsb; + } + } + *n_items = n; +} + void tsk_bit_array_free(tsk_bit_array_t *self) { diff --git a/c/tskit/core.h b/c/tskit/core.h index 0b6db1b59e..5ee450e4b4 100644 --- a/c/tskit/core.h +++ b/c/tskit/core.h @@ -1100,6 +1100,8 @@ void tsk_bit_array_add_bit(tsk_bit_array_t *self, const tsk_bit_array_value_t bi bool tsk_bit_array_contains( const tsk_bit_array_t *self, const tsk_bit_array_value_t bit); tsk_size_t tsk_bit_array_count(const tsk_bit_array_t *self); +void tsk_bit_array_get_items( + const tsk_bit_array_t *self, tsk_id_t *items, tsk_size_t 
*n_items); #ifdef __cplusplus } From bca015a609ce88046d6b0f44ade14819ddf15b89 Mon Sep 17 00:00:00 2001 From: lkirk Date: Wed, 6 Nov 2024 03:18:18 -0600 Subject: [PATCH 21/25] c implementation of python algorithm --- c/tskit/trees.c | 153 +++++++++++++++++++++--------------------------- 1 file changed, 67 insertions(+), 86 deletions(-) diff --git a/c/tskit/trees.c b/c/tskit/trees.c index e65612c648..0f9573cf5d 100644 --- a/c/tskit/trees.c +++ b/c/tskit/trees.c @@ -2920,8 +2920,8 @@ advance_collect_edges(iter_state *s, tsk_id_t index) static int compute_two_tree_branch_state_update(const tsk_treeseq_t *ts, tsk_id_t c, - tsk_bit_array_t *child_samples, const iter_state *A_state, const iter_state *B_state, - tsk_size_t state_dim, tsk_size_t result_dim, int sign, general_stat_func_t *f, + const iter_state *A_state, const iter_state *B_state, tsk_size_t state_dim, + tsk_size_t result_dim, int sign, general_stat_func_t *f, sample_count_stat_params_t *f_params, double *result) { int ret = 0; @@ -2935,6 +2935,9 @@ compute_two_tree_branch_state_update(const tsk_treeseq_t *ts, tsk_id_t c, const tsk_bit_array_t *restrict B_state_samples = B_state->node_samples; tsk_size_t num_samples = ts->num_samples; tsk_size_t num_nodes = ts->tables->nodes.num_rows; + b_len = B_branch_len[c] * sign; + if (b_len == 0) + return ret; tsk_memset(&AB_samples, 0, sizeof(AB_samples)); tsk_memset(&B_samples_tmp, 0, sizeof(B_samples_tmp)); @@ -2953,7 +2956,6 @@ compute_two_tree_branch_state_update(const tsk_treeseq_t *ts, tsk_id_t c, if (ret != 0) { goto out; } - b_len = B_branch_len[c] * sign; for (n = 0; n < num_nodes; n++) { a_len = A_branch_len[n]; if (a_len == 0) { @@ -2961,7 +2963,6 @@ compute_two_tree_branch_state_update(const tsk_treeseq_t *ts, tsk_id_t c, } for (k = 0; k < state_dim; k++) { a_row = (state_dim * n) + k; - // TODO: what if c is TSK_NULL? 
b_row = (state_dim * (tsk_size_t) c) + k; tsk_bit_array_get_row(A_state_samples, a_row, &A_samples); tsk_bit_array_get_row(B_state_samples, b_row, &B_samples); @@ -2980,33 +2981,8 @@ compute_two_tree_branch_state_update(const tsk_treeseq_t *ts, tsk_id_t c, for (k = 0; k < result_dim; k++) { result[k] += result_tmp[k] * a_len * b_len; } - - if (child_samples != NULL) { - for (k = 0; k < state_dim; k++) { - a_row = (state_dim * n) + k; - // TODO: what if c is TSK_NULL? - b_row = (state_dim * (tsk_size_t) c) + k; - tsk_bit_array_get_row(B_state_samples, b_row, &B_samples); - tsk_bit_array_add(&B_samples_tmp, &B_samples); - tsk_bit_array_subtract(&B_samples_tmp, child_samples); - tsk_bit_array_get_row(A_state_samples, a_row, &A_samples); - tsk_bit_array_intersect(&A_samples, &B_samples_tmp, &AB_samples); - weights_row = GET_2D_ROW(weights, 3, k); - weights_row[0] = (double) tsk_bit_array_count(&AB_samples); // w_AB - weights_row[1] - = (double) tsk_bit_array_count(&A_samples) - weights_row[0]; // w_Ab - weights_row[2] = (double) tsk_bit_array_count(&B_samples_tmp) - - weights_row[0]; // w_aB - } - ret = f(state_dim, weights, result_dim, result_tmp, f_params); - if (ret != 0) { - goto out; - } - for (k = 0; k < result_dim; k++) { - result[k] -= result_tmp[k] * a_len * b_len; - } - } } + out: tsk_safe_free(weights); tsk_safe_free(result_tmp); @@ -3015,97 +2991,102 @@ compute_two_tree_branch_state_update(const tsk_treeseq_t *ts, tsk_id_t c, return ret; } +// TODO: remove this forward declaration +static tsk_id_t *tsk_tree_alloc_node_stack(const tsk_tree_t *self); + static int compute_two_tree_branch_stat(const tsk_treeseq_t *ts, const iter_state *l_state, iter_state *r_state, general_stat_func_t *f, sample_count_stat_params_t *f_params, tsk_size_t result_dim, tsk_size_t state_dim, double *result) { int ret = 0; - tsk_id_t e, c, p; - tsk_size_t j, k; - tsk_bit_array_t child_samples, child_samples_row, samples_row, *in_parent; + tsk_id_t e, c, ec, p, *updated_nodes = NULL; 
+ tsk_size_t j, k, n_updates; const double *restrict time = ts->tables->nodes.time; const tsk_id_t *restrict edges_child = ts->tables->edges.child; const tsk_id_t *restrict edges_parent = ts->tables->edges.parent; - tsk_bit_array_t *r_samples = r_state->node_samples; + const tsk_size_t num_nodes = ts->tables->nodes.num_rows; + tsk_bit_array_t updates, row, ec_row, *r_samples = r_state->node_samples; - tsk_memset(&child_samples, 0, sizeof(child_samples)); - ret = tsk_bit_array_init(&child_samples, ts->num_samples, state_dim); + tsk_memset(&updates, 0, sizeof(updates)); + ret = tsk_bit_array_init(&updates, num_nodes, 1); if (ret != 0) { goto out; } + updated_nodes = tsk_tree_alloc_node_stack(&r_state->tree); + if (updated_nodes == NULL) { + ret = TSK_ERR_NO_MEMORY; + goto out; + } + // Identify modified nodes both added and removed + for (j = 0; j < r_state->n_edges_out + r_state->n_edges_in; j++) { + e = j < r_state->n_edges_out ? r_state->edges_out[j] + : r_state->edges_in[j - r_state->n_edges_out]; + p = edges_parent[e]; + c = edges_child[e]; + // Identify affected nodes above child + while (p != TSK_NULL) { + tsk_bit_array_add_bit(&updates, (tsk_bit_array_value_t) c); + c = p; + p = r_state->parent[p]; + } + } + // Subtract the whole contribution from the child node + tsk_bit_array_get_items(&updates, updated_nodes, &n_updates); + while (n_updates != 0) { + n_updates--; + c = updated_nodes[n_updates]; + compute_two_tree_branch_state_update( + ts, c, l_state, r_state, state_dim, result_dim, -1, f, f_params, result); + } + // Remove samples under nodes from removed edges to parent nodes for (j = 0; j < r_state->n_edges_out; j++) { e = r_state->edges_out[j]; - c = edges_child[e]; p = edges_parent[e]; - tsk_memset(child_samples.data, 0, - child_samples.size * state_dim * sizeof(tsk_bit_array_value_t)); - tsk_bug_assert(c != TSK_NULL); // TODO: are these checks necessary? 
- tsk_bug_assert(p != TSK_NULL); - for (k = 0; k < state_dim; k++) { - tsk_bit_array_get_row( - r_samples, (state_dim * (tsk_size_t) c) + k, &samples_row); - tsk_bit_array_get_row(&child_samples, k, &child_samples_row); - tsk_bit_array_add(&child_samples_row, &samples_row); - } - in_parent = NULL; + ec = edges_child[e]; // edge child while (p != TSK_NULL) { - compute_two_tree_branch_state_update(ts, c, in_parent, l_state, r_state, - state_dim, result_dim, -1, f, f_params, result); - if (in_parent != NULL) { - for (k = 0; k < state_dim; k++) { - tsk_bit_array_get_row( - r_samples, (state_dim * (tsk_size_t) c) + k, &samples_row); - tsk_bit_array_get_row(&child_samples, k, &child_samples_row); - tsk_bit_array_subtract(&samples_row, &child_samples_row); - } + for (k = 0; k < state_dim; k++) { + tsk_bit_array_get_row( + r_samples, (state_dim * (tsk_size_t) ec) + k, &ec_row); + tsk_bit_array_get_row(r_samples, (state_dim * (tsk_size_t) p) + k, &row); + tsk_bit_array_subtract(&row, &ec_row); } - in_parent = &child_samples; - c = p; p = r_state->parent[p]; } - for (k = 0; k < state_dim; k++) { - tsk_bit_array_get_row( - r_samples, (state_dim * (tsk_size_t) c) + k, &samples_row); - tsk_bit_array_get_row(&child_samples, k, &child_samples_row); - tsk_bit_array_subtract(&samples_row, &child_samples_row); - } - c = edges_child[e]; - r_state->branch_len[c] = 0; - r_state->parent[c] = TSK_NULL; + r_state->branch_len[ec] = 0; + r_state->parent[ec] = TSK_NULL; } + // Add samples under nodes from added edges for (j = 0; j < r_state->n_edges_in; j++) { e = r_state->edges_in[j]; - c = edges_child[e]; p = edges_parent[e]; - tsk_memset(child_samples.data, 0, - child_samples.size * state_dim * sizeof(tsk_bit_array_value_t)); - for (k = 0; k < state_dim; k++) { - tsk_bit_array_get_row( - r_samples, (state_dim * (tsk_size_t) c) + k, &samples_row); - tsk_bit_array_get_row(&child_samples, k, &child_samples_row); - tsk_bit_array_add(&child_samples_row, &samples_row); - } + ec = c = 
edges_child[e];
         r_state->branch_len[c] = time[p] - time[c];
         r_state->parent[c] = p;
-
-        in_parent = NULL;
         while (p != TSK_NULL) {
+            tsk_bit_array_add_bit(&updates, (tsk_bit_array_value_t) c);
             for (k = 0; k < state_dim; k++) {
                 tsk_bit_array_get_row(
-                    r_samples, (state_dim * (tsk_size_t) p) + k, &samples_row);
-                tsk_bit_array_get_row(&child_samples, k, &child_samples_row);
-                tsk_bit_array_add(&samples_row, &child_samples_row);
+                    r_samples, (state_dim * (tsk_size_t) ec) + k, &ec_row);
+                tsk_bit_array_get_row(r_samples, (state_dim * (tsk_size_t) p) + k, &row);
+                tsk_bit_array_add(&row, &ec_row);
             }
-            compute_two_tree_branch_state_update(ts, c, in_parent, l_state, r_state,
-                state_dim, result_dim, +1, f, f_params, result);
-            in_parent = &child_samples;
             c = p;
             p = r_state->parent[p];
         }
     }
+    // Update all affected child nodes (fully subtracted, deferred from addition)
+    n_updates = 0;
+    tsk_bit_array_get_items(&updates, updated_nodes, &n_updates);
+    while (n_updates != 0) {
+        n_updates--;
+        c = updated_nodes[n_updates];
+        compute_two_tree_branch_state_update(
+            ts, c, l_state, r_state, state_dim, result_dim, +1, f, f_params, result);
+    }
 out:
-    tsk_bit_array_free(&child_samples);
+    tsk_safe_free(updated_nodes);
+    tsk_bit_array_free(&updates);
     return ret;
 }

From 519f30b15f481ddf5edc86da3d98205e5eb2c388 Mon Sep 17 00:00:00 2001
From: lkirk
Date: Wed, 6 Nov 2024 13:59:01 -0600
Subject: [PATCH 22/25] fix memory issue; turns out the allocated stack wasn't
 big enough.

--- c/tskit/trees.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/c/tskit/trees.c b/c/tskit/trees.c index 0f9573cf5d..74983f36e1 100644 --- a/c/tskit/trees.c +++ b/c/tskit/trees.c @@ -2991,9 +2991,6 @@ compute_two_tree_branch_state_update(const tsk_treeseq_t *ts, tsk_id_t c, return ret; } -// TODO: remove this forward declaration -static tsk_id_t *tsk_tree_alloc_node_stack(const tsk_tree_t *self); - static int compute_two_tree_branch_stat(const tsk_treeseq_t *ts, const iter_state *l_state, iter_state *r_state, general_stat_func_t *f, sample_count_stat_params_t *f_params, @@ -3013,7 +3010,7 @@ compute_two_tree_branch_stat(const tsk_treeseq_t *ts, const iter_state *l_state, if (ret != 0) { goto out; } - updated_nodes = tsk_tree_alloc_node_stack(&r_state->tree); + updated_nodes = tsk_calloc(num_nodes, sizeof(*updated_nodes)); if (updated_nodes == NULL) { ret = TSK_ERR_NO_MEMORY; goto out; From 529ffb6e320350d3e6a95d059521ff5ea6959356 Mon Sep 17 00:00:00 2001 From: lkirk Date: Wed, 18 Dec 2024 14:34:46 -0600 Subject: [PATCH 23/25] norm_total_weighted for D_prime --- c/tskit/trees.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/c/tskit/trees.c b/c/tskit/trees.c index 74983f36e1..298df850cf 100644 --- a/c/tskit/trees.c +++ b/c/tskit/trees.c @@ -4226,7 +4226,7 @@ tsk_treeseq_D_prime(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, { options |= TSK_STAT_POLARISED; // TODO: allow user to pick? 
return tsk_treeseq_two_locus_count_stat(self, num_sample_sets, sample_set_sizes, - sample_sets, num_sample_sets, NULL, D_prime_summary_func, norm_hap_weighted, + sample_sets, num_sample_sets, NULL, D_prime_summary_func, norm_total_weighted, num_rows, row_sites, row_positions, num_cols, col_sites, col_positions, options, result); } From 1b559744dd66d4b242f819b954a0812f5cf1cacd Mon Sep 17 00:00:00 2001 From: lkirk Date: Wed, 5 Mar 2025 15:23:21 -0600 Subject: [PATCH 24/25] Revert "norm_total_weighted for D_prime" This reverts commit 529ffb6e320350d3e6a95d059521ff5ea6959356. --- c/tskit/trees.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/c/tskit/trees.c b/c/tskit/trees.c index 298df850cf..74983f36e1 100644 --- a/c/tskit/trees.c +++ b/c/tskit/trees.c @@ -4226,7 +4226,7 @@ tsk_treeseq_D_prime(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, { options |= TSK_STAT_POLARISED; // TODO: allow user to pick? return tsk_treeseq_two_locus_count_stat(self, num_sample_sets, sample_set_sizes, - sample_sets, num_sample_sets, NULL, D_prime_summary_func, norm_total_weighted, + sample_sets, num_sample_sets, NULL, D_prime_summary_func, norm_hap_weighted, num_rows, row_sites, row_positions, num_cols, col_sites, col_positions, options, result); } From 0fa68c43a065d5842baa800698f48dead32f2902 Mon Sep 17 00:00:00 2001 From: lkirk Date: Wed, 12 Mar 2025 03:45:21 -0500 Subject: [PATCH 25/25] add coverage for tsk_bit_array_get_items --- c/tests/test_core.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/c/tests/test_core.c b/c/tests/test_core.c index d80ba788e2..eed22fc1ca 100644 --- a/c/tests/test_core.c +++ b/c/tests/test_core.c @@ -532,10 +532,14 @@ test_bit_arrays(void) // NB: This test is only valid for the 32 bit implementation of bit arrays. 
If we
     // were to change the chunk size of a bit array, we'd need to update these tests
     tsk_bit_array_t arr;
-    tsk_id_t items_truth[64], items[64];
+    tsk_id_t items_truth[64] = {0}, items[64] = {0};
     tsk_size_t n_items = 0, n_items_truth = 0;
 
+    // test item retrieval from a freshly initialized (empty) array: expect no items
     tsk_bit_array_init(&arr, 90, 1);
+    tsk_bit_array_get_items(&arr, items, &n_items);
+    assert_arrays_equal(n_items_truth, items, items_truth);
+
     for (tsk_bit_array_value_t i = 0; i < 20; i++) {
         tsk_bit_array_add_bit(&arr, i);
         items_truth[n_items_truth] = (tsk_id_t) i;