From a6a6574a0d5bfd01f85c415f2e9f589c9cf15a5d Mon Sep 17 00:00:00 2001 From: Rob Johnson Date: Tue, 1 Jul 2025 14:17:17 -0700 Subject: [PATCH 1/6] fix deadlock caused by double read-locking in trunk_merge_lookup --- src/core.c | 6 --- src/trunk.c | 144 +++++++++++++++++++++++++--------------------------- src/trunk.h | 5 ++ 3 files changed, 73 insertions(+), 82 deletions(-) diff --git a/src/core.c b/src/core.c index dec8825f..81a4da18 100644 --- a/src/core.c +++ b/src/core.c @@ -1259,8 +1259,6 @@ core_lookup(core_handle *spl, key target, merge_accumulator *result) rc = trunk_merge_lookup( &spl->trunk_context, &root_handle, target, result, NULL); - // Release the node handle before handling any errors - trunk_ondisk_node_handle_deinit(&root_handle); if (!SUCCESS(rc)) { return rc; } @@ -1340,9 +1338,6 @@ core_lookup_async(core_lookup_async_state *state) state->callback, state->callback_arg); rc = async_result(&state->trunk_node_state); - - // Release the node handle before handling any errors - trunk_ondisk_node_handle_deinit(&state->root_handle); if (!SUCCESS(rc)) { async_return(state, rc); } @@ -1911,7 +1906,6 @@ core_print_lookup(core_handle *spl, key target, platform_log_handle *log_handle) trunk_ondisk_node_handle handle; trunk_init_root_handle(&spl->trunk_context, &handle); trunk_merge_lookup(&spl->trunk_context, &handle, target, &data, log_handle); - trunk_ondisk_node_handle_deinit(&handle); } void diff --git a/src/trunk.c b/src/trunk.c index d5f8eb91..61c8e2d7 100644 --- a/src/trunk.c +++ b/src/trunk.c @@ -1027,7 +1027,7 @@ trunk_ondisk_node_handle_setup_content_page(trunk_ondisk_node_handle *handle, /* * IN Parameters: - * - state->handle: the ondisk_node_handle + * - state->handlep: the ondisk_node_handle * - state->offset: the offset of the page to get * * IN/OUT Parameters: @@ -1046,37 +1046,37 @@ trunk_ondisk_node_handle_setup_content_page_async( { async_begin(state, depth); - uint64 page_size = cache_page_size(state->handle.cc); + uint64 page_size = cache_page_size(state->handlep->cc); - if (offset_is_in_content_page(&state->handle, *state->page, state->offset)) { + if (offset_is_in_content_page(state->handlep, *state->page, state->offset)) { state->rc = STATUS_OK; async_return(state); } - if (*state->page != NULL && *state->page != state->handle.header_page) { - cache_unget(state->handle.cc, *state->page); + if (*state->page != NULL && *state->page != state->handlep->header_page) { + cache_unget(state->handlep->cc, *state->page); } if (state->offset < page_size) { - *state->page = state->handle.header_page; + *state->page = state->handlep->header_page; state->rc = STATUS_OK; async_return(state); } else { - uint64 addr = state->handle.header_page->disk_addr + state->offset; + uint64 addr = state->handlep->header_page->disk_addr + state->offset; addr -= (addr % page_size); cache_get_async_state_init(state->cache_get_state, - state->handle.cc, + state->handlep->cc, addr, PAGE_TYPE_TRUNK, state->callback, state->callback_arg); - while (cache_get_async(state->handle.cc, state->cache_get_state) + while (cache_get_async(state->handlep->cc, state->cache_get_state) != ASYNC_STATUS_DONE) { async_yield(state); } - *state->page = - cache_get_async_state_result(state->handle.cc, state->cache_get_state); + *state->page = cache_get_async_state_result(state->handlep->cc, + state->cache_get_state); if (*state->page == NULL) { platform_error_log("%s():%d: cache_get() failed", __func__, __LINE__); state->rc = STATUS_IO_ERROR; @@ -1123,7 +1123,7 @@ trunk_ondisk_node_get_pivot(trunk_ondisk_node_handle *handle, uint64 pivot_num) /* * IN Parameters: - * - state->handle: the ondisk_node_handle + * - state->handlep: the ondisk_node_handle * - state->pivot_num: the pivot number to get * * OUT Parameters: @@ -1142,9 +1142,9 @@ trunk_ondisk_node_get_pivot_async(trunk_merge_lookup_async_state *state, async_begin(state, depth); trunk_ondisk_node *header = - (trunk_ondisk_node *)state->handle.header_page->data; + (trunk_ondisk_node *)state->handlep->header_page->data; state->offset = header->pivot_offsets[state->pivot_num]; - state->page = &state->handle.pivot_page; + state->page = &state->handlep->pivot_page; async_await_subroutine(state, trunk_ondisk_node_handle_setup_content_page_async); if (!SUCCESS(state->rc)) { @@ -1157,9 +1157,9 @@ trunk_ondisk_node_get_pivot_async(trunk_merge_lookup_async_state *state, async_return(state); } state->pivot = - (trunk_ondisk_pivot *)(state->handle.pivot_page->data + state->offset - - content_page_offset(&state->handle, - state->handle.pivot_page)); + (trunk_ondisk_pivot *)(state->handlep->pivot_page->data + state->offset + - content_page_offset(state->handlep, + state->handlep->pivot_page)); state->rc = STATUS_OK; async_return(state); } @@ -1245,7 +1245,7 @@ trunk_ondisk_node_bundle_at_offset(trunk_ondisk_node_handle *handle, /* * IN Parameters: - * - state->handle: the ondisk_node_handle + * - state->handlep: the ondisk_node_handle * - state->offset: the offset of the bundle * * OUT Parameters: @@ -1260,7 +1260,7 @@ static async_status trunk_ondisk_node_bundle_at_offset_async(trunk_merge_lookup_async_state *state, uint64 depth) { - uint64 page_size = cache_page_size(state->handle.cc); + uint64 page_size = cache_page_size(state->handlep->cc); async_begin(state, depth); @@ -1270,7 +1270,7 @@ trunk_ondisk_node_bundle_at_offset_async(trunk_merge_lookup_async_state *state, state->offset += page_size - (state->offset % page_size); } - state->page = &state->handle.inflight_bundle_page; + state->page = &state->handlep->inflight_bundle_page; async_await_subroutine(state, trunk_ondisk_node_handle_setup_content_page_async); if (!SUCCESS(state->rc)) { @@ -1283,17 +1283,17 @@ trunk_ondisk_node_bundle_at_offset_async(trunk_merge_lookup_async_state *state, async_return(state); } state->bndl = - (trunk_ondisk_bundle *)(state->handle.inflight_bundle_page->data + (trunk_ondisk_bundle *)(state->handlep->inflight_bundle_page->data + state->offset - content_page_offset( - &state->handle, - state->handle.inflight_bundle_page)); + state->handlep, + state->handlep->inflight_bundle_page)); /* If there wasn't enough room for this bundle on this page, then we would * have zeroed the remaining bytes and put the bundle on the next page. */ if (state->bndl->num_branches == 0) { state->offset += page_size - (state->offset % page_size); - state->page = &state->handle.inflight_bundle_page; + state->page = &state->handlep->inflight_bundle_page; async_await_subroutine(state, trunk_ondisk_node_handle_setup_content_page_async); if (!SUCCESS(state->rc)) { @@ -1306,11 +1306,11 @@ trunk_ondisk_node_bundle_at_offset_async(trunk_merge_lookup_async_state *state, async_return(state); } state->bndl = - (trunk_ondisk_bundle *)(state->handle.inflight_bundle_page->data + (trunk_ondisk_bundle *)(state->handlep->inflight_bundle_page->data + state->offset - content_page_offset( - &state->handle, - state->handle.inflight_bundle_page)); + state->handlep, + state->handlep->inflight_bundle_page)); } async_return(state); } @@ -1331,7 +1331,7 @@ trunk_ondisk_node_get_first_inflight_bundle(trunk_ondisk_node_handle *handle, /* * IN Parameters: - * - state->handle: the ondisk_node_handle + * - state->handlep: the ondisk_node_handle * * OUT Parameters: * - state->bndl: the bundle @@ -1350,7 +1350,7 @@ trunk_ondisk_node_get_first_inflight_bundle_async( async_begin(state, depth); trunk_ondisk_node *header = - (trunk_ondisk_node *)state->handle.header_page->data; + (trunk_ondisk_node *)state->handlep->header_page->data; if (header->num_inflight_bundles == 0) { state->bndl = NULL; state->rc = STATUS_OK; @@ -1374,7 +1374,7 @@ trunk_ondisk_node_get_next_inflight_bundle(trunk_ondisk_node_handle *handle, /* * IN Parameters: - * - state->handle: the ondisk_node_handle + * - state->handlep: the ondisk_node_handle * * IN/OUT Parameters: * - state->bndl: the bundle @@ -1393,10 +1393,11 @@ trunk_ondisk_node_get_next_inflight_bundle_async( uint64 depth) { async_begin(state, depth); - state->offset = - ((char *)state->bndl) - state->handle.inflight_bundle_page->data - + content_page_offset(&state->handle, state->handle.inflight_bundle_page) - + sizeof_trunk_ondisk_bundle(state->bndl); + state->offset = ((char *)state->bndl) + - state->handlep->inflight_bundle_page->data + + content_page_offset(state->handlep, + state->handlep->inflight_bundle_page) + + sizeof_trunk_ondisk_bundle(state->bndl); async_await_subroutine(state, trunk_ondisk_node_bundle_at_offset_async); async_return(state); } @@ -4961,7 +4962,7 @@ trunk_ondisk_node_find_pivot(const trunk_context *context, /* * IN Parameters: * state->context: the trunk node context - * state->handle: the ondisk node handle + * state->handlep: the ondisk node handle * state->tgt: the target key * //state->cmp: the comparison to use * @@ -4987,7 +4988,7 @@ trunk_ondisk_node_find_pivot_async(trunk_merge_lookup_async_state *state, async_begin(state, depth); state->min = 0; - state->max = trunk_ondisk_node_num_pivots(&state->handle) - 1; + state->max = trunk_ondisk_node_num_pivots(state->handlep) - 1; // invariant: pivot[min] <= tgt < pivot[max] state->min_pivot = NULL; @@ -5018,12 +5019,12 @@ trunk_ondisk_node_find_pivot_async(trunk_merge_lookup_async_state *state, */ // if (0 < state->min && state->last_cmp == 0 && state->cmp == less_than) { // state->min--; - // state->min_pivot = ondisk_node_get_pivot(&state->handle, state->min); + // state->min_pivot = ondisk_node_get_pivot(state->handlep, state->min); // } if (state->min_pivot == NULL) { state->min_pivot = - trunk_ondisk_node_get_pivot(&state->handle, state->min); + trunk_ondisk_node_get_pivot(state->handlep, state->min); } state->pivot = state->min_pivot; @@ -5247,36 +5248,31 @@ trunk_merge_lookup(trunk_context *context, { platform_status rc = STATUS_OK; - trunk_ondisk_node_handle handle; - rc = trunk_ondisk_node_handle_clone(&handle, inhandle); - if (!SUCCESS(rc)) { - platform_error_log("trunk_merge_lookup: " - "trunk_ondisk_node_handle_clone failed: %d\n", - rc.r); - return rc; - } + trunk_ondisk_node_handle handle; + trunk_ondisk_node_handle *handlep; + handlep = inhandle; - while (handle.header_page) { - uint64 height = trunk_ondisk_node_height(&handle); + while (handlep && handlep->header_page) { + uint64 height = trunk_ondisk_node_height(handlep); if (log) { trunk_node node; rc = trunk_node_deserialize( - context, handle.header_page->disk_addr, &node); + context, handlep->header_page->disk_addr, &node); if (!SUCCESS(rc)) { platform_error_log("trunk_merge_lookup: " "node_deserialize failed: %d\n", rc.r); goto cleanup; } - platform_log(log, "addr: %lu\n", handle.header_page->disk_addr); + platform_log(log, "addr: %lu\n", handlep->header_page->disk_addr); trunk_node_print(&node, log, context->cfg->data_cfg, 0); trunk_node_deinit(&node, context); } trunk_ondisk_pivot *pivot; rc = trunk_ondisk_node_find_pivot( - context, &handle, tgt, less_than_or_equal, &pivot); + context, handlep, tgt, less_than_or_equal, &pivot); if (!SUCCESS(rc)) { platform_error_log( "trunk_merge_lookup: ondisk_node_find_pivot failed: " @@ -5294,7 +5290,7 @@ trunk_merge_lookup(trunk_context *context, // Search the inflight bundles trunk_ondisk_bundle *bndl; - rc = trunk_ondisk_node_get_first_inflight_bundle(&handle, &bndl); + rc = trunk_ondisk_node_get_first_inflight_bundle(handlep, &bndl); if (!SUCCESS(rc)) { platform_error_log("trunk_merge_lookup: " "ondisk_node_get_first_inflight_bundle failed\n"); @@ -5313,7 +5309,7 @@ trunk_merge_lookup(trunk_context *context, goto cleanup; } if (i < pivot->num_live_inflight_bundles - 1) { - bndl = trunk_ondisk_node_get_next_inflight_bundle(&handle, bndl); + bndl = trunk_ondisk_node_get_next_inflight_bundle(handlep, bndl); } } @@ -5342,16 +5338,18 @@ trunk_merge_lookup(trunk_context *context, rc.r); goto cleanup; } - trunk_ondisk_node_handle_deinit(&handle); - handle = child_handle; + trunk_ondisk_node_handle_deinit(handlep); + handle = child_handle; + handlep = &handle; } else { - trunk_ondisk_node_handle_deinit(&handle); + trunk_ondisk_node_handle_deinit(handlep); + handlep = NULL; } } cleanup: - if (handle.header_page) { - trunk_ondisk_node_handle_deinit(&handle); + if (handlep) { + trunk_ondisk_node_handle_deinit(handlep); } return rc; } @@ -5361,24 +5359,16 @@ trunk_merge_lookup_async(trunk_merge_lookup_async_state *state) { async_begin(state, 0); - // We don't need to perform the clone asynchronously because the header page - // is guaranteed to be in memory. - state->rc = trunk_ondisk_node_handle_clone(&state->handle, state->inhandle); - if (!SUCCESS(state->rc)) { - platform_error_log("trunk_merge_lookup_async: " - "trunk_ondisk_node_handle_clone failed: %d\n", - state->rc.r); - async_return(state, state->rc); - } + state->handlep = state->inhandle; - while (state->handle.header_page) { - state->height = trunk_ondisk_node_height(&state->handle); + while (state->handlep && state->handlep->header_page) { + state->height = trunk_ondisk_node_height(state->handlep); if (state->log) { // Sorry, but we're not going to perform the logging asynchronously. trunk_node node; state->rc = trunk_node_deserialize( - state->context, state->handle.header_page->disk_addr, &node); + state->context, state->handlep->header_page->disk_addr, &node); if (!SUCCESS(state->rc)) { platform_error_log("trunk_merge_lookup_async: " "node_deserialize failed: %d\n", @@ -5386,7 +5376,7 @@ trunk_merge_lookup_async(trunk_merge_lookup_async_state *state) goto cleanup; } platform_log( - state->log, "addr: %lu\n", state->handle.header_page->disk_addr); + state->log, "addr: %lu\n", state->handlep->header_page->disk_addr); trunk_node_print(&node, state->log, state->context->cfg->data_cfg, 0); trunk_node_deinit(&node, state->context); } @@ -5469,16 +5459,18 @@ trunk_merge_lookup_async(trunk_merge_lookup_async_state *state) state->rc.r); goto cleanup; } - trunk_ondisk_node_handle_deinit(&state->handle); - state->handle = state->child_handle; + trunk_ondisk_node_handle_deinit(state->handlep); + state->handle = state->child_handle; + state->handlep = &state->handle; } else { - trunk_ondisk_node_handle_deinit(&state->handle); + trunk_ondisk_node_handle_deinit(state->handlep); + state->handlep = NULL; } } cleanup: - if (state->handle.header_page) { - trunk_ondisk_node_handle_deinit(&state->handle); + if (state->handlep) { + trunk_ondisk_node_handle_deinit(state->handlep); } async_return(state, state->rc); } diff --git a/src/trunk.h b/src/trunk.h index cc8f7661..876117f0 100644 --- a/src/trunk.h +++ b/src/trunk.h @@ -275,6 +275,8 @@ trunk_init_root_handle(trunk_context *context, void trunk_ondisk_node_handle_deinit(trunk_ondisk_node_handle *handle); +/* Note: consumes handle, i.e. calls deinit on handle, even if the lookup + * otherwise encounters an error. */ platform_status trunk_merge_lookup(trunk_context *context, trunk_ondisk_node_handle *handle, @@ -301,6 +303,8 @@ trunk_collect_branches(const trunk_context *context, typedef struct trunk_ondisk_pivot trunk_ondisk_pivot; typedef struct trunk_ondisk_bundle trunk_ondisk_bundle; +/* As with trunk_merge_lookup, trunk_merge_lookup_async always deinits inhandle. + */ // clang-format off DEFINE_ASYNC_STATE(trunk_merge_lookup_async_state, 4, param, trunk_context *, context, @@ -313,6 +317,7 @@ DEFINE_ASYNC_STATE(trunk_merge_lookup_async_state, 4, local, platform_status, __async_result, local, platform_status, rc, local, trunk_ondisk_node_handle, handle, + local, trunk_ondisk_node_handle *, handlep, local, uint64, height, local, trunk_ondisk_pivot *, pivot, local, uint64, inflight_bundle_num, From 3b46007d40e0ed8b9531a50b69c7182aa1ff19d6 Mon Sep 17 00:00:00 2001 From: Rob Johnson Date: Tue, 1 Jul 2025 23:03:11 -0700 Subject: [PATCH 2/6] make queries hold read lock on root --- src/core.c | 3 +++ src/trunk.c | 20 ++++++++++++++------ src/trunk.h | 4 ---- 3 files changed, 17 insertions(+), 10 deletions(-) diff --git a/src/core.c b/src/core.c index 81a4da18..63db12fc 100644 --- a/src/core.c +++ b/src/core.c @@ -1259,6 +1259,7 @@ core_lookup(core_handle *spl, key target, merge_accumulator *result) rc = trunk_merge_lookup( &spl->trunk_context, &root_handle, target, result, NULL); + trunk_ondisk_node_handle_deinit(&root_handle); if (!SUCCESS(rc)) { return rc; } @@ -1337,6 +1338,7 @@ core_lookup_async(core_lookup_async_state *state) NULL, state->callback, state->callback_arg); + trunk_ondisk_node_handle_deinit(&state->root_handle); rc = async_result(&state->trunk_node_state); if (!SUCCESS(rc)) { async_return(state, rc); @@ -1906,6 +1908,7 @@ core_print_lookup(core_handle *spl, key target, platform_log_handle *log_handle) trunk_ondisk_node_handle handle; trunk_init_root_handle(&spl->trunk_context, &handle); trunk_merge_lookup(&spl->trunk_context, &handle, target, &data, log_handle); + trunk_ondisk_node_handle_deinit(&handle); } void diff --git a/src/trunk.c b/src/trunk.c index 61c8e2d7..54c6102e 100644 --- a/src/trunk.c +++ b/src/trunk.c @@ -5338,17 +5338,21 @@ trunk_merge_lookup(trunk_context *context, rc.r); goto cleanup; } - trunk_ondisk_node_handle_deinit(handlep); + if (handlep != inhandle) { + trunk_ondisk_node_handle_deinit(handlep); + } handle = child_handle; handlep = &handle; } else { - trunk_ondisk_node_handle_deinit(handlep); + if (handlep != inhandle) { + trunk_ondisk_node_handle_deinit(handlep); + } handlep = NULL; } } cleanup: - if (handlep) { + if (handlep && handlep != inhandle) { trunk_ondisk_node_handle_deinit(handlep); } return rc; @@ -5459,17 +5463,21 @@ trunk_merge_lookup_async(trunk_merge_lookup_async_state *state) state->rc.r); goto cleanup; } - trunk_ondisk_node_handle_deinit(state->handlep); + if (state->handlep != state->inhandle) { + trunk_ondisk_node_handle_deinit(state->handlep); + } state->handle = state->child_handle; state->handlep = &state->handle; } else { - trunk_ondisk_node_handle_deinit(state->handlep); + if (state->handlep != state->inhandle) { + trunk_ondisk_node_handle_deinit(state->handlep); + } state->handlep = NULL; } } cleanup: - if (state->handlep) { + if (state->handlep && state->handlep != state->inhandle) { trunk_ondisk_node_handle_deinit(state->handlep); } async_return(state, state->rc); diff --git a/src/trunk.h b/src/trunk.h index 876117f0..67714604 100644 --- a/src/trunk.h +++ b/src/trunk.h @@ -275,8 +275,6 @@ trunk_init_root_handle(trunk_context *context, void trunk_ondisk_node_handle_deinit(trunk_ondisk_node_handle *handle); -/* Note: consumes handle, i.e. calls deinit on handle, even if the lookup - * otherwise encounters an error. */ platform_status trunk_merge_lookup(trunk_context *context, trunk_ondisk_node_handle *handle, @@ -303,8 +301,6 @@ trunk_collect_branches(const trunk_context *context, typedef struct trunk_ondisk_pivot trunk_ondisk_pivot; typedef struct trunk_ondisk_bundle trunk_ondisk_bundle; -/* As with trunk_merge_lookup, trunk_merge_lookup_async always deinits inhandle. - */ // clang-format off DEFINE_ASYNC_STATE(trunk_merge_lookup_async_state, 4, param, trunk_context *, context, From 375e0eb3dbe18b517ed64f9b99e940c11855b4bd Mon Sep 17 00:00:00 2001 From: Rob Johnson Date: Tue, 1 Jul 2025 23:50:05 -0700 Subject: [PATCH 3/6] update test.sh --- test.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/test.sh b/test.sh index 9b72dba8..1ddadd2e 100755 --- a/test.sh +++ b/test.sh @@ -974,7 +974,6 @@ if [ "$INCLUDE_SLOW_TESTS" != "true" ]; then set -x run_with_timing "Config-params parsing test" "$BINDIR"/unit/config_parse_test --log \ - --max-branches-per-node 42 \ --num-inserts 20 \ --rough-count-height 11 \ --stats \ From 7b686d00c9799c1935e85c5a1401bee8713d2657 Mon Sep 17 00:00:00 2001 From: Rob Johnson Date: Thu, 3 Jul 2025 11:08:54 -0700 Subject: [PATCH 4/6] implement deferred trnuk gc --- src/cache.h | 20 +++++- src/clockcache.c | 33 +++++++-- src/core.c | 9 ++- src/trunk.c | 170 ++++++++++++++++++++++++++++++++--------------- src/trunk.h | 9 ++- 5 files changed, 177 insertions(+), 64 deletions(-) diff --git a/src/cache.h b/src/cache.h index e85f7aa2..bf3a2b8f 100644 --- a/src/cache.h +++ b/src/cache.h @@ -129,7 +129,8 @@ typedef void (*extent_sync_fn)(cache *cc, uint64 *pages_outstanding); typedef void (*page_prefetch_fn)(cache *cc, uint64 addr, page_type type); typedef int (*evict_fn)(cache *cc, bool32 ignore_pinned); -typedef void (*assert_ungot_fn)(cache *cc, uint64 addr); +typedef bool32 (*page_addr_pred_fn)(cache *cc, uint64 addr); +typedef void (*page_addr_fn)(cache *cc, uint64 addr); typedef void (*validate_page_fn)(cache *cc, page_handle *page, uint64 addr); typedef void (*io_stats_fn)(cache *cc, uint64 *read_bytes, uint64 *write_bytes); typedef uint32 (*count_dirty_fn)(cache *cc); @@ -168,7 +169,8 @@ typedef struct cache_ops { cache_generic_fn flush; evict_fn evict; cache_generic_fn cleanup; - assert_ungot_fn assert_ungot; + page_addr_pred_fn in_use; + page_addr_fn assert_ungot; cache_generic_fn assert_free; validate_page_fn validate_page; cache_present_fn cache_present; @@ -552,6 +554,20 @@ cache_cleanup(cache *cc) return cc->ops->cleanup(cc); } +/* + *----------------------------------------------------------------------------- + * cache_in_use + * + * Returns TRUE if there is a reader or writer holding a reference + * to the page at addr. + *----------------------------------------------------------------------------- + */ +static inline bool32 +cache_in_use(cache *cc, uint64 addr) +{ + return cc->ops->in_use(cc, addr); +} + /* *----------------------------------------------------------------------------- * cache_assert_ungot diff --git a/src/clockcache.c b/src/clockcache.c index 1db3e871..586495a6 100644 --- a/src/clockcache.c +++ b/src/clockcache.c @@ -1480,7 +1480,7 @@ void clockcache_extent_discard(clockcache *cc, uint64 addr, page_type type) { debug_assert(addr % clockcache_extent_size(cc) == 0); - debug_assert(allocator_get_refcount(cc->al, addr) == 1); + debug_assert(allocator_get_refcount(cc->al, addr) == AL_NO_REFS); clockcache_log(addr, 0, "hard evict extent: addr %lu\n", addr); for (uint64 i = 0; i < cc->cfg->pages_per_extent; i++) { @@ -1697,10 +1697,10 @@ clockcache_get_internal(clockcache *cc, // IN refcount extent_ref_count = allocator_get_refcount(cc->al, base_addr); // Dump allocated extents info for deeper debugging. - if (extent_ref_count <= 1) { + if (extent_ref_count AL_FREE) { allocator_print_allocated(cc->al); } - debug_assert((extent_ref_count > 1), + debug_assert((extent_ref_count != AL_FREE), "Attempt to get a buffer for page addr=%lu" ", page type=%d ('%s')," " from extent addr=%lu, (extent number=%lu)" @@ -1917,10 +1917,10 @@ clockcache_get_internal_async(clockcache_get_async_state *state, uint64 depth) allocator_get_refcount(state->cc->al, state->base_addr); // Dump allocated extents info for deeper debugging. - if (state->extent_ref_count <= 1) { + if (state->extent_ref_count == AL_FREE) { allocator_print_allocated(state->cc->al); } - debug_assert((state->extent_ref_count > 1), + debug_assert((state->extent_ref_count != AL_FREE), "Attempt to get a buffer for page addr=%lu" ", page type=%d ('%s')," " from extent addr=%lu, (extent number=%lu)" @@ -2680,6 +2680,21 @@ clockcache_count_dirty(clockcache *cc) return dirty_count; } +bool32 +clockcache_in_use(clockcache *cc, uint64 addr) +{ + uint32 entry_no = clockcache_lookup(cc, addr); + if (entry_no == CC_UNMAPPED_ENTRY) { + return FALSE; + } + for (threadid thr_i = 0; thr_i < CC_RC_WIDTH; thr_i++) { + if (clockcache_get_ref(cc, entry_no, thr_i) > 0) { + return TRUE; + } + } + return clockcache_test_flag(cc, entry_no, CC_WRITELOCKED); +} + uint16 clockcache_get_read_ref(clockcache *cc, page_handle *page) { @@ -2896,6 +2911,13 @@ clockcache_wait_virtual(cache *c) return clockcache_wait(cc); } +bool32 +clockcache_in_use_virtual(cache *c, uint64 addr) +{ + clockcache *cc = (clockcache *)c; + return clockcache_in_use(cc, addr); +} + void clockcache_assert_ungot_virtual(cache *c, uint64 addr) { @@ -3010,6 +3032,7 @@ static cache_ops clockcache_ops = { .flush = clockcache_flush_virtual, .evict = clockcache_evict_all_virtual, .cleanup = clockcache_wait_virtual, + .in_use = clockcache_in_use_virtual, .assert_ungot = clockcache_assert_ungot_virtual, .assert_free = clockcache_assert_no_locks_held_virtual, .print = clockcache_print_virtual, diff --git a/src/core.c b/src/core.c index 63db12fc..967d91e7 100644 --- a/src/core.c +++ b/src/core.c @@ -156,8 +156,11 @@ core_set_super_block(core_handle *spl, super = (core_super_block *)super_page->data; uint64 old_root_addr = super->root_addr; - if (spl->trunk_context.root != NULL) { - super->root_addr = spl->trunk_context.root->addr; + trunk_ondisk_node_handle root_handle; + trunk_init_root_handle(&spl->trunk_context, &root_handle); + uint64 root_addr = trunk_ondisk_node_handle_addr(&root_handle); + if (root_addr != 0) { + super->root_addr = root_addr; rc = trunk_inc_ref(spl->cfg.trunk_node_cfg, spl->heap_id, spl->cc, @@ -169,6 +172,8 @@ core_set_super_block(core_handle *spl, } else { super->root_addr = 0; } + trunk_ondisk_node_handle_deinit(&root_handle); + if (spl->cfg.use_log) { if (spl->log) { super->log_addr = log_addr(spl->log); diff --git a/src/trunk.c b/src/trunk.c index 54c6102e..5c841856 100644 --- a/src/trunk.c +++ b/src/trunk.c @@ -103,6 +103,11 @@ struct trunk_pivot_state { bundle_compaction *bundle_compactions; }; +struct pending_gc { + pending_gc *next; + uint64 addr; +}; + /*************************************************** * branch_ref operations ***************************************************/ @@ -1647,71 +1652,97 @@ bundle_dec_all_refs(trunk_context *context, bundle *bndl) bundle_dec_all_branch_refs(context, bndl); } +// static void +// trunk_ondisk_node_wait_for_readers(trunk_context *context, uint64 addr) +// { +// page_handle *page = cache_get(context->cc, addr, TRUE, +// PAGE_TYPE_TRUNK); bool32 success = cache_try_claim(context->cc, +// page); platform_assert(success); cache_lock(context->cc, page); +// cache_unlock(context->cc, page); +// cache_unclaim(context->cc, page); +// cache_unget(context->cc, page); +// } + +static void +trunk_ondisk_node_dec_ref(trunk_context *context, uint64 addr); + +/* Prerequisite: addr must be in the AL_NO_REFS state. */ +static void +trunk_ondisk_node_gc(trunk_context *context, uint64 addr) +{ + trunk_node node; + platform_status rc = trunk_node_deserialize(context, addr, &node); + if (SUCCESS(rc)) { + if (!trunk_node_is_leaf(&node)) { + for (uint64 i = 0; i < vector_length(&node.pivots) - 1; i++) { + trunk_pivot *pvt = vector_get(&node.pivots, i); + trunk_ondisk_node_dec_ref(context, pvt->child_addr); + } + } + for (uint64 i = 0; i < vector_length(&node.pivot_bundles); i++) { + bundle *bndl = vector_get_ptr(&node.pivot_bundles, i); + bundle_dec_all_refs(context, bndl); + } + for (uint64 i = 0; i < vector_length(&node.inflight_bundles); i++) { + bundle *bndl = vector_get_ptr(&node.inflight_bundles, i); + bundle_dec_all_refs(context, bndl); + } + trunk_node_deinit(&node, context); + } else { + platform_error_log("%s():%d: node_deserialize() failed: %s", + __func__, + __LINE__, + platform_status_to_string(rc)); + } + cache_extent_discard(context->cc, addr, PAGE_TYPE_TRUNK); + allocator_dec_ref(context->al, addr, PAGE_TYPE_TRUNK); +} + static void -trunk_ondisk_node_wait_for_readers(trunk_context *context, uint64 addr) +pending_gcs_lock(trunk_context *context) { - page_handle *page = cache_get(context->cc, addr, TRUE, PAGE_TYPE_TRUNK); - bool32 success = cache_try_claim(context->cc, page); - platform_assert(success); - cache_lock(context->cc, page); - cache_unlock(context->cc, page); - cache_unclaim(context->cc, page); - cache_unget(context->cc, page); + while (__sync_lock_test_and_set(&context->pending_gcs_lock, 1)) { + platform_yield(); + } +} + +static void +pending_gcs_unlock(trunk_context *context) +{ + __sync_lock_release(&context->pending_gcs_lock); } + static void trunk_ondisk_node_dec_ref(trunk_context *context, uint64 addr) { - // FIXME: the cache needs to allow accessing pages in the AL_NO_REFS state. - // Otherwise there is a crazy race here. This is an attempt to handle it. - // - // The problem is that the cache doesn't let you access pages in the - // AL_NO_REFS state. As a result, if we do a dec_ref while another thread is - // accessing the node, then it might do a cache_get on a page of the node - // after we've done the dec_ref, causing an assertion violation in the cache. - // So what we do is we wait for all readers to go away, and then we do a - // dec_ref. If a reader comes in after we've done the dec_ref, then the - // refcount must have been more than 1 before we did the dec_ref, so it - // won't be in the AL_NO_REFS state, so the other reader will not have a - // problem. Note that waiting for readers to go away is wasteful when the - // refcount is > 1, so it would be nice to get rid of this restriction that - // we are working around. - // - // If we do get AL_NO_REFS after the dec_ref, then we also face another - // problem: we need to deserialize the node to perform recursive dec_refs. So - // we have to temporarilty inc_ref the node, do our work, and then dec_ref it - // again. Sigh. - trunk_ondisk_node_wait_for_readers(context, addr); - refcount rfc = allocator_dec_ref(context->al, addr, PAGE_TYPE_TRUNK); - if (rfc == AL_NO_REFS) { - trunk_node node; - allocator_inc_ref(context->al, addr); - platform_status rc = trunk_node_deserialize(context, addr, &node); - if (SUCCESS(rc)) { - if (!trunk_node_is_leaf(&node)) { - for (uint64 i = 0; i < vector_length(&node.pivots) - 1; i++) { - trunk_pivot *pvt = vector_get(&node.pivots, i); - trunk_ondisk_node_dec_ref(context, pvt->child_addr); - } - } - for (uint64 i = 0; i < vector_length(&node.pivot_bundles); i++) { - bundle *bndl = vector_get_ptr(&node.pivot_bundles, i); - bundle_dec_all_refs(context, bndl); + refcount ref = allocator_dec_ref(context->al, addr, PAGE_TYPE_TRUNK); + if (ref == AL_NO_REFS) { + if (cache_in_use(context->cc, addr)) { + pending_gc *pgc = TYPED_MALLOC(context->hid, pgc); + if (pgc == NULL) { + platform_error_log("%s():%d: TYPED_MALLOC() failed. We're gonna " + "leak some disk space.", + __func__, + __LINE__); + return; } - for (uint64 i = 0; i < vector_length(&node.inflight_bundles); i++) { - bundle *bndl = vector_get_ptr(&node.inflight_bundles, i); - bundle_dec_all_refs(context, bndl); + pgc->addr = addr; + pgc->next = NULL; + + pending_gcs_lock(context); + if (context->pending_gcs_tail == NULL) { + context->pending_gcs = pgc; + context->pending_gcs_tail = pgc; + } else { + context->pending_gcs_tail->next = pgc; + context->pending_gcs_tail = pgc; } - trunk_node_deinit(&node, context); + pending_gcs_unlock(context); + } else { - platform_error_log("%s():%d: node_deserialize() failed: %s", - __func__, - __LINE__, - platform_status_to_string(rc)); + trunk_ondisk_node_gc(context, addr); } - allocator_dec_ref(context->al, addr, PAGE_TYPE_TRUNK); - cache_extent_discard(context->cc, addr, PAGE_TYPE_TRUNK); - allocator_dec_ref(context->al, addr, PAGE_TYPE_TRUNK); } } @@ -2351,6 +2382,12 @@ trunk_init_root_handle(trunk_context *context, trunk_ondisk_node_handle *handle) return rc; } +uint64 +trunk_ondisk_node_handle_addr(const trunk_ondisk_node_handle *handle) +{ + return handle->header_page == NULL ? 0 : handle->header_page->disk_addr; +} + void trunk_modification_begin(trunk_context *context) { @@ -2371,11 +2408,34 @@ trunk_set_root(trunk_context *context, trunk_ondisk_node_ref *new_root_ref) } } +static void +perform_pending_gcs(trunk_context *context) +{ + pending_gcs_lock(context); + + pending_gc *pgc = context->pending_gcs; + + while (pgc && !cache_in_use(context->cc, pgc->addr)) { + trunk_ondisk_node_gc(context, pgc->addr); + pending_gc *next = pgc->next; + platform_free(context->hid, pgc); + pgc = next; + } + + context->pending_gcs = pgc; + if (pgc == NULL) { + context->pending_gcs_tail = NULL; + } + + pending_gcs_unlock(context); +} + void trunk_modification_end(trunk_context *context) { platform_batch_rwlock_unclaim(&context->root_lock, 0); platform_batch_rwlock_unget(&context->root_lock, 0); + perform_pending_gcs(context); } /************************* @@ -5787,6 +5847,8 @@ trunk_context_deinit(trunk_context *context) if (context->root != NULL) { trunk_ondisk_node_ref_destroy(context->root, context, context->hid); } + perform_pending_gcs(context); + platform_assert(context->pending_gcs == NULL); trunk_pivot_state_map_deinit(&context->pivot_states); platform_batch_rwlock_deinit(&context->root_lock); } diff --git a/src/trunk.h b/src/trunk.h index 67714604..a6d53635 100644 --- a/src/trunk.h +++ b/src/trunk.h @@ -153,6 +153,8 @@ typedef struct incorporation_tasks { trunk_node_vector node_compactions; } incorporation_tasks; +typedef struct pending_gc pending_gc; + typedef struct trunk_context { const trunk_config *cfg; platform_heap_id hid; @@ -165,6 +167,9 @@ typedef struct trunk_context { trunk_ondisk_node_ref *root; trunk_ondisk_node_ref *post_incorporation_root; trunk_ondisk_node_ref *pre_incorporation_root; + uint64 pending_gcs_lock; + pending_gc *pending_gcs; + pending_gc *pending_gcs_tail; incorporation_tasks tasks; } trunk_context; @@ -208,7 +213,6 @@ trunk_context_init(trunk_context *context, task_system *ts, uint64 root_addr); - platform_status trunk_inc_ref(const trunk_config *cfg, platform_heap_id hid, @@ -272,6 +276,9 @@ platform_status trunk_init_root_handle(trunk_context *context, trunk_ondisk_node_handle *handle); +uint64 +trunk_ondisk_node_handle_addr(const trunk_ondisk_node_handle *handle); + void trunk_ondisk_node_handle_deinit(trunk_ondisk_node_handle *handle); From 8632578e8370eb8d93150d2404f2acd480944b01 Mon Sep 17 00:00:00 2001 From: Rob Johnson Date: Thu, 3 Jul 2025 18:01:25 -0700 Subject: [PATCH 5/6] fix deadlocks --- src/clockcache.c | 2 +- src/trunk.c | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/clockcache.c b/src/clockcache.c index 586495a6..0337da22 100644 --- a/src/clockcache.c +++ b/src/clockcache.c @@ -1697,7 +1697,7 @@ clockcache_get_internal(clockcache *cc, // IN refcount extent_ref_count = allocator_get_refcount(cc->al, base_addr); // Dump allocated extents info for deeper debugging. - if (extent_ref_count AL_FREE) { + if (extent_ref_count == AL_FREE) { allocator_print_allocated(cc->al); } debug_assert((extent_ref_count != AL_FREE), diff --git a/src/trunk.c b/src/trunk.c index 5c841856..1243f300 100644 --- a/src/trunk.c +++ b/src/trunk.c @@ -5762,6 +5762,8 @@ trunk_context_init(trunk_context *context, task_system *ts, uint64 root_addr) { + memset(context, 0, sizeof(trunk_context)); + if (root_addr != 0) { context->root = trunk_ondisk_node_ref_create(hid, NEGATIVE_INFINITY_KEY, root_addr); @@ -5792,7 +5794,6 @@ trunk_context_init(trunk_context *context, trunk_pivot_state_map_init(&context->pivot_states); platform_batch_rwlock_init(&context->root_lock); - return STATUS_OK; } From bbffceb230b706872598120cf5c2d453c2c67968 Mon Sep 17 00:00:00 2001 From: Rob Johnson Date: Thu, 3 Jul 2025 19:18:02 -0700 Subject: [PATCH 6/6] align w/ sync code --- src/trunk.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/trunk.c b/src/trunk.c index 1243f300..36bfc584 100644 --- a/src/trunk.c +++ b/src/trunk.c @@ -5423,6 +5423,7 @@ trunk_merge_lookup_async(trunk_merge_lookup_async_state *state) { async_begin(state, 0); + state->rc = STATUS_OK; state->handlep = state->inhandle; while (state->handlep && state->handlep->header_page) {