From 27a908281a8326a3721c94181a5aeb0738614527 Mon Sep 17 00:00:00 2001 From: Richard Patel Date: Tue, 3 Mar 2026 04:52:00 +0000 Subject: [PATCH] checkpoint --- book/api/metrics-generated.md | 31 +- src/app/firedancer-dev/commands/backtest.c | 23 +- src/app/firedancer-dev/main.h | 2 + src/app/firedancer/callbacks_vinyl.c | 28 + src/app/firedancer/config/default.toml | 4 +- src/app/firedancer/config/mainnet.toml | 4 +- src/app/firedancer/main.c | 2 + src/app/firedancer/topology.c | 48 +- src/app/shared/commands/mem.c | 2 +- src/app/shared/commands/metrics.c | 2 +- src/app/shared/commands/monitor/monitor.c | 30 + src/app/shared/commands/watch/watch.c | 36 +- src/app/shared/fd_action.h | 1 + .../metrics/generated/fd_metrics_accdb.c | 4 +- .../metrics/generated/fd_metrics_accdb.h | 21 +- .../metrics/generated/fd_metrics_enums.h | 9 - .../metrics/generated/fd_metrics_execle.c | 6 + .../metrics/generated/fd_metrics_execle.h | 38 +- .../metrics/generated/fd_metrics_execrp.c | 6 + .../metrics/generated/fd_metrics_execrp.h | 76 +- .../metrics/generated/fd_metrics_replay.c | 15 +- .../metrics/generated/fd_metrics_replay.h | 89 +- src/disco/metrics/metrics.xml | 43 +- src/disco/topo/fd_topo.h | 3 + src/disco/topo/fd_topob_vinyl.h | 10 + src/discof/accdb/fd_accdb_case_acquire.c | 220 ++++ src/discof/accdb/fd_accdb_line_ctl.h | 38 + src/discof/accdb/fd_accdb_tile.c | 364 +++--- src/discof/accdb/fd_accdb_tile_cache.c | 231 ++++ src/discof/accdb/fd_accdb_tile_private.h | 236 ++++ src/discof/accdb/fd_accdb_tile_root.c | 563 +++++++++ src/discof/execle/fd_execle_tile.c | 8 + src/discof/execrp/fd_execrp_tile.c | 8 +- src/discof/fd_accdb_topo.c | 16 + src/discof/genesis/fd_genesi_tile.c | 12 +- src/discof/replay/fd_replay_tile.c | 75 +- src/flamenco/accdb/Local.mk | 7 - src/flamenco/accdb/fd_accdb_admin_v2.c | 189 +-- src/flamenco/accdb/fd_accdb_admin_v2.h | 30 +- .../accdb/fd_accdb_admin_v2_private.h | 62 - src/flamenco/accdb/fd_accdb_admin_v2_root.c | 379 ------ 
src/flamenco/accdb/fd_accdb_base.h | 1 + src/flamenco/accdb/fd_accdb_ctl.c | 771 ------------ src/flamenco/accdb/fd_accdb_impl_v2.c | 317 +++-- src/flamenco/accdb/fd_accdb_impl_v2.h | 22 + src/flamenco/accdb/fd_accdb_specread.h | 136 ++ src/flamenco/accdb/fd_accdb_user.h | 10 + src/flamenco/accdb/test_accdb_v2.c | 670 ---------- src/flamenco/runtime/fd_executor.c | 85 +- src/vinyl/Local.mk | 8 +- src/vinyl/data/fd_vinyl_data.c | 108 +- src/vinyl/data/fd_vinyl_data.h | 50 +- src/vinyl/data/test_vinyl_data.c | 2 +- src/vinyl/fd_vinyl.c | 118 -- src/vinyl/fd_vinyl.h | 98 -- src/vinyl/fd_vinyl_case_acquire.c | 399 ------ src/vinyl/fd_vinyl_case_erase.c | 108 -- src/vinyl/fd_vinyl_case_fetch.c | 116 -- src/vinyl/fd_vinyl_case_flush.c | 68 - src/vinyl/fd_vinyl_case_move.c | 326 ----- src/vinyl/fd_vinyl_case_release.c | 352 ------ src/vinyl/fd_vinyl_case_test.c | 39 - src/vinyl/fd_vinyl_case_try.c | 180 --- src/vinyl/fd_vinyl_compact.c | 377 ------ src/vinyl/fd_vinyl_ctl.c | 712 ----------- src/vinyl/fd_vinyl_ctl_help | 91 -- src/vinyl/fd_vinyl_exec.c | 696 ----------- src/vinyl/fd_vinyl_recover.c | 731 ----------- src/vinyl/fd_vinyl_recover_serial.c | 351 ------ src/vinyl/line/fd_vinyl_line.h | 7 +- src/vinyl/test_vinyl_req.c | 1101 ----------------- 71 files changed, 2559 insertions(+), 8462 deletions(-) create mode 100644 src/discof/accdb/fd_accdb_case_acquire.c create mode 100644 src/discof/accdb/fd_accdb_line_ctl.h create mode 100644 src/discof/accdb/fd_accdb_tile_cache.c create mode 100644 src/discof/accdb/fd_accdb_tile_private.h create mode 100644 src/discof/accdb/fd_accdb_tile_root.c delete mode 100644 src/flamenco/accdb/fd_accdb_admin_v2_private.h delete mode 100644 src/flamenco/accdb/fd_accdb_admin_v2_root.c delete mode 100644 src/flamenco/accdb/fd_accdb_ctl.c create mode 100644 src/flamenco/accdb/fd_accdb_specread.h delete mode 100644 src/flamenco/accdb/test_accdb_v2.c delete mode 100644 src/vinyl/fd_vinyl_case_acquire.c delete mode 100644 
src/vinyl/fd_vinyl_case_erase.c delete mode 100644 src/vinyl/fd_vinyl_case_fetch.c delete mode 100644 src/vinyl/fd_vinyl_case_flush.c delete mode 100644 src/vinyl/fd_vinyl_case_move.c delete mode 100644 src/vinyl/fd_vinyl_case_release.c delete mode 100644 src/vinyl/fd_vinyl_case_test.c delete mode 100644 src/vinyl/fd_vinyl_case_try.c delete mode 100644 src/vinyl/fd_vinyl_compact.c delete mode 100644 src/vinyl/fd_vinyl_ctl.c delete mode 100644 src/vinyl/fd_vinyl_ctl_help delete mode 100644 src/vinyl/fd_vinyl_exec.c delete mode 100644 src/vinyl/fd_vinyl_recover.c delete mode 100644 src/vinyl/fd_vinyl_recover_serial.c delete mode 100644 src/vinyl/test_vinyl_req.c diff --git a/book/api/metrics-generated.md b/book/api/metrics-generated.md index aa72b7ed776..1ffbe0c75b7 100644 --- a/book/api/metrics-generated.md +++ b/book/api/metrics-generated.md @@ -610,6 +610,12 @@ | execle_​transaction_​landed
{transaction_​landed="landed_​failed"} | counter | Whether a transaction landed in the block or not (Transaction landed, but failed to execute) | | execle_​transaction_​landed
{transaction_​landed="unlanded"} | counter | Whether a transaction landed in the block or not (Transaction did not land) | | execle_​compute_​units_​total | counter | Estimated number of compute units executed since tile start | +| execle_​accdb_​lookup_​funk | counter | Number of account lookups resolved from funk (in-memory fork store) | +| execle_​accdb_​lookup_​specrd | counter | Number of account lookups resolved from speculative read (vinyl cache) | +| execle_​accdb_​lookup_​accdb | counter | Number of account lookups sent to accdb tile (vinyl rq/cq) | +| execle_​accdb_​dt_​funk | counter | Cumulative time spent in funk (in-memory) account lookups | +| execle_​accdb_​dt_​specrd | counter | Cumulative time spent in speculative read (vinyl cache) account lookups | +| execle_​accdb_​dt_​vinyl | counter | Cumulative time spent waiting for vinyl rq/cq account lookups | @@ -925,15 +931,12 @@ | replay_​progcache_​gc_​root | counter | Number of program cache entries garbage collected while rooting | | replay_​accdb_​created | counter | Number of account database records created | | replay_​accdb_​reverted | counter | Number of account database records reverted | -| replay_​accdb_​rooted | counter | Number of account database entries rooted | -| replay_​accdb_​rooted_​bytes | counter | Number of bytes in account database entries rooted (including overhead) | -| replay_​accdb_​gc_​root | counter | Number of account database entries garbage collected | -| replay_​accdb_​reclaimed | counter | Number of account database entries reclaimed (deletion rooted) | -| replay_​root_​slot_​duration_​seconds | histogram | Time in seconds spent updating the rooted account store (one sample per block) | -| replay_​root_​account_​duration_​seconds | histogram | Time in seconds spent updating the rooted account store (one sample per block, normalized by account count) | -| replay_​root_​elapsed_​seconds
{root_​phase="db"} | counter | Total time in seconds spent rooting accounts (Waiting on database server) | -| replay_​root_​elapsed_​seconds
{root_​phase="copy"} | counter | Total time in seconds spent rooting accounts (Copying account data) | -| replay_​root_​elapsed_​seconds
{root_​phase="gc"} | counter | Total time in seconds spent rooting accounts (Garbage collecting old account data) | +| replay_​accdb_​lookup_​funk | counter | Number of account lookups resolved from funk (in-memory fork store) | +| replay_​accdb_​lookup_​specrd | counter | Number of account lookups resolved from speculative read (vinyl cache) | +| replay_​accdb_​lookup_​accdb | counter | Number of account lookups sent to accdb tile (vinyl rq/cq) | +| replay_​accdb_​dt_​funk | counter | Cumulative time spent in funk (in-memory) account lookups | +| replay_​accdb_​dt_​specrd | counter | Cumulative time spent in speculative read (vinyl cache) account lookups | +| replay_​accdb_​dt_​vinyl | counter | Cumulative time spent waiting for vinyl rq/cq account lookups | @@ -951,6 +954,12 @@ | execrp_​progcache_​dup_​inserts | counter | Number of time two tiles raced to insert the same cache entry | | execrp_​progcache_​invalidations | counter | Number of program cache invalidations | | execrp_​accdb_​created | counter | Number of account database records created | +| execrp_​accdb_​lookup_​funk | counter | Number of account lookups resolved from funk (in-memory fork store) | +| execrp_​accdb_​lookup_​specrd | counter | Number of account lookups resolved from speculative read (vinyl cache) | +| execrp_​accdb_​lookup_​accdb | counter | Number of account lookups sent to accdb tile (vinyl rq/cq) | +| execrp_​accdb_​dt_​funk | counter | Cumulative time spent in funk (in-memory) account lookups | +| execrp_​accdb_​dt_​specrd | counter | Cumulative time spent in speculative read (vinyl cache) account lookups | +| execrp_​accdb_​dt_​vinyl | counter | Cumulative time spent waiting for vinyl rq/cq account lookups | | execrp_​txn_​regime
{txn_​regime="setup"} | counter | Mutually exclusive and exhaustive duration of time spent in transaction execution regimes (Transaction setup) | | execrp_​txn_​regime
{txn_​regime="exec"} | counter | Mutually exclusive and exhaustive duration of time spent in transaction execution regimes (Transaction execution (includes VM setup/execution)) | | execrp_​txn_​regime
{txn_​regime="commit"} | counter | Mutually exclusive and exhaustive duration of time spent in transaction execution regimes (Transaction result commit) | @@ -994,9 +1003,7 @@ | accdb_​bstream_​seq
{bstream_​seq="present"} | gauge | Current bstream sequence number (Blocks between present and future are being written (write only)) | | accdb_​bstream_​seq
{bstream_​seq="future"} | gauge | Current bstream sequence number (Blocks between future and ancient have not been written (no read, no write)) | | accdb_​request_​batches | counter | Number of request batches processed | -| accdb_​requests
{vinyl_​request="acquire"} | counter | Number of requests processed (Acquire record) | -| accdb_​requests
{vinyl_​request="release"} | counter | Number of requests processed (Release record) | -| accdb_​requests
{vinyl_​request="erase"} | counter | Number of requests processed (Erase record) | +| accdb_​requests | counter | Number of requests processed | | accdb_​blocks
{vinyl_​blocks="pair"} | counter | Number of blocks written to bstream (Record) | | accdb_​blocks
{vinyl_​blocks="dead"} | counter | Number of blocks written to bstream (Record deletion) | | accdb_​blocks
{vinyl_​blocks="part"} | counter | Number of blocks written to bstream (Partition/divider) | diff --git a/src/app/firedancer-dev/commands/backtest.c b/src/app/firedancer-dev/commands/backtest.c index 9113f1409b6..f25d01c2d8b 100644 --- a/src/app/firedancer-dev/commands/backtest.c +++ b/src/app/firedancer-dev/commands/backtest.c @@ -23,6 +23,7 @@ #include "../../../disco/topo/fd_topob_vinyl.h" #include "../../../util/pod/fd_pod_format.h" #include "../../../discof/genesis/fd_genesi_tile.h" +#include "../../../funk/fd_funk_base.h" #include "../../../discof/replay/fd_replay_tile.h" #include "../../../discof/restore/fd_snapin_tile_private.h" #include "../../../discof/restore/fd_snaplv_tile_private.h" @@ -90,6 +91,11 @@ backtest_topo( config_t * config ) { ulong funk_locks_obj_id; FD_TEST( (funk_locks_obj_id = fd_pod_query_ulong( topo->props, "funk_locks", ULONG_MAX ) )!=ULONG_MAX ); fd_topob_tile_uses( topo, replay_tile, &topo->objs[ funk_obj_id ], FD_SHMEM_JOIN_MODE_READ_WRITE ); fd_topob_tile_uses( topo, replay_tile, &topo->objs[ funk_locks_obj_id ], FD_SHMEM_JOIN_MODE_READ_WRITE ); + if( vinyl_enabled ) { + fd_topo_tile_t * accdb_tile = &topo->tiles[ fd_topo_find_tile( topo, "accdb", 0UL ) ]; + fd_topob_tile_uses( topo, accdb_tile, &topo->objs[ funk_obj_id ], FD_SHMEM_JOIN_MODE_READ_WRITE ); + fd_topob_tile_uses( topo, accdb_tile, &topo->objs[ funk_locks_obj_id ], FD_SHMEM_JOIN_MODE_READ_WRITE ); + } fd_topob_wksp( topo, "progcache" ); setup_topo_progcache( topo, "progcache", @@ -349,8 +355,10 @@ backtest_topo( config_t * config ) { setup_topo_accdb_meta( topo, &config->firedancer ); ulong vinyl_map_obj_id = fd_pod_query_ulong( topo->props, "accdb.meta_map", ULONG_MAX ); FD_TEST( vinyl_map_obj_id !=ULONG_MAX ); ulong vinyl_pool_obj_id = fd_pod_query_ulong( topo->props, "accdb.meta_pool", ULONG_MAX ); FD_TEST( vinyl_pool_obj_id!=ULONG_MAX ); + ulong vinyl_line_obj_id = fd_pod_query_ulong( topo->props, "accdb.line", ULONG_MAX ); FD_TEST( vinyl_line_obj_id!=ULONG_MAX ); 
fd_topo_obj_t * accdb_map_obj = &topo->objs[ vinyl_map_obj_id ]; fd_topo_obj_t * accdb_pool_obj = &topo->objs[ vinyl_pool_obj_id ]; + fd_topo_obj_t * accdb_line_obj = &topo->objs[ vinyl_line_obj_id ]; fd_topo_obj_t * accdb_data = setup_topo_accdb_cache( topo, &config->firedancer ); @@ -359,13 +367,20 @@ backtest_topo( config_t * config ) { fd_topob_tile_uses( topo, accdb_tile, accdb_data, FD_SHMEM_JOIN_MODE_READ_WRITE ); fd_topob_tile_uses( topo, accdb_tile, accdb_map_obj, FD_SHMEM_JOIN_MODE_READ_WRITE ); fd_topob_tile_uses( topo, accdb_tile, accdb_pool_obj, FD_SHMEM_JOIN_MODE_READ_WRITE ); + fd_topob_tile_uses( topo, accdb_tile, accdb_line_obj, FD_SHMEM_JOIN_MODE_READ_WRITE ); - fd_topob_tile_uses( topo, replay_tile, accdb_data, FD_SHMEM_JOIN_MODE_READ_WRITE ); + fd_topob_tile_uses( topo, replay_tile, accdb_data, FD_SHMEM_JOIN_MODE_READ_WRITE ); + fd_topob_tile_uses( topo, replay_tile, accdb_line_obj, FD_SHMEM_JOIN_MODE_READ_WRITE ); for( ulong i=0UL; itiles[ fd_topo_find_tile( topo, "execrp", i ) ], accdb_data, FD_SHMEM_JOIN_MODE_READ_WRITE ); + fd_topo_tile_t * t = &topo->tiles[ fd_topo_find_tile( topo, "execrp", i ) ]; + fd_topob_tile_uses( topo, t, accdb_data, FD_SHMEM_JOIN_MODE_READ_WRITE ); + fd_topob_tile_uses( topo, t, accdb_line_obj, FD_SHMEM_JOIN_MODE_READ_WRITE ); } fd_topob_wksp( topo, "accdb_replay" ); + + fd_topob_wksp( topo, "replay_accdb" ); + fd_topob_link( topo, "replay_accdb", "replay_accdb", 128UL, sizeof(fd_funk_txn_xid_t), 1UL ); } /**********************************************************************/ @@ -415,6 +430,10 @@ backtest_topo( config_t * config ) { fd_topob_wksp( topo, "replay_execrp" ); fd_topob_link( topo, "replay_execrp", "replay_execrp", 16384UL, 2240UL, 1UL ); fd_topob_tile_out( topo, "replay", 0UL, "replay_execrp", 0UL ); + if( vinyl_enabled ) { + fd_topob_tile_out( topo, "replay", 0UL, "replay_accdb", 0UL ); + fd_topob_tile_in( topo, "accdb", 0UL, "metric_in", "replay_accdb", 0UL, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED ); + } 
for( ulong i=0UL; iid ), 0, vinyl_line_footprint( topo, obj ) ); +} + +fd_topo_obj_callbacks_t fd_obj_cb_vinyl_line = { + .name = "vinyl_line", + .footprint = vinyl_line_footprint, + .align = vinyl_line_align, + .new = vinyl_line_new, +}; + /* vinyl_data: memory arena for data cache entries */ static ulong diff --git a/src/app/firedancer/config/default.toml b/src/app/firedancer/config/default.toml index 2acb3d2350c..1602125f88f 100644 --- a/src/app/firedancer/config/default.toml +++ b/src/app/firedancer/config/default.toml @@ -543,7 +543,7 @@ telemetry = true # how much memory is reserved for such account changes. # If the amount of inflight account changes exceeds this limit, the # validator will crash. - max_unrooted_account_size_gib = 16 + max_unrooted_account_size_gib = 64 # Keep frequently accessed accounts in memory to improve # performance. @@ -579,7 +579,7 @@ telemetry = true # The expected mean size of recently accessed accounts. # # This heuristic is used to pack accounts into caches optimally. 
- mean_account_footprint = 256 + mean_account_footprint = 64 # io_uring specific options # diff --git a/src/app/firedancer/config/mainnet.toml b/src/app/firedancer/config/mainnet.toml index 7c6509b06d6..e85d0ac3fb8 100644 --- a/src/app/firedancer/config/mainnet.toml +++ b/src/app/firedancer/config/mainnet.toml @@ -9,8 +9,8 @@ [consensus] expected_genesis_hash = "5eykt4UsFv8P8NJdTREpY1vzqKqZKvdpKuc147dw2N9d" [accounts] - file_size_gib = 600 - max_accounts = 1_100_000_000 + file_size_gib = 700 + max_accounts = 1_300_000_000 [snapshots] [snapshots.sources] servers = [ "http://solana-mainnet-rpc.jumpisolated.com:8899" ] diff --git a/src/app/firedancer/main.c b/src/app/firedancer/main.c index 7445f0dd0f5..40776ee742f 100644 --- a/src/app/firedancer/main.c +++ b/src/app/firedancer/main.c @@ -27,6 +27,7 @@ extern fd_topo_obj_callbacks_t fd_obj_cb_rnonce_ss; extern fd_topo_obj_callbacks_t fd_obj_cb_vinyl_meta; extern fd_topo_obj_callbacks_t fd_obj_cb_vinyl_meta_ele; +extern fd_topo_obj_callbacks_t fd_obj_cb_vinyl_line; extern fd_topo_obj_callbacks_t fd_obj_cb_vinyl_data; extern fd_topo_obj_callbacks_t fd_obj_cb_vinyl_req_pool; extern fd_topo_obj_callbacks_t fd_obj_cb_vinyl_rq; @@ -53,6 +54,7 @@ fd_topo_obj_callbacks_t * CALLBACKS[] = { &fd_obj_cb_acc_pool, &fd_obj_cb_vinyl_meta, &fd_obj_cb_vinyl_meta_ele, + &fd_obj_cb_vinyl_line, &fd_obj_cb_vinyl_data, &fd_obj_cb_acc_pool, &fd_obj_cb_vinyl_req_pool, diff --git a/src/app/firedancer/topology.c b/src/app/firedancer/topology.c index 0f562d5b9b8..d3dda80850c 100644 --- a/src/app/firedancer/topology.c +++ b/src/app/firedancer/topology.c @@ -28,6 +28,7 @@ #include "../../discof/restore/utils/fd_ssctrl.h" #include "../../discof/restore/utils/fd_ssmsg.h" #include "../../flamenco/capture/fd_solcap_writer.h" +#include "../../funk/fd_funk_base.h" #include "../../flamenco/progcache/fd_progcache_admin.h" #include "../../flamenco/runtime/fd_acc_pool.h" #include "../../flamenco/accdb/fd_accdb_lineage.h" @@ -216,8 +217,11 @@ 
setup_topo_accdb_meta( fd_topo_t * topo, fd_topo_obj_t * meta_pool_obj = fd_topob_obj( topo, "vinyl_meta_e", "accdb_meta" ); fd_pod_insertf_ulong( topo->props, meta_max, "obj.%lu.cnt", meta_pool_obj->id ); + fd_topo_obj_t * line_obj = fd_topob_obj( topo, "vinyl_line", "accdb_meta" ); + fd_pod_insert_ulong( topo->props, "accdb.meta_map", map_obj->id ); fd_pod_insert_ulong( topo->props, "accdb.meta_pool", meta_pool_obj->id ); + fd_pod_insert_ulong( topo->props, "accdb.line", line_obj->id ); } fd_topo_obj_t * @@ -617,6 +621,10 @@ fd_topo_initialize( config_t * config ) { /**/ fd_topob_link( topo, "replay_epoch", "replay_epoch", 128UL, FD_EPOCH_OUT_MTU, 1UL ); /* TODO: This should be 2 but requires fixing STEM_BURST */ /**/ fd_topob_link( topo, "replay_out", "replay_out", 65536UL, sizeof(fd_replay_message_t), 1UL ); fd_topob_link( topo, "replay_execrp", "replay_execrp", 16384UL, sizeof(fd_execrp_task_msg_t), 1UL ); + if( !config->firedancer.accounts.in_memory_only ) { + fd_topob_wksp( topo, "replay_accdb" ); + fd_topob_link( topo, "replay_accdb", "replay_accdb", 128UL, sizeof(fd_funk_txn_xid_t), 1UL ); + } if( leader_enabled ) { /**/ fd_topob_link( topo, "dedup_resolv", "dedup_resolv", 65536UL, FD_TPU_PARSED_MTU, 1UL ); FOR(resolv_tile_cnt) fd_topob_link( topo, "resolv_pack", "resolv_pack", 65536UL, FD_TPU_RESOLVED_MTU, 1UL ); @@ -748,32 +756,48 @@ fd_topo_initialize( config_t * config ) { ulong vinyl_map_obj_id = fd_pod_query_ulong( topo->props, "accdb.meta_map", ULONG_MAX ); FD_TEST( vinyl_map_obj_id !=ULONG_MAX ); ulong vinyl_pool_obj_id = fd_pod_query_ulong( topo->props, "accdb.meta_pool", ULONG_MAX ); FD_TEST( vinyl_pool_obj_id!=ULONG_MAX ); + ulong vinyl_line_obj_id = fd_pod_query_ulong( topo->props, "accdb.line", ULONG_MAX ); FD_TEST( vinyl_line_obj_id!=ULONG_MAX ); fd_topo_obj_t * accdb_map_obj = &topo->objs[ vinyl_map_obj_id ]; fd_topo_obj_t * accdb_pool_obj = &topo->objs[ vinyl_pool_obj_id ]; + fd_topo_obj_t * accdb_line_obj = &topo->objs[ vinyl_line_obj_id 
]; fd_topob_wksp( topo, "accdb" ); fd_topo_tile_t * accdb_tile = fd_topob_tile( topo, "accdb", "accdb", "metric_in", tile_to_cpu[ topo->tile_cnt ], 0, 0, 0 ); fd_topob_tile_uses( topo, accdb_tile, accdb_data, FD_SHMEM_JOIN_MODE_READ_WRITE ); fd_topob_tile_uses( topo, accdb_tile, accdb_map_obj, FD_SHMEM_JOIN_MODE_READ_WRITE ); fd_topob_tile_uses( topo, accdb_tile, accdb_pool_obj, FD_SHMEM_JOIN_MODE_READ_WRITE ); + fd_topob_tile_uses( topo, accdb_tile, accdb_line_obj, FD_SHMEM_JOIN_MODE_READ_WRITE ); fd_topob_tile_uses( topo, &topo->tiles[ fd_topo_find_tile( topo, "genesi", 0UL ) ], accdb_data, FD_SHMEM_JOIN_MODE_READ_WRITE ); - fd_topob_tile_uses( topo, &topo->tiles[ fd_topo_find_tile( topo, "replay", 0UL ) ], accdb_data, FD_SHMEM_JOIN_MODE_READ_WRITE ); + fd_topo_tile_t * replay_tile = &topo->tiles[ fd_topo_find_tile( topo, "replay", 0UL ) ]; + fd_topob_tile_uses( topo, replay_tile, accdb_data, FD_SHMEM_JOIN_MODE_READ_WRITE ); + fd_topob_tile_uses( topo, replay_tile, accdb_line_obj, FD_SHMEM_JOIN_MODE_READ_WRITE ); for( ulong i=0UL; itiles[ fd_topo_find_tile( topo, "execrp", i ) ], accdb_data, FD_SHMEM_JOIN_MODE_READ_WRITE ); + fd_topo_tile_t * t = &topo->tiles[ fd_topo_find_tile( topo, "execrp", i ) ]; + fd_topob_tile_uses( topo, t, accdb_data, FD_SHMEM_JOIN_MODE_READ_WRITE ); + fd_topob_tile_uses( topo, t, accdb_line_obj, FD_SHMEM_JOIN_MODE_READ_WRITE ); } for( ulong i=0UL; itiles[ fd_topo_find_tile( topo, "execle", i ) ], accdb_data, FD_SHMEM_JOIN_MODE_READ_WRITE ); + fd_topo_tile_t * t = &topo->tiles[ fd_topo_find_tile( topo, "execle", i ) ]; + fd_topob_tile_uses( topo, t, accdb_data, FD_SHMEM_JOIN_MODE_READ_WRITE ); + fd_topob_tile_uses( topo, t, accdb_line_obj, FD_SHMEM_JOIN_MODE_READ_WRITE ); + } + fd_topo_tile_t * tower_tile = &topo->tiles[ fd_topo_find_tile( topo, "tower", 0UL ) ]; + fd_topob_tile_uses( topo, tower_tile, accdb_data, FD_SHMEM_JOIN_MODE_READ_ONLY ); + fd_topob_tile_uses( topo, tower_tile, accdb_line_obj, FD_SHMEM_JOIN_MODE_READ_WRITE ); + 
FOR(resolv_tile_cnt) { + fd_topo_tile_t * t = &topo->tiles[ fd_topo_find_tile( topo, "resolv", i ) ]; + fd_topob_tile_uses( topo, t, accdb_data, FD_SHMEM_JOIN_MODE_READ_ONLY ); + fd_topob_tile_uses( topo, t, accdb_line_obj, FD_SHMEM_JOIN_MODE_READ_WRITE ); } - fd_topob_tile_uses( topo, &topo->tiles[ fd_topo_find_tile( topo, "tower", 0UL ) ], accdb_data, FD_SHMEM_JOIN_MODE_READ_ONLY ); - FOR(resolv_tile_cnt) fd_topob_tile_uses( topo, &topo->tiles[ fd_topo_find_tile( topo, "resolv", i ) ], accdb_data, FD_SHMEM_JOIN_MODE_READ_ONLY ); if( rpc_enabled ) { - fd_topob_tile_uses( topo, &topo->tiles[ fd_topo_find_tile( topo, "rpc", 0UL ) ], accdb_data, FD_SHMEM_JOIN_MODE_READ_ONLY ); + fd_topo_tile_t * t = &topo->tiles[ fd_topo_find_tile( topo, "rpc", 0UL ) ]; + fd_topob_tile_uses( topo, t, accdb_data, FD_SHMEM_JOIN_MODE_READ_ONLY ); + fd_topob_tile_uses( topo, t, accdb_line_obj, FD_SHMEM_JOIN_MODE_READ_WRITE ); } - fd_topob_wksp( topo, "accdb_genesi" ); fd_topob_wksp( topo, "accdb_replay" ); fd_topob_wksp( topo, "accdb_execrp" ); if( leader_enabled ) fd_topob_wksp( topo, "accdb_execle" ); @@ -929,6 +953,10 @@ fd_topo_initialize( config_t * config ) { /**/ fd_topob_tile_out( topo, "replay", 0UL, "replay_out", 0UL ); /**/ fd_topob_tile_out( topo, "replay", 0UL, "replay_epoch", 0UL ); /**/ fd_topob_tile_out( topo, "replay", 0UL, "replay_execrp", 0UL ); + if( vinyl_enabled ) { + fd_topob_tile_out( topo, "replay", 0UL, "replay_accdb", 0UL ); + fd_topob_tile_in( topo, "accdb", 0UL, "metric_in", "replay_accdb", 0UL, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED ); + } FOR(execrp_tile_cnt) fd_topob_tile_in ( topo, "replay", 0UL, "metric_in", "execrp_replay", i, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED ); if(leader_enabled) {fd_topob_tile_in ( topo, "replay", 0UL, "metric_in", "poh_replay", 0UL, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED );} /**/ fd_topob_tile_in ( topo, "replay", 0UL, "metric_in", "tower_out", 0UL, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED ); @@ -1233,12 +1261,14 @@ fd_topo_initialize( config_t 
* config ) { FOR(execrp_tile_cnt) fd_topob_tile_uses( topo, &topo->tiles[ fd_topo_find_tile( topo, "execrp", i ) ], funk_obj, FD_SHMEM_JOIN_MODE_READ_WRITE ); FOR(execle_tile_cnt) fd_topob_tile_uses( topo, &topo->tiles[ fd_topo_find_tile( topo, "execle", i ) ], funk_obj, FD_SHMEM_JOIN_MODE_READ_WRITE ); FOR(resolv_tile_cnt) fd_topob_tile_uses( topo, &topo->tiles[ fd_topo_find_tile( topo, "resolv", i ) ], funk_obj, FD_SHMEM_JOIN_MODE_READ_ONLY ); + if( vinyl_enabled ) fd_topob_tile_uses( topo, &topo->tiles[ fd_topo_find_tile( topo, "accdb", 0UL ) ], funk_obj, FD_SHMEM_JOIN_MODE_READ_WRITE ); /**/ fd_topob_tile_uses( topo, &topo->tiles[ fd_topo_find_tile( topo, "replay", 0UL ) ], funk_locks_obj, FD_SHMEM_JOIN_MODE_READ_WRITE ); /**/ fd_topob_tile_uses( topo, &topo->tiles[ fd_topo_find_tile( topo, "tower", 0UL ) ], funk_locks_obj, FD_SHMEM_JOIN_MODE_READ_WRITE ); FOR(execrp_tile_cnt) fd_topob_tile_uses( topo, &topo->tiles[ fd_topo_find_tile( topo, "execrp", i ) ], funk_locks_obj, FD_SHMEM_JOIN_MODE_READ_WRITE ); FOR(execle_tile_cnt) fd_topob_tile_uses( topo, &topo->tiles[ fd_topo_find_tile( topo, "execle", i ) ], funk_locks_obj, FD_SHMEM_JOIN_MODE_READ_WRITE ); FOR(resolv_tile_cnt) fd_topob_tile_uses( topo, &topo->tiles[ fd_topo_find_tile( topo, "resolv", i ) ], funk_locks_obj, FD_SHMEM_JOIN_MODE_READ_WRITE ); + if( vinyl_enabled ) fd_topob_tile_uses( topo, &topo->tiles[ fd_topo_find_tile( topo, "accdb", 0UL ) ], funk_locks_obj, FD_SHMEM_JOIN_MODE_READ_WRITE ); fd_topo_obj_t * banks_obj = setup_topo_banks( topo, "banks", config->firedancer.runtime.max_live_slots, config->firedancer.runtime.max_fork_width, config->development.bench.larger_max_cost_per_block ); /**/ fd_topob_tile_uses( topo, &topo->tiles[ fd_topo_find_tile( topo, "replay", 0UL ) ], banks_obj, FD_SHMEM_JOIN_MODE_READ_WRITE ); @@ -1387,7 +1417,6 @@ fd_topo_initialize( config_t * config ) { fd_pod_insert_int( topo->props, "sandbox", config->development.sandbox ? 
1 : 0 ); if( vinyl_enabled ) { - fd_topob_vinyl_rq( topo, "genesi", 0UL, "accdb_genesi", "genesi", 4UL, 1024UL, 1024UL ); fd_topob_vinyl_rq( topo, "replay", 0UL, "accdb_replay", "replay", 4UL, 1024UL, 1024UL ); for( ulong i=0UL; iaccdb.meta_map_obj_id = fd_pod_query_ulong( config->topo.props, "accdb.meta_map", ULONG_MAX ); tile->accdb.meta_pool_obj_id = fd_pod_query_ulong( config->topo.props, "accdb.meta_pool", ULONG_MAX ); + tile->accdb.line_obj_id = fd_pod_query_ulong( config->topo.props, "accdb.line", ULONG_MAX ); tile->accdb.line_max = (config->firedancer.accounts.cache_size_gib << 30) / config->firedancer.accounts.mean_account_footprint; tile->accdb.data_obj_id = fd_pod_query_ulong( config->topo.props, "accdb.data", ULONG_MAX ); fd_cstr_ncpy( tile->accdb.bstream_path, config->paths.accounts, sizeof(tile->accdb.bstream_path) ); @@ -1893,6 +1923,7 @@ fd_topo_configure_tile( fd_topo_tile_t * tile, tile->accdb.io_type = !strcmp(config->firedancer.accounts.io_provider, "io_uring") ? FD_VINYL_IO_TYPE_UR : FD_VINYL_IO_TYPE_BD; tile->accdb.uring_depth = config->firedancer.accounts.io_uring.queue_depth; + tile->accdb.write_delay_slots = config->firedancer.accounts.write_delay_slots; /* Minimum bound for cache entry count */ ulong required_cache_entries = 0UL; @@ -1908,6 +1939,7 @@ fd_topo_configure_tile( fd_topo_tile_t * tile, if( FD_UNLIKELY( required_cache_entries > tile->accdb.line_max ) ) { tile->accdb.line_max = required_cache_entries; } + fd_pod_insertf_ulong( config->topo.props, tile->accdb.line_max, "obj.%lu.line_cnt", tile->accdb.line_obj_id ); } else if( FD_UNLIKELY( !strcmp( tile->name, "solcap" ) ) ) { diff --git a/src/app/shared/commands/mem.c b/src/app/shared/commands/mem.c index 8ac10f447f7..aad09d4323a 100644 --- a/src/app/shared/commands/mem.c +++ b/src/app/shared/commands/mem.c @@ -20,7 +20,7 @@ reconstruct_topo( config_t * config, if( !topo_name[0] ) return; /* keep default action topo */ action_t const * selected = NULL; - for( action_t ** 
a=ACTIONS; a; a++ ) { + for( action_t ** a=ACTIONS; *a; a++ ) { action_t const * action = *a; if( 0==strcmp( action->name, topo_name ) ) { selected = action; diff --git a/src/app/shared/commands/metrics.c b/src/app/shared/commands/metrics.c index 3f175d2649c..883a689c6ec 100644 --- a/src/app/shared/commands/metrics.c +++ b/src/app/shared/commands/metrics.c @@ -27,7 +27,7 @@ reconstruct_topo( config_t * config, if( !topo_name[0] ) return; /* keep default action topo */ action_t const * selected = NULL; - for( action_t ** a=ACTIONS; a; a++ ) { + for( action_t ** a=ACTIONS; *a; a++ ) { action_t const * action = *a; if( 0==strcmp( action->name, topo_name ) ) { selected = action; diff --git a/src/app/shared/commands/monitor/monitor.c b/src/app/shared/commands/monitor/monitor.c index 35c20a00b10..4582b2538f1 100644 --- a/src/app/shared/commands/monitor/monitor.c +++ b/src/app/shared/commands/monitor/monitor.c @@ -22,6 +22,8 @@ #include #include "generated/monitor_seccomp.h" +extern action_t * ACTIONS[]; + void monitor_cmd_args( int * pargc, char *** pargv, @@ -35,6 +37,12 @@ monitor_cmd_args( int * pargc, args->monitor.with_bench = fd_env_strip_cmdline_contains( pargc, pargv, "--bench" ); args->monitor.with_sankey = fd_env_strip_cmdline_contains( pargc, pargv, "--sankey" ); + char const * topo_name = fd_env_strip_cmdline_cstr( pargc, pargv, "--topo", NULL, "" ); + + ulong topo_name_len = strlen( topo_name ); + if( FD_UNLIKELY( topo_name_len > sizeof(args->monitor.topo)-1 ) ) FD_LOG_ERR(( "Unknown --topo %s", topo_name )); + fd_cstr_fini( fd_cstr_append_text( fd_cstr_init( args->monitor.topo ), topo_name, topo_name_len ) ); + if( FD_UNLIKELY( args->monitor.dt_min<0L ) ) FD_LOG_ERR(( "--dt-min should be positive" )); if( FD_UNLIKELY( args->monitor.dt_maxmonitor.dt_min ) ) FD_LOG_ERR(( "--dt-max should be at least --dt-min" )); if( FD_UNLIKELY( args->monitor.duration<0L ) ) FD_LOG_ERR(( "--duration should be non-negative" )); @@ -498,9 +506,31 @@ signal1( int sig ) { exit( 
0 ); /* gracefully exit */ } +static void +reconstruct_topo( config_t * config, + char const * topo_name ) { + if( !topo_name[0] ) return; /* keep default action topo */ + + action_t const * selected = NULL; + for( action_t ** a=ACTIONS; *a; a++ ) { + action_t const * action = *a; + if( 0==strcmp( action->name, topo_name ) ) { + selected = action; + break; + } + } + + if( !selected ) FD_LOG_ERR(( "Unknown --topo %s", topo_name )); + if( !selected->topo ) FD_LOG_ERR(( "Cannot recover topology for --topo %s", topo_name )); + + selected->topo( config ); +} + void monitor_cmd_fn( args_t * args, config_t * config ) { + reconstruct_topo( config, args->monitor.topo ); + if( FD_UNLIKELY( args->monitor.with_bench ) ) { add_bench_topo( &config->topo, config->development.bench.affinity, diff --git a/src/app/shared/commands/watch/watch.c b/src/app/shared/commands/watch/watch.c index d9bbf6e3c20..c899a0ecf1c 100644 --- a/src/app/shared/commands/watch/watch.c +++ b/src/app/shared/commands/watch/watch.c @@ -428,7 +428,8 @@ write_snapshots( config_t const * config, static uint write_accdb( config_t const * config, - ulong const * cur_tile ) { + ulong const * cur_tile, + ulong const * prev_tile ) { ulong accdb_tile_idx = fd_topo_find_tile( &config->topo, "accdb", 0UL ); ulong snapwm_tile_idx = fd_topo_find_tile( &config->topo, "snapwm", 0UL ); ulong snapwr_tile_idx = fd_topo_find_tile( &config->topo, "snapwr", 0UL ); @@ -455,11 +456,25 @@ write_accdb( config_t const * config, for( ulong i=0UL; i0L ? 100.0*(double)(lookup_funk+lookup_specrd)/(double)lookup_total : 100.0; + PRINT( "💾 " BOLD GREEN "ACCOUNTS...." 
RESET UNBOLD " " BOLD "DATA" UNBOLD " %4.1f%% (%.1f GB) " " " BOLD "INDEX" UNBOLD " %4.1f%% (%.1fM) " - " " BOLD "RPS" UNBOLD " %s" CLEARLN "\n", - data_pct, used_gb, index_pct, (double)acct_cnt/1e6, rps_str ); + " " BOLD "RPS" UNBOLD " %s" + " " BOLD "CACHE" UNBOLD " %4.1f%%" CLEARLN "\n", + data_pct, used_gb, index_pct, (double)acct_cnt/1e6, rps_str, cache_hit_pct ); return 1; } @@ -720,7 +735,7 @@ write_summary( config_t const * config, write_snapshots( config, cur_tile, prev_tile ); } - lines_printed += write_accdb( config, cur_tile ); + lines_printed += write_accdb( config, cur_tile, prev_tile ); lines_printed += write_gossip( config, cur_tile, prev_tile, cur_link, prev_link ); lines_printed += write_repair( config, cur_tile, cur_link, prev_link ); lines_printed += write_replay( config, cur_tile ); @@ -836,16 +851,9 @@ run( config_t const * config, event_bytes_read_samples[ event_bytes_read_samples_idx%(sizeof(event_bytes_read_samples)/sizeof(event_bytes_read_samples[0])) ] = (ulong)diff_tile( config, "event", tiles+(1UL-last_snap)*tile_cnt*FD_METRICS_TOTAL_SZ, tiles+last_snap*tile_cnt*FD_METRICS_TOTAL_SZ, MIDX( COUNTER, EVENT, BYTES_READ ) ); event_bytes_read_samples_idx++; rps_samples[ rps_samples_idx%(sizeof(rps_samples)/sizeof(rps_samples[0])) ] = (ulong)( - diff_tile( config, "execrp", tiles+(1UL-last_snap)*tile_cnt*FD_METRICS_TOTAL_SZ, tiles+last_snap*tile_cnt*FD_METRICS_TOTAL_SZ, MIDX( COUNTER, EXECRP, TXN_ACCOUNT_CHANGES_UNCHANGED_NONEXIST ) ) + - diff_tile( config, "execrp", tiles+(1UL-last_snap)*tile_cnt*FD_METRICS_TOTAL_SZ, tiles+last_snap*tile_cnt*FD_METRICS_TOTAL_SZ, MIDX( COUNTER, EXECRP, TXN_ACCOUNT_CHANGES_CREATED ) ) + - diff_tile( config, "execrp", tiles+(1UL-last_snap)*tile_cnt*FD_METRICS_TOTAL_SZ, tiles+last_snap*tile_cnt*FD_METRICS_TOTAL_SZ, MIDX( COUNTER, EXECRP, TXN_ACCOUNT_CHANGES_DELETE ) ) + - diff_tile( config, "execrp", tiles+(1UL-last_snap)*tile_cnt*FD_METRICS_TOTAL_SZ, tiles+last_snap*tile_cnt*FD_METRICS_TOTAL_SZ, MIDX( COUNTER, 
EXECRP, TXN_ACCOUNT_CHANGES_MODIFY ) ) + - diff_tile( config, "execrp", tiles+(1UL-last_snap)*tile_cnt*FD_METRICS_TOTAL_SZ, tiles+last_snap*tile_cnt*FD_METRICS_TOTAL_SZ, MIDX( COUNTER, EXECRP, TXN_ACCOUNT_CHANGES_UNCHANGED ) ) + - diff_tile( config, "replay", tiles+(1UL-last_snap)*tile_cnt*FD_METRICS_TOTAL_SZ, tiles+last_snap*tile_cnt*FD_METRICS_TOTAL_SZ, MIDX( COUNTER, REPLAY, ACCDB_CREATED ) ) + - diff_tile( config, "replay", tiles+(1UL-last_snap)*tile_cnt*FD_METRICS_TOTAL_SZ, tiles+last_snap*tile_cnt*FD_METRICS_TOTAL_SZ, MIDX( COUNTER, REPLAY, ACCDB_ROOTED ) ) + - diff_tile( config, "replay", tiles+(1UL-last_snap)*tile_cnt*FD_METRICS_TOTAL_SZ, tiles+last_snap*tile_cnt*FD_METRICS_TOTAL_SZ, MIDX( COUNTER, REPLAY, ACCDB_REVERTED ) ) + - diff_tile( config, "replay", tiles+(1UL-last_snap)*tile_cnt*FD_METRICS_TOTAL_SZ, tiles+last_snap*tile_cnt*FD_METRICS_TOTAL_SZ, MIDX( COUNTER, REPLAY, ACCDB_GC_ROOT ) ) + - diff_tile( config, "replay", tiles+(1UL-last_snap)*tile_cnt*FD_METRICS_TOTAL_SZ, tiles+last_snap*tile_cnt*FD_METRICS_TOTAL_SZ, MIDX( COUNTER, REPLAY, ACCDB_RECLAIMED ) ) ); + diff_tile( config, "replay", tiles+(1UL-last_snap)*tile_cnt*FD_METRICS_TOTAL_SZ, tiles+last_snap*tile_cnt*FD_METRICS_TOTAL_SZ, MIDX( COUNTER, REPLAY, ACCDB_LOOKUP_ACCDB ) ) + + diff_tile( config, "execrp", tiles+(1UL-last_snap)*tile_cnt*FD_METRICS_TOTAL_SZ, tiles+last_snap*tile_cnt*FD_METRICS_TOTAL_SZ, MIDX( COUNTER, EXECRP, ACCDB_LOOKUP_ACCDB ) ) + + diff_tile( config, "execle", tiles+(1UL-last_snap)*tile_cnt*FD_METRICS_TOTAL_SZ, tiles+last_snap*tile_cnt*FD_METRICS_TOTAL_SZ, MIDX( COUNTER, EXECLE, ACCDB_LOOKUP_ACCDB ) ) ); rps_samples_idx++; /* Move cursor to top of dashboard and overwrite in place. 
diff --git a/src/app/shared/fd_action.h b/src/app/shared/fd_action.h index e922c2245fe..bab0f203b8e 100644 --- a/src/app/shared/fd_action.h +++ b/src/app/shared/fd_action.h @@ -18,6 +18,7 @@ union fdctl_args { int drain_output_fd; int with_bench; int with_sankey; + char topo[ 64 ]; } monitor; struct { diff --git a/src/disco/metrics/generated/fd_metrics_accdb.c b/src/disco/metrics/generated/fd_metrics_accdb.c index 1253e16ec32..1da9ca3d3ad 100644 --- a/src/disco/metrics/generated/fd_metrics_accdb.c +++ b/src/disco/metrics/generated/fd_metrics_accdb.c @@ -22,9 +22,7 @@ const fd_metrics_meta_t FD_METRICS_ACCDB[FD_METRICS_ACCDB_TOTAL] = { DECLARE_METRIC_ENUM( ACCDB_BSTREAM_SEQ, GAUGE, BSTREAM_SEQ, PRESENT ), DECLARE_METRIC_ENUM( ACCDB_BSTREAM_SEQ, GAUGE, BSTREAM_SEQ, FUTURE ), DECLARE_METRIC( ACCDB_REQUEST_BATCHES, COUNTER ), - DECLARE_METRIC_ENUM( ACCDB_REQUESTS, COUNTER, VINYL_REQUEST, ACQUIRE ), - DECLARE_METRIC_ENUM( ACCDB_REQUESTS, COUNTER, VINYL_REQUEST, RELEASE ), - DECLARE_METRIC_ENUM( ACCDB_REQUESTS, COUNTER, VINYL_REQUEST, ERASE ), + DECLARE_METRIC( ACCDB_REQUESTS, COUNTER ), DECLARE_METRIC_ENUM( ACCDB_BLOCKS, COUNTER, VINYL_BLOCKS, PAIR ), DECLARE_METRIC_ENUM( ACCDB_BLOCKS, COUNTER, VINYL_BLOCKS, DEAD ), DECLARE_METRIC_ENUM( ACCDB_BLOCKS, COUNTER, VINYL_BLOCKS, PART ), diff --git a/src/disco/metrics/generated/fd_metrics_accdb.h b/src/disco/metrics/generated/fd_metrics_accdb.h index 77aa3bd188d..31ae2985c9a 100644 --- a/src/disco/metrics/generated/fd_metrics_accdb.h +++ b/src/disco/metrics/generated/fd_metrics_accdb.h @@ -91,42 +91,37 @@ #define FD_METRICS_COUNTER_ACCDB_REQUESTS_TYPE (FD_METRICS_TYPE_COUNTER) #define FD_METRICS_COUNTER_ACCDB_REQUESTS_DESC "Number of requests processed" #define FD_METRICS_COUNTER_ACCDB_REQUESTS_CVT (FD_METRICS_CONVERTER_NONE) -#define FD_METRICS_COUNTER_ACCDB_REQUESTS_CNT (3UL) -#define FD_METRICS_COUNTER_ACCDB_REQUESTS_ACQUIRE_OFF (43UL) -#define FD_METRICS_COUNTER_ACCDB_REQUESTS_RELEASE_OFF (44UL) -#define 
FD_METRICS_COUNTER_ACCDB_REQUESTS_ERASE_OFF (45UL) - -#define FD_METRICS_COUNTER_ACCDB_BLOCKS_OFF (46UL) +#define FD_METRICS_COUNTER_ACCDB_BLOCKS_OFF (44UL) #define FD_METRICS_COUNTER_ACCDB_BLOCKS_NAME "accdb_blocks" #define FD_METRICS_COUNTER_ACCDB_BLOCKS_TYPE (FD_METRICS_TYPE_COUNTER) #define FD_METRICS_COUNTER_ACCDB_BLOCKS_DESC "Number of blocks written to bstream" #define FD_METRICS_COUNTER_ACCDB_BLOCKS_CVT (FD_METRICS_CONVERTER_NONE) #define FD_METRICS_COUNTER_ACCDB_BLOCKS_CNT (3UL) -#define FD_METRICS_COUNTER_ACCDB_BLOCKS_PAIR_OFF (46UL) -#define FD_METRICS_COUNTER_ACCDB_BLOCKS_DEAD_OFF (47UL) -#define FD_METRICS_COUNTER_ACCDB_BLOCKS_PART_OFF (48UL) +#define FD_METRICS_COUNTER_ACCDB_BLOCKS_PAIR_OFF (44UL) +#define FD_METRICS_COUNTER_ACCDB_BLOCKS_DEAD_OFF (45UL) +#define FD_METRICS_COUNTER_ACCDB_BLOCKS_PART_OFF (46UL) -#define FD_METRICS_GAUGE_ACCDB_GARBAGE_BYTES_OFF (49UL) +#define FD_METRICS_GAUGE_ACCDB_GARBAGE_BYTES_OFF (47UL) #define FD_METRICS_GAUGE_ACCDB_GARBAGE_BYTES_NAME "accdb_garbage_bytes" #define FD_METRICS_GAUGE_ACCDB_GARBAGE_BYTES_TYPE (FD_METRICS_TYPE_GAUGE) #define FD_METRICS_GAUGE_ACCDB_GARBAGE_BYTES_DESC "" #define FD_METRICS_GAUGE_ACCDB_GARBAGE_BYTES_CVT (FD_METRICS_CONVERTER_NONE) -#define FD_METRICS_COUNTER_ACCDB_CUM_GC_BYTES_OFF (50UL) +#define FD_METRICS_COUNTER_ACCDB_CUM_GC_BYTES_OFF (48UL) #define FD_METRICS_COUNTER_ACCDB_CUM_GC_BYTES_NAME "accdb_cum_gc_bytes" #define FD_METRICS_COUNTER_ACCDB_CUM_GC_BYTES_TYPE (FD_METRICS_TYPE_COUNTER) #define FD_METRICS_COUNTER_ACCDB_CUM_GC_BYTES_DESC "Total number of record bytes that were garbage collected" #define FD_METRICS_COUNTER_ACCDB_CUM_GC_BYTES_CVT (FD_METRICS_CONVERTER_NONE) -#define FD_METRICS_GAUGE_ACCDB_ACCOUNT_INDEX_REMAINING_FREE_OFF (51UL) +#define FD_METRICS_GAUGE_ACCDB_ACCOUNT_INDEX_REMAINING_FREE_OFF (49UL) #define FD_METRICS_GAUGE_ACCDB_ACCOUNT_INDEX_REMAINING_FREE_NAME "accdb_account_index_remaining_free" #define FD_METRICS_GAUGE_ACCDB_ACCOUNT_INDEX_REMAINING_FREE_TYPE 
(FD_METRICS_TYPE_GAUGE) #define FD_METRICS_GAUGE_ACCDB_ACCOUNT_INDEX_REMAINING_FREE_DESC "Remaining free slots in the account database index (validator crashes when this number reaches zero)" #define FD_METRICS_GAUGE_ACCDB_ACCOUNT_INDEX_REMAINING_FREE_CVT (FD_METRICS_CONVERTER_NONE) -#define FD_METRICS_ACCDB_TOTAL (29UL) +#define FD_METRICS_ACCDB_TOTAL (27UL) extern const fd_metrics_meta_t FD_METRICS_ACCDB[FD_METRICS_ACCDB_TOTAL]; #endif /* HEADER_fd_src_disco_metrics_generated_fd_metrics_accdb_h */ diff --git a/src/disco/metrics/generated/fd_metrics_enums.h b/src/disco/metrics/generated/fd_metrics_enums.h index d5a3faab4d9..e095e990f1b 100644 --- a/src/disco/metrics/generated/fd_metrics_enums.h +++ b/src/disco/metrics/generated/fd_metrics_enums.h @@ -828,15 +828,6 @@ #define FD_METRICS_ENUM_ACCOUNT_CHANGE_V_UNCHANGED_IDX 4 #define FD_METRICS_ENUM_ACCOUNT_CHANGE_V_UNCHANGED_NAME "unchanged" -#define FD_METRICS_ENUM_VINYL_REQUEST_NAME "vinyl_request" -#define FD_METRICS_ENUM_VINYL_REQUEST_CNT (3UL) -#define FD_METRICS_ENUM_VINYL_REQUEST_V_ACQUIRE_IDX 0 -#define FD_METRICS_ENUM_VINYL_REQUEST_V_ACQUIRE_NAME "acquire" -#define FD_METRICS_ENUM_VINYL_REQUEST_V_RELEASE_IDX 1 -#define FD_METRICS_ENUM_VINYL_REQUEST_V_RELEASE_NAME "release" -#define FD_METRICS_ENUM_VINYL_REQUEST_V_ERASE_IDX 2 -#define FD_METRICS_ENUM_VINYL_REQUEST_V_ERASE_NAME "erase" - #define FD_METRICS_ENUM_VINYL_BLOCKS_NAME "vinyl_blocks" #define FD_METRICS_ENUM_VINYL_BLOCKS_CNT (3UL) #define FD_METRICS_ENUM_VINYL_BLOCKS_V_PAIR_IDX 0 diff --git a/src/disco/metrics/generated/fd_metrics_execle.c b/src/disco/metrics/generated/fd_metrics_execle.c index e499ff0e54e..004954ac7f3 100644 --- a/src/disco/metrics/generated/fd_metrics_execle.c +++ b/src/disco/metrics/generated/fd_metrics_execle.c @@ -33,4 +33,10 @@ const fd_metrics_meta_t FD_METRICS_EXECLE[FD_METRICS_EXECLE_TOTAL] = { DECLARE_METRIC_ENUM( EXECLE_TRANSACTION_LANDED, COUNTER, TRANSACTION_LANDED, LANDED_FAILED ), DECLARE_METRIC_ENUM( 
EXECLE_TRANSACTION_LANDED, COUNTER, TRANSACTION_LANDED, UNLANDED ), DECLARE_METRIC( EXECLE_COMPUTE_UNITS_TOTAL, COUNTER ), + DECLARE_METRIC( EXECLE_ACCDB_LOOKUP_FUNK, COUNTER ), + DECLARE_METRIC( EXECLE_ACCDB_LOOKUP_SPECRD, COUNTER ), + DECLARE_METRIC( EXECLE_ACCDB_LOOKUP_ACCDB, COUNTER ), + DECLARE_METRIC( EXECLE_ACCDB_DT_FUNK, COUNTER ), + DECLARE_METRIC( EXECLE_ACCDB_DT_SPECRD, COUNTER ), + DECLARE_METRIC( EXECLE_ACCDB_DT_VINYL, COUNTER ), }; diff --git a/src/disco/metrics/generated/fd_metrics_execle.h b/src/disco/metrics/generated/fd_metrics_execle.h index 12f5692cc1d..b9a0ff7194e 100644 --- a/src/disco/metrics/generated/fd_metrics_execle.h +++ b/src/disco/metrics/generated/fd_metrics_execle.h @@ -58,7 +58,43 @@ #define FD_METRICS_COUNTER_EXECLE_COMPUTE_UNITS_TOTAL_DESC "Estimated number of compute units executed since tile start" #define FD_METRICS_COUNTER_EXECLE_COMPUTE_UNITS_TOTAL_CVT (FD_METRICS_CONVERTER_NONE) -#define FD_METRICS_EXECLE_TOTAL (31UL) +#define FD_METRICS_COUNTER_EXECLE_ACCDB_LOOKUP_FUNK_OFF (54UL) +#define FD_METRICS_COUNTER_EXECLE_ACCDB_LOOKUP_FUNK_NAME "execle_accdb_lookup_funk" +#define FD_METRICS_COUNTER_EXECLE_ACCDB_LOOKUP_FUNK_TYPE (FD_METRICS_TYPE_COUNTER) +#define FD_METRICS_COUNTER_EXECLE_ACCDB_LOOKUP_FUNK_DESC "Number of account lookups resolved from funk (in-memory fork store)" +#define FD_METRICS_COUNTER_EXECLE_ACCDB_LOOKUP_FUNK_CVT (FD_METRICS_CONVERTER_NONE) + +#define FD_METRICS_COUNTER_EXECLE_ACCDB_LOOKUP_SPECRD_OFF (55UL) +#define FD_METRICS_COUNTER_EXECLE_ACCDB_LOOKUP_SPECRD_NAME "execle_accdb_lookup_specrd" +#define FD_METRICS_COUNTER_EXECLE_ACCDB_LOOKUP_SPECRD_TYPE (FD_METRICS_TYPE_COUNTER) +#define FD_METRICS_COUNTER_EXECLE_ACCDB_LOOKUP_SPECRD_DESC "Number of account lookups resolved from speculative read (vinyl cache)" +#define FD_METRICS_COUNTER_EXECLE_ACCDB_LOOKUP_SPECRD_CVT (FD_METRICS_CONVERTER_NONE) + +#define FD_METRICS_COUNTER_EXECLE_ACCDB_LOOKUP_ACCDB_OFF (56UL) +#define 
FD_METRICS_COUNTER_EXECLE_ACCDB_LOOKUP_ACCDB_NAME "execle_accdb_lookup_accdb" +#define FD_METRICS_COUNTER_EXECLE_ACCDB_LOOKUP_ACCDB_TYPE (FD_METRICS_TYPE_COUNTER) +#define FD_METRICS_COUNTER_EXECLE_ACCDB_LOOKUP_ACCDB_DESC "Number of account lookups sent to accdb tile (vinyl rq/cq)" +#define FD_METRICS_COUNTER_EXECLE_ACCDB_LOOKUP_ACCDB_CVT (FD_METRICS_CONVERTER_NONE) + +#define FD_METRICS_COUNTER_EXECLE_ACCDB_DT_FUNK_OFF (57UL) +#define FD_METRICS_COUNTER_EXECLE_ACCDB_DT_FUNK_NAME "execle_accdb_dt_funk" +#define FD_METRICS_COUNTER_EXECLE_ACCDB_DT_FUNK_TYPE (FD_METRICS_TYPE_COUNTER) +#define FD_METRICS_COUNTER_EXECLE_ACCDB_DT_FUNK_DESC "Cumulative time spent in funk (in-memory) account lookups" +#define FD_METRICS_COUNTER_EXECLE_ACCDB_DT_FUNK_CVT (FD_METRICS_CONVERTER_SECONDS) + +#define FD_METRICS_COUNTER_EXECLE_ACCDB_DT_SPECRD_OFF (58UL) +#define FD_METRICS_COUNTER_EXECLE_ACCDB_DT_SPECRD_NAME "execle_accdb_dt_specrd" +#define FD_METRICS_COUNTER_EXECLE_ACCDB_DT_SPECRD_TYPE (FD_METRICS_TYPE_COUNTER) +#define FD_METRICS_COUNTER_EXECLE_ACCDB_DT_SPECRD_DESC "Cumulative time spent in speculative read (vinyl cache) account lookups" +#define FD_METRICS_COUNTER_EXECLE_ACCDB_DT_SPECRD_CVT (FD_METRICS_CONVERTER_SECONDS) + +#define FD_METRICS_COUNTER_EXECLE_ACCDB_DT_VINYL_OFF (59UL) +#define FD_METRICS_COUNTER_EXECLE_ACCDB_DT_VINYL_NAME "execle_accdb_dt_vinyl" +#define FD_METRICS_COUNTER_EXECLE_ACCDB_DT_VINYL_TYPE (FD_METRICS_TYPE_COUNTER) +#define FD_METRICS_COUNTER_EXECLE_ACCDB_DT_VINYL_DESC "Cumulative time spent waiting for vinyl rq/cq account lookups" +#define FD_METRICS_COUNTER_EXECLE_ACCDB_DT_VINYL_CVT (FD_METRICS_CONVERTER_SECONDS) + +#define FD_METRICS_EXECLE_TOTAL (37UL) extern const fd_metrics_meta_t FD_METRICS_EXECLE[FD_METRICS_EXECLE_TOTAL]; #endif /* HEADER_fd_src_disco_metrics_generated_fd_metrics_execle_h */ diff --git a/src/disco/metrics/generated/fd_metrics_execrp.c b/src/disco/metrics/generated/fd_metrics_execrp.c index 7da8fbd6e99..b8c9ab1d211 100644 --- 
a/src/disco/metrics/generated/fd_metrics_execrp.c +++ b/src/disco/metrics/generated/fd_metrics_execrp.c @@ -10,6 +10,12 @@ const fd_metrics_meta_t FD_METRICS_EXECRP[FD_METRICS_EXECRP_TOTAL] = { DECLARE_METRIC( EXECRP_PROGCACHE_DUP_INSERTS, COUNTER ), DECLARE_METRIC( EXECRP_PROGCACHE_INVALIDATIONS, COUNTER ), DECLARE_METRIC( EXECRP_ACCDB_CREATED, COUNTER ), + DECLARE_METRIC( EXECRP_ACCDB_LOOKUP_FUNK, COUNTER ), + DECLARE_METRIC( EXECRP_ACCDB_LOOKUP_SPECRD, COUNTER ), + DECLARE_METRIC( EXECRP_ACCDB_LOOKUP_ACCDB, COUNTER ), + DECLARE_METRIC( EXECRP_ACCDB_DT_FUNK, COUNTER ), + DECLARE_METRIC( EXECRP_ACCDB_DT_SPECRD, COUNTER ), + DECLARE_METRIC( EXECRP_ACCDB_DT_VINYL, COUNTER ), DECLARE_METRIC_ENUM( EXECRP_TXN_REGIME, COUNTER, TXN_REGIME, SETUP ), DECLARE_METRIC_ENUM( EXECRP_TXN_REGIME, COUNTER, TXN_REGIME, EXEC ), DECLARE_METRIC_ENUM( EXECRP_TXN_REGIME, COUNTER, TXN_REGIME, COMMIT ), diff --git a/src/disco/metrics/generated/fd_metrics_execrp.h b/src/disco/metrics/generated/fd_metrics_execrp.h index 4d454832b1e..1eb0dcdf01b 100644 --- a/src/disco/metrics/generated/fd_metrics_execrp.h +++ b/src/disco/metrics/generated/fd_metrics_execrp.h @@ -54,50 +54,86 @@ #define FD_METRICS_COUNTER_EXECRP_ACCDB_CREATED_DESC "Number of account database records created" #define FD_METRICS_COUNTER_EXECRP_ACCDB_CREATED_CVT (FD_METRICS_CONVERTER_NONE) -#define FD_METRICS_COUNTER_EXECRP_TXN_REGIME_OFF (31UL) +#define FD_METRICS_COUNTER_EXECRP_ACCDB_LOOKUP_FUNK_OFF (31UL) +#define FD_METRICS_COUNTER_EXECRP_ACCDB_LOOKUP_FUNK_NAME "execrp_accdb_lookup_funk" +#define FD_METRICS_COUNTER_EXECRP_ACCDB_LOOKUP_FUNK_TYPE (FD_METRICS_TYPE_COUNTER) +#define FD_METRICS_COUNTER_EXECRP_ACCDB_LOOKUP_FUNK_DESC "Number of account lookups resolved from funk (in-memory fork store)" +#define FD_METRICS_COUNTER_EXECRP_ACCDB_LOOKUP_FUNK_CVT (FD_METRICS_CONVERTER_NONE) + +#define FD_METRICS_COUNTER_EXECRP_ACCDB_LOOKUP_SPECRD_OFF (32UL) +#define FD_METRICS_COUNTER_EXECRP_ACCDB_LOOKUP_SPECRD_NAME 
"execrp_accdb_lookup_specrd" +#define FD_METRICS_COUNTER_EXECRP_ACCDB_LOOKUP_SPECRD_TYPE (FD_METRICS_TYPE_COUNTER) +#define FD_METRICS_COUNTER_EXECRP_ACCDB_LOOKUP_SPECRD_DESC "Number of account lookups resolved from speculative read (vinyl cache)" +#define FD_METRICS_COUNTER_EXECRP_ACCDB_LOOKUP_SPECRD_CVT (FD_METRICS_CONVERTER_NONE) + +#define FD_METRICS_COUNTER_EXECRP_ACCDB_LOOKUP_ACCDB_OFF (33UL) +#define FD_METRICS_COUNTER_EXECRP_ACCDB_LOOKUP_ACCDB_NAME "execrp_accdb_lookup_accdb" +#define FD_METRICS_COUNTER_EXECRP_ACCDB_LOOKUP_ACCDB_TYPE (FD_METRICS_TYPE_COUNTER) +#define FD_METRICS_COUNTER_EXECRP_ACCDB_LOOKUP_ACCDB_DESC "Number of account lookups sent to accdb tile (vinyl rq/cq)" +#define FD_METRICS_COUNTER_EXECRP_ACCDB_LOOKUP_ACCDB_CVT (FD_METRICS_CONVERTER_NONE) + +#define FD_METRICS_COUNTER_EXECRP_ACCDB_DT_FUNK_OFF (34UL) +#define FD_METRICS_COUNTER_EXECRP_ACCDB_DT_FUNK_NAME "execrp_accdb_dt_funk" +#define FD_METRICS_COUNTER_EXECRP_ACCDB_DT_FUNK_TYPE (FD_METRICS_TYPE_COUNTER) +#define FD_METRICS_COUNTER_EXECRP_ACCDB_DT_FUNK_DESC "Cumulative time spent in funk (in-memory) account lookups" +#define FD_METRICS_COUNTER_EXECRP_ACCDB_DT_FUNK_CVT (FD_METRICS_CONVERTER_SECONDS) + +#define FD_METRICS_COUNTER_EXECRP_ACCDB_DT_SPECRD_OFF (35UL) +#define FD_METRICS_COUNTER_EXECRP_ACCDB_DT_SPECRD_NAME "execrp_accdb_dt_specrd" +#define FD_METRICS_COUNTER_EXECRP_ACCDB_DT_SPECRD_TYPE (FD_METRICS_TYPE_COUNTER) +#define FD_METRICS_COUNTER_EXECRP_ACCDB_DT_SPECRD_DESC "Cumulative time spent in speculative read (vinyl cache) account lookups" +#define FD_METRICS_COUNTER_EXECRP_ACCDB_DT_SPECRD_CVT (FD_METRICS_CONVERTER_SECONDS) + +#define FD_METRICS_COUNTER_EXECRP_ACCDB_DT_VINYL_OFF (36UL) +#define FD_METRICS_COUNTER_EXECRP_ACCDB_DT_VINYL_NAME "execrp_accdb_dt_vinyl" +#define FD_METRICS_COUNTER_EXECRP_ACCDB_DT_VINYL_TYPE (FD_METRICS_TYPE_COUNTER) +#define FD_METRICS_COUNTER_EXECRP_ACCDB_DT_VINYL_DESC "Cumulative time spent waiting for vinyl rq/cq account lookups" +#define 
FD_METRICS_COUNTER_EXECRP_ACCDB_DT_VINYL_CVT (FD_METRICS_CONVERTER_SECONDS) + +#define FD_METRICS_COUNTER_EXECRP_TXN_REGIME_OFF (37UL) #define FD_METRICS_COUNTER_EXECRP_TXN_REGIME_NAME "execrp_txn_regime" #define FD_METRICS_COUNTER_EXECRP_TXN_REGIME_TYPE (FD_METRICS_TYPE_COUNTER) #define FD_METRICS_COUNTER_EXECRP_TXN_REGIME_DESC "Mutually exclusive and exhaustive duration of time spent in transaction execution regimes" -#define FD_METRICS_COUNTER_EXECRP_TXN_REGIME_CVT (FD_METRICS_CONVERTER_NANOSECONDS) +#define FD_METRICS_COUNTER_EXECRP_TXN_REGIME_CVT (FD_METRICS_CONVERTER_SECONDS) #define FD_METRICS_COUNTER_EXECRP_TXN_REGIME_CNT (3UL) -#define FD_METRICS_COUNTER_EXECRP_TXN_REGIME_SETUP_OFF (31UL) -#define FD_METRICS_COUNTER_EXECRP_TXN_REGIME_EXEC_OFF (32UL) -#define FD_METRICS_COUNTER_EXECRP_TXN_REGIME_COMMIT_OFF (33UL) +#define FD_METRICS_COUNTER_EXECRP_TXN_REGIME_SETUP_OFF (37UL) +#define FD_METRICS_COUNTER_EXECRP_TXN_REGIME_EXEC_OFF (38UL) +#define FD_METRICS_COUNTER_EXECRP_TXN_REGIME_COMMIT_OFF (39UL) -#define FD_METRICS_COUNTER_EXECRP_VM_REGIME_OFF (34UL) +#define FD_METRICS_COUNTER_EXECRP_VM_REGIME_OFF (40UL) #define FD_METRICS_COUNTER_EXECRP_VM_REGIME_NAME "execrp_vm_regime" #define FD_METRICS_COUNTER_EXECRP_VM_REGIME_TYPE (FD_METRICS_TYPE_COUNTER) #define FD_METRICS_COUNTER_EXECRP_VM_REGIME_DESC "Mutually exclusive and exhaustive duration of time spent in virtual machine execution regimes" -#define FD_METRICS_COUNTER_EXECRP_VM_REGIME_CVT (FD_METRICS_CONVERTER_NANOSECONDS) +#define FD_METRICS_COUNTER_EXECRP_VM_REGIME_CVT (FD_METRICS_CONVERTER_SECONDS) #define FD_METRICS_COUNTER_EXECRP_VM_REGIME_CNT (5UL) -#define FD_METRICS_COUNTER_EXECRP_VM_REGIME_SETUP_OFF (34UL) -#define FD_METRICS_COUNTER_EXECRP_VM_REGIME_COMMIT_OFF (35UL) -#define FD_METRICS_COUNTER_EXECRP_VM_REGIME_SETUP_CPI_OFF (36UL) -#define FD_METRICS_COUNTER_EXECRP_VM_REGIME_COMMIT_CPI_OFF (37UL) -#define FD_METRICS_COUNTER_EXECRP_VM_REGIME_INTERPRETER_OFF (38UL) +#define 
FD_METRICS_COUNTER_EXECRP_VM_REGIME_SETUP_OFF (40UL) +#define FD_METRICS_COUNTER_EXECRP_VM_REGIME_COMMIT_OFF (41UL) +#define FD_METRICS_COUNTER_EXECRP_VM_REGIME_SETUP_CPI_OFF (42UL) +#define FD_METRICS_COUNTER_EXECRP_VM_REGIME_COMMIT_CPI_OFF (43UL) +#define FD_METRICS_COUNTER_EXECRP_VM_REGIME_INTERPRETER_OFF (44UL) -#define FD_METRICS_COUNTER_EXECRP_TXN_ACCOUNT_CHANGES_OFF (39UL) +#define FD_METRICS_COUNTER_EXECRP_TXN_ACCOUNT_CHANGES_OFF (45UL) #define FD_METRICS_COUNTER_EXECRP_TXN_ACCOUNT_CHANGES_NAME "execrp_txn_account_changes" #define FD_METRICS_COUNTER_EXECRP_TXN_ACCOUNT_CHANGES_TYPE (FD_METRICS_TYPE_COUNTER) #define FD_METRICS_COUNTER_EXECRP_TXN_ACCOUNT_CHANGES_DESC "Transaction account change event counters" #define FD_METRICS_COUNTER_EXECRP_TXN_ACCOUNT_CHANGES_CVT (FD_METRICS_CONVERTER_NONE) #define FD_METRICS_COUNTER_EXECRP_TXN_ACCOUNT_CHANGES_CNT (5UL) -#define FD_METRICS_COUNTER_EXECRP_TXN_ACCOUNT_CHANGES_UNCHANGED_NONEXIST_OFF (39UL) -#define FD_METRICS_COUNTER_EXECRP_TXN_ACCOUNT_CHANGES_CREATED_OFF (40UL) -#define FD_METRICS_COUNTER_EXECRP_TXN_ACCOUNT_CHANGES_DELETE_OFF (41UL) -#define FD_METRICS_COUNTER_EXECRP_TXN_ACCOUNT_CHANGES_MODIFY_OFF (42UL) -#define FD_METRICS_COUNTER_EXECRP_TXN_ACCOUNT_CHANGES_UNCHANGED_OFF (43UL) +#define FD_METRICS_COUNTER_EXECRP_TXN_ACCOUNT_CHANGES_UNCHANGED_NONEXIST_OFF (45UL) +#define FD_METRICS_COUNTER_EXECRP_TXN_ACCOUNT_CHANGES_CREATED_OFF (46UL) +#define FD_METRICS_COUNTER_EXECRP_TXN_ACCOUNT_CHANGES_DELETE_OFF (47UL) +#define FD_METRICS_COUNTER_EXECRP_TXN_ACCOUNT_CHANGES_MODIFY_OFF (48UL) +#define FD_METRICS_COUNTER_EXECRP_TXN_ACCOUNT_CHANGES_UNCHANGED_OFF (49UL) -#define FD_METRICS_COUNTER_EXECRP_COMPUTE_UNITS_TOTAL_OFF (44UL) +#define FD_METRICS_COUNTER_EXECRP_COMPUTE_UNITS_TOTAL_OFF (50UL) #define FD_METRICS_COUNTER_EXECRP_COMPUTE_UNITS_TOTAL_NAME "execrp_compute_units_total" #define FD_METRICS_COUNTER_EXECRP_COMPUTE_UNITS_TOTAL_TYPE (FD_METRICS_TYPE_COUNTER) #define 
FD_METRICS_COUNTER_EXECRP_COMPUTE_UNITS_TOTAL_DESC "Estimated number of compute units executed since tile start" #define FD_METRICS_COUNTER_EXECRP_COMPUTE_UNITS_TOTAL_CVT (FD_METRICS_CONVERTER_NONE) -#define FD_METRICS_EXECRP_TOTAL (22UL) +#define FD_METRICS_EXECRP_TOTAL (28UL) extern const fd_metrics_meta_t FD_METRICS_EXECRP[FD_METRICS_EXECRP_TOTAL]; #endif /* HEADER_fd_src_disco_metrics_generated_fd_metrics_execrp_h */ diff --git a/src/disco/metrics/generated/fd_metrics_replay.c b/src/disco/metrics/generated/fd_metrics_replay.c index a1e565b97de..0dcede771c4 100644 --- a/src/disco/metrics/generated/fd_metrics_replay.c +++ b/src/disco/metrics/generated/fd_metrics_replay.c @@ -31,13 +31,10 @@ const fd_metrics_meta_t FD_METRICS_REPLAY[FD_METRICS_REPLAY_TOTAL] = { DECLARE_METRIC( REPLAY_PROGCACHE_GC_ROOT, COUNTER ), DECLARE_METRIC( REPLAY_ACCDB_CREATED, COUNTER ), DECLARE_METRIC( REPLAY_ACCDB_REVERTED, COUNTER ), - DECLARE_METRIC( REPLAY_ACCDB_ROOTED, COUNTER ), - DECLARE_METRIC( REPLAY_ACCDB_ROOTED_BYTES, COUNTER ), - DECLARE_METRIC( REPLAY_ACCDB_GC_ROOT, COUNTER ), - DECLARE_METRIC( REPLAY_ACCDB_RECLAIMED, COUNTER ), - DECLARE_METRIC_HISTOGRAM_SECONDS( REPLAY_ROOT_SLOT_DURATION_SECONDS ), - DECLARE_METRIC_HISTOGRAM_SECONDS( REPLAY_ROOT_ACCOUNT_DURATION_SECONDS ), - DECLARE_METRIC_ENUM( REPLAY_ROOT_ELAPSED_SECONDS, COUNTER, ROOT_PHASE, DB ), - DECLARE_METRIC_ENUM( REPLAY_ROOT_ELAPSED_SECONDS, COUNTER, ROOT_PHASE, COPY ), - DECLARE_METRIC_ENUM( REPLAY_ROOT_ELAPSED_SECONDS, COUNTER, ROOT_PHASE, GC ), + DECLARE_METRIC( REPLAY_ACCDB_LOOKUP_FUNK, COUNTER ), + DECLARE_METRIC( REPLAY_ACCDB_LOOKUP_SPECRD, COUNTER ), + DECLARE_METRIC( REPLAY_ACCDB_LOOKUP_ACCDB, COUNTER ), + DECLARE_METRIC( REPLAY_ACCDB_DT_FUNK, COUNTER ), + DECLARE_METRIC( REPLAY_ACCDB_DT_SPECRD, COUNTER ), + DECLARE_METRIC( REPLAY_ACCDB_DT_VINYL, COUNTER ), }; diff --git a/src/disco/metrics/generated/fd_metrics_replay.h b/src/disco/metrics/generated/fd_metrics_replay.h index 150b5e03342..acb80fddd9c 100644 
--- a/src/disco/metrics/generated/fd_metrics_replay.h +++ b/src/disco/metrics/generated/fd_metrics_replay.h @@ -184,58 +184,43 @@ #define FD_METRICS_COUNTER_REPLAY_ACCDB_REVERTED_DESC "Number of account database records reverted" #define FD_METRICS_COUNTER_REPLAY_ACCDB_REVERTED_CVT (FD_METRICS_CONVERTER_NONE) -#define FD_METRICS_COUNTER_REPLAY_ACCDB_ROOTED_OFF (84UL) -#define FD_METRICS_COUNTER_REPLAY_ACCDB_ROOTED_NAME "replay_accdb_rooted" -#define FD_METRICS_COUNTER_REPLAY_ACCDB_ROOTED_TYPE (FD_METRICS_TYPE_COUNTER) -#define FD_METRICS_COUNTER_REPLAY_ACCDB_ROOTED_DESC "Number of account database entries rooted" -#define FD_METRICS_COUNTER_REPLAY_ACCDB_ROOTED_CVT (FD_METRICS_CONVERTER_NONE) - -#define FD_METRICS_COUNTER_REPLAY_ACCDB_ROOTED_BYTES_OFF (85UL) -#define FD_METRICS_COUNTER_REPLAY_ACCDB_ROOTED_BYTES_NAME "replay_accdb_rooted_bytes" -#define FD_METRICS_COUNTER_REPLAY_ACCDB_ROOTED_BYTES_TYPE (FD_METRICS_TYPE_COUNTER) -#define FD_METRICS_COUNTER_REPLAY_ACCDB_ROOTED_BYTES_DESC "Number of bytes in account database entries rooted (including overhead)" -#define FD_METRICS_COUNTER_REPLAY_ACCDB_ROOTED_BYTES_CVT (FD_METRICS_CONVERTER_NONE) - -#define FD_METRICS_COUNTER_REPLAY_ACCDB_GC_ROOT_OFF (86UL) -#define FD_METRICS_COUNTER_REPLAY_ACCDB_GC_ROOT_NAME "replay_accdb_gc_root" -#define FD_METRICS_COUNTER_REPLAY_ACCDB_GC_ROOT_TYPE (FD_METRICS_TYPE_COUNTER) -#define FD_METRICS_COUNTER_REPLAY_ACCDB_GC_ROOT_DESC "Number of account database entries garbage collected" -#define FD_METRICS_COUNTER_REPLAY_ACCDB_GC_ROOT_CVT (FD_METRICS_CONVERTER_NONE) - -#define FD_METRICS_COUNTER_REPLAY_ACCDB_RECLAIMED_OFF (87UL) -#define FD_METRICS_COUNTER_REPLAY_ACCDB_RECLAIMED_NAME "replay_accdb_reclaimed" -#define FD_METRICS_COUNTER_REPLAY_ACCDB_RECLAIMED_TYPE (FD_METRICS_TYPE_COUNTER) -#define FD_METRICS_COUNTER_REPLAY_ACCDB_RECLAIMED_DESC "Number of account database entries reclaimed (deletion rooted)" -#define FD_METRICS_COUNTER_REPLAY_ACCDB_RECLAIMED_CVT (FD_METRICS_CONVERTER_NONE) 
- -#define FD_METRICS_HISTOGRAM_REPLAY_ROOT_SLOT_DURATION_SECONDS_OFF (88UL) -#define FD_METRICS_HISTOGRAM_REPLAY_ROOT_SLOT_DURATION_SECONDS_NAME "replay_root_slot_duration_seconds" -#define FD_METRICS_HISTOGRAM_REPLAY_ROOT_SLOT_DURATION_SECONDS_TYPE (FD_METRICS_TYPE_HISTOGRAM) -#define FD_METRICS_HISTOGRAM_REPLAY_ROOT_SLOT_DURATION_SECONDS_DESC "Time in seconds spent updating the rooted account store (one sample per block)" -#define FD_METRICS_HISTOGRAM_REPLAY_ROOT_SLOT_DURATION_SECONDS_CVT (FD_METRICS_CONVERTER_SECONDS) -#define FD_METRICS_HISTOGRAM_REPLAY_ROOT_SLOT_DURATION_SECONDS_MIN (0.0005) -#define FD_METRICS_HISTOGRAM_REPLAY_ROOT_SLOT_DURATION_SECONDS_MAX (1.0) - -#define FD_METRICS_HISTOGRAM_REPLAY_ROOT_ACCOUNT_DURATION_SECONDS_OFF (105UL) -#define FD_METRICS_HISTOGRAM_REPLAY_ROOT_ACCOUNT_DURATION_SECONDS_NAME "replay_root_account_duration_seconds" -#define FD_METRICS_HISTOGRAM_REPLAY_ROOT_ACCOUNT_DURATION_SECONDS_TYPE (FD_METRICS_TYPE_HISTOGRAM) -#define FD_METRICS_HISTOGRAM_REPLAY_ROOT_ACCOUNT_DURATION_SECONDS_DESC "Time in seconds spent updating the rooted account store (one sample per block, normalized by account count)" -#define FD_METRICS_HISTOGRAM_REPLAY_ROOT_ACCOUNT_DURATION_SECONDS_CVT (FD_METRICS_CONVERTER_SECONDS) -#define FD_METRICS_HISTOGRAM_REPLAY_ROOT_ACCOUNT_DURATION_SECONDS_MIN (1e-07) -#define FD_METRICS_HISTOGRAM_REPLAY_ROOT_ACCOUNT_DURATION_SECONDS_MAX (0.1) - -#define FD_METRICS_COUNTER_REPLAY_ROOT_ELAPSED_SECONDS_OFF (122UL) -#define FD_METRICS_COUNTER_REPLAY_ROOT_ELAPSED_SECONDS_NAME "replay_root_elapsed_seconds" -#define FD_METRICS_COUNTER_REPLAY_ROOT_ELAPSED_SECONDS_TYPE (FD_METRICS_TYPE_COUNTER) -#define FD_METRICS_COUNTER_REPLAY_ROOT_ELAPSED_SECONDS_DESC "Total time in seconds spent rooting accounts" -#define FD_METRICS_COUNTER_REPLAY_ROOT_ELAPSED_SECONDS_CVT (FD_METRICS_CONVERTER_SECONDS) -#define FD_METRICS_COUNTER_REPLAY_ROOT_ELAPSED_SECONDS_CNT (3UL) - -#define FD_METRICS_COUNTER_REPLAY_ROOT_ELAPSED_SECONDS_DB_OFF (122UL) 
-#define FD_METRICS_COUNTER_REPLAY_ROOT_ELAPSED_SECONDS_COPY_OFF (123UL) -#define FD_METRICS_COUNTER_REPLAY_ROOT_ELAPSED_SECONDS_GC_OFF (124UL) - -#define FD_METRICS_REPLAY_TOTAL (38UL) +#define FD_METRICS_COUNTER_REPLAY_ACCDB_LOOKUP_FUNK_OFF (84UL) +#define FD_METRICS_COUNTER_REPLAY_ACCDB_LOOKUP_FUNK_NAME "replay_accdb_lookup_funk" +#define FD_METRICS_COUNTER_REPLAY_ACCDB_LOOKUP_FUNK_TYPE (FD_METRICS_TYPE_COUNTER) +#define FD_METRICS_COUNTER_REPLAY_ACCDB_LOOKUP_FUNK_DESC "Number of account lookups resolved from funk (in-memory fork store)" +#define FD_METRICS_COUNTER_REPLAY_ACCDB_LOOKUP_FUNK_CVT (FD_METRICS_CONVERTER_NONE) + +#define FD_METRICS_COUNTER_REPLAY_ACCDB_LOOKUP_SPECRD_OFF (85UL) +#define FD_METRICS_COUNTER_REPLAY_ACCDB_LOOKUP_SPECRD_NAME "replay_accdb_lookup_specrd" +#define FD_METRICS_COUNTER_REPLAY_ACCDB_LOOKUP_SPECRD_TYPE (FD_METRICS_TYPE_COUNTER) +#define FD_METRICS_COUNTER_REPLAY_ACCDB_LOOKUP_SPECRD_DESC "Number of account lookups resolved from speculative read (vinyl cache)" +#define FD_METRICS_COUNTER_REPLAY_ACCDB_LOOKUP_SPECRD_CVT (FD_METRICS_CONVERTER_NONE) + +#define FD_METRICS_COUNTER_REPLAY_ACCDB_LOOKUP_ACCDB_OFF (86UL) +#define FD_METRICS_COUNTER_REPLAY_ACCDB_LOOKUP_ACCDB_NAME "replay_accdb_lookup_accdb" +#define FD_METRICS_COUNTER_REPLAY_ACCDB_LOOKUP_ACCDB_TYPE (FD_METRICS_TYPE_COUNTER) +#define FD_METRICS_COUNTER_REPLAY_ACCDB_LOOKUP_ACCDB_DESC "Number of account lookups sent to accdb tile (vinyl rq/cq)" +#define FD_METRICS_COUNTER_REPLAY_ACCDB_LOOKUP_ACCDB_CVT (FD_METRICS_CONVERTER_NONE) + +#define FD_METRICS_COUNTER_REPLAY_ACCDB_DT_FUNK_OFF (87UL) +#define FD_METRICS_COUNTER_REPLAY_ACCDB_DT_FUNK_NAME "replay_accdb_dt_funk" +#define FD_METRICS_COUNTER_REPLAY_ACCDB_DT_FUNK_TYPE (FD_METRICS_TYPE_COUNTER) +#define FD_METRICS_COUNTER_REPLAY_ACCDB_DT_FUNK_DESC "Cumulative time spent in funk (in-memory) account lookups" +#define FD_METRICS_COUNTER_REPLAY_ACCDB_DT_FUNK_CVT (FD_METRICS_CONVERTER_SECONDS) + +#define 
FD_METRICS_COUNTER_REPLAY_ACCDB_DT_SPECRD_OFF (88UL) +#define FD_METRICS_COUNTER_REPLAY_ACCDB_DT_SPECRD_NAME "replay_accdb_dt_specrd" +#define FD_METRICS_COUNTER_REPLAY_ACCDB_DT_SPECRD_TYPE (FD_METRICS_TYPE_COUNTER) +#define FD_METRICS_COUNTER_REPLAY_ACCDB_DT_SPECRD_DESC "Cumulative time spent in speculative read (vinyl cache) account lookups" +#define FD_METRICS_COUNTER_REPLAY_ACCDB_DT_SPECRD_CVT (FD_METRICS_CONVERTER_SECONDS) + +#define FD_METRICS_COUNTER_REPLAY_ACCDB_DT_VINYL_OFF (89UL) +#define FD_METRICS_COUNTER_REPLAY_ACCDB_DT_VINYL_NAME "replay_accdb_dt_vinyl" +#define FD_METRICS_COUNTER_REPLAY_ACCDB_DT_VINYL_TYPE (FD_METRICS_TYPE_COUNTER) +#define FD_METRICS_COUNTER_REPLAY_ACCDB_DT_VINYL_DESC "Cumulative time spent waiting for vinyl rq/cq account lookups" +#define FD_METRICS_COUNTER_REPLAY_ACCDB_DT_VINYL_CVT (FD_METRICS_CONVERTER_SECONDS) + +#define FD_METRICS_REPLAY_TOTAL (35UL) extern const fd_metrics_meta_t FD_METRICS_REPLAY[FD_METRICS_REPLAY_TOTAL]; #endif /* HEADER_fd_src_disco_metrics_generated_fd_metrics_replay_h */ diff --git a/src/disco/metrics/metrics.xml b/src/disco/metrics/metrics.xml index 7412eb578e5..548f2613fa0 100644 --- a/src/disco/metrics/metrics.xml +++ b/src/disco/metrics/metrics.xml @@ -671,6 +671,12 @@ metric introduced. + + + + + + @@ -798,20 +804,13 @@ metric introduced. - - - - - - Time in seconds spent updating the rooted account store (one sample per block) - - - - Time in seconds spent updating the rooted account store (one sample per block, normalized by account count) - - - + + + + + + @@ -1170,9 +1169,15 @@ metric introduced. + + + + + + - - + + @@ -1270,12 +1275,6 @@ metric introduced. - - - - - - @@ -1308,7 +1307,7 @@ metric introduced. 
- + diff --git a/src/disco/topo/fd_topo.h b/src/disco/topo/fd_topo.h index b9cf010f507..7bb6dc924d6 100644 --- a/src/disco/topo/fd_topo.h +++ b/src/disco/topo/fd_topo.h @@ -648,6 +648,7 @@ struct fd_topo_tile { struct { ulong meta_map_obj_id; ulong meta_pool_obj_id; + ulong line_obj_id; ulong line_max; ulong data_obj_id; char bstream_path[ PATH_MAX ]; @@ -655,6 +656,8 @@ struct fd_topo_tile { int io_type; /* FD_VINYL_IO_TYPE_* */ uint uring_depth; + + ulong write_delay_slots; } accdb; struct { diff --git a/src/disco/topo/fd_topob_vinyl.h b/src/disco/topo/fd_topob_vinyl.h index cf53ba60712..e77c01d7b6d 100644 --- a/src/disco/topo/fd_topob_vinyl.h +++ b/src/disco/topo/fd_topob_vinyl.h @@ -59,6 +59,16 @@ fd_topob_vinyl_rq( fd_topo_t * topo, fd_topob_tile_uses( topo, client_tile, rq_obj, FD_SHMEM_JOIN_MODE_READ_WRITE ); fd_topob_tile_uses( topo, client_tile, cq_obj, FD_SHMEM_JOIN_MODE_READ_ONLY ); + /* Grant read-only access to meta map and element pool for speculative + reads (pin-based direct cache access). If the pod keys are absent, + specread is simply not available for this client. 
*/ + ulong meta_obj_id = fd_pod_query_ulong( topo->props, "accdb.meta_map", ULONG_MAX ); + ulong ele_obj_id = fd_pod_query_ulong( topo->props, "accdb.meta_pool", ULONG_MAX ); + if( meta_obj_id!=ULONG_MAX && ele_obj_id!=ULONG_MAX ) { + fd_topob_tile_uses( topo, client_tile, &topo->objs[ meta_obj_id ], FD_SHMEM_JOIN_MODE_READ_ONLY ); + fd_topob_tile_uses( topo, client_tile, &topo->objs[ ele_obj_id ], FD_SHMEM_JOIN_MODE_READ_ONLY ); + } + FD_TEST( rq_obj->label_idx==req_pool_obj->label_idx ); /* keep rq and req_pool in sync */ return rq_obj; } diff --git a/src/discof/accdb/fd_accdb_case_acquire.c b/src/discof/accdb/fd_accdb_case_acquire.c new file mode 100644 index 00000000000..e517f5cd57a --- /dev/null +++ b/src/discof/accdb/fd_accdb_case_acquire.c @@ -0,0 +1,220 @@ + case FD_VINYL_REQ_TYPE_ACQUIRE: { + FD_MCNT_INC( ACCDB, REQUEST_BATCHES, 1UL ); + FD_MCNT_INC( ACCDB, REQUESTS, batch_cnt ); + + ulong req_flags = (ulong)req->flags; + fd_vinyl_key_t const * req_key = MAP_REQ_GADDR( req->key_gaddr, fd_vinyl_key_t, batch_cnt ); + ulong * req_val_gaddr = MAP_REQ_GADDR( req->val_gaddr_gaddr, ulong, batch_cnt ); + schar * req_err = MAP_REQ_GADDR( req->err_gaddr, schar, batch_cnt ); + + int req_evict_prio = fd_vinyl_req_evict_prio( req_flags ); + + int bad_gaddr = (!!batch_cnt) & ((!req_key) | (!req_val_gaddr) | (!req_err)); + + if( FD_UNLIKELY( bad_gaddr ) ) { + comp_err = FD_VINYL_ERR_INVAL; + break; + } + + for( ulong batch_idx=0UL; batch_idxline_idx==line_idx, "corruption detected" ); + + ulong line_ctl = line[ line_idx ].ctl; + + long ref = fd_accdb_line_ctl_ref( line_ctl ); + + /* At this point, we are acquiring a cached pair for read. + If the line is acquired for modify, fail with AGAIN. If + there are too many acquires for read on this pair, CRIT + (could consider AGAIN here). Otherwise, we update the + ref count (don't change the ver), point the client at the + line caching pair key to finish the acquire. 
Note that + we don't validate the pair header if we detect that an + earlier acquire in this batch started fetching the pair + because the read might still be in progress (see note + below for more details). */ + + if( FD_UNLIKELY( ref<0L ) ) DONE( FD_VINYL_ERR_AGAIN ); + if( FD_UNLIKELY( ref>=FD_VINYL_LINE_REF_MAX ) ) FD_LOG_CRIT(( "too many acquires for read on this pair" )); + + if( FD_LIKELY( !obj->rd_active ) ) { + fd_vinyl_bstream_phdr_t * phdr = fd_vinyl_data_obj_phdr( obj ); + + FD_CRIT( fd_vinyl_data_obj_val_max( obj ) >= val_sz, "corruption detected" ); + FD_CRIT( phdr->ctl==fd_vinyl_bstream_ctl( FD_VINYL_BSTREAM_CTL_TYPE_PAIR, + FD_VINYL_BSTREAM_CTL_STYLE_RAW, val_sz ), "corruption detected" ); + FD_CRIT( fd_vinyl_key_eq( &phdr->key, key ), "corruption detected" ); + FD_CRIT( !memcmp( &phdr->info, &ele0[ ele_idx ].phdr.info, sizeof(fd_vinyl_info_t) ), "corruption detected" ); + } + + FD_ATOMIC_FETCH_AND_ADD( &line[ line_idx ].ctl, 1UL ); + + req_val_gaddr[ batch_idx ] = fd_vinyl_data_gaddr( fd_vinyl_data_obj_val( obj ), data_laddr0 ); + + DONE( FD_VINYL_SUCCESS ); + + } /* pair key data cached */ + + /* At this point, pair key is not cached. If we are not allowed + to acquire this pair, fail. Otherwise, evict the least + recently used evictable line (this should always be possible + if quotas are configured correctly) to make room to cache this + pair. Connect this line to meta element ele_idx, set the + line's reference count appropriately, bump the line's version + and move the line to the desired location in the eviction + sequence. We don't modify any shared fields in meta element + ele_idx so we can do the modification fast. + + We do this upfront to free data cache for the alloc if the + LRU line is in use and to handle the same pair appearing + multiple times in an acquire. + + That is, if req_key appears multiple times in an acquire to + modify, the trailing redundant acquires will see the object + as cached with ref==-1 and fail with AGAIN.
If the key + appears multiple times in an acquire for read, the trailing + redundant acquires will see the object as cached with ref>0 + and rd_active==1, conclude that the first redundant acquire + is in the process of reading the pair into cache, skip any + racy metadata checks, increase the ref count and succeed. + + IMPORTANT SAFETY TIP! Note that this implies that client + doing an acquire-for-read with redundant keys and with + speculative processing will see req_err transition to success + for the trailing redundant items for a key before the leading + item of that key transitions to success (and thus before the + object is fully read / verified and/or decoded). It is up to + the client doing speculative cut through processing to avoid + redundant keys or react accordingly. */ + + line_idx = fd_accdb_clock_evict( ctx, line, line_cnt, ele0, ele_max, data ); + + line[ line_idx ].ele_idx = ele_idx; ele0[ ele_idx ].line_idx = line_idx; + FD_ATOMIC_FETCH_AND_ADD( &line[ line_idx ].ctl, 1UL ); + if( req_evict_prio<=FD_VINYL_LINE_EVICT_PRIO_MRU ) { + FD_ATOMIC_FETCH_AND_OR( &line[ line_idx ].ctl, FD_ACCDB_LINE_CTL_CHANCE ); + } + + /* Allocate an appropriately sized object to hold this pair, + connect it to this line and report the location to the client. */ + + ulong val_max = val_sz; + + ulong szc = fd_vinyl_data_szc( val_max ); + + fd_vinyl_data_obj_t * obj = fd_vinyl_data_alloc( data, szc ); + if( FD_UNLIKELY( !obj ) ) FD_LOG_CRIT(( "increase data cache size" )); + + line[ line_idx ].obj_gaddr = fd_vinyl_data_gaddr( obj, data_laddr0 ); obj->line_idx = line_idx; + + void * val = fd_vinyl_data_obj_val( obj ); + + req_val_gaddr[ batch_idx ] = fd_vinyl_data_gaddr( val, data_laddr0 ); + + /* If we need to do I/O, start reading encoded pair data and + defer the data integrity and decoding to later (and then in + whatever order the I/O layer sees fit). 
*/ + + obj->rd_active = (short)1; + + int style = fd_vinyl_bstream_ctl_style( pair_ctl ); + ulong val_esz = fd_vinyl_bstream_ctl_sz ( pair_ctl ); + + FD_CRIT( val_esz<=FD_VINYL_VAL_MAX, "corruption detected" ); + FD_CRIT( (style!=FD_VINYL_BSTREAM_CTL_STYLE_RAW) | (val_sz==val_esz), "corruption detected" ); + + fd_vinyl_data_obj_t * cobj; + + if( FD_LIKELY( style==FD_VINYL_BSTREAM_CTL_STYLE_RAW ) ) cobj = obj; + else { + cobj = fd_vinyl_data_alloc( data, fd_vinyl_data_szc( val_esz ) ); + if( FD_UNLIKELY( !cobj ) ) FD_LOG_CRIT(( "increase data cache size" )); + } + + cobj->rd->ctx = (ulong)obj; + cobj->rd->seq = ele0[ ele_idx ].seq; + cobj->rd->dst = fd_vinyl_data_obj_phdr( cobj ); + cobj->rd->sz = fd_vinyl_bstream_pair_sz( val_esz ); + + cobj->rd_err = req_err + batch_idx; + + fd_vinyl_io_read( io, cobj->rd ); + read_cnt++; + + goto next_acquire; + + } /* pair key meta cached */ + + /* At this point, pair key does not exist at bstream seq_present + and is not in the process of being created. */ + + DONE( FD_VINYL_ERR_KEY ); + + next_acquire: /* silly language restriction */; + +# undef DONE + + } /* for batch_idx */ + + FD_CRIT( !read_cnt, "corruption detected" ); + + comp_err = FD_VINYL_SUCCESS; + break; + } diff --git a/src/discof/accdb/fd_accdb_line_ctl.h b/src/discof/accdb/fd_accdb_line_ctl.h new file mode 100644 index 00000000000..38fa03b3abc --- /dev/null +++ b/src/discof/accdb/fd_accdb_line_ctl.h @@ -0,0 +1,38 @@ +#ifndef HEADER_fd_src_discof_accdb_fd_accdb_line_ctl_h +#define HEADER_fd_src_discof_accdb_fd_accdb_line_ctl_h + +/* fd_accdb_line_ctl.h provides the ctl field encoding for accdb cache + lines. This header is shared between the accdb tile + (fd_accdb_tile_private.h) and specread clients (fd_accdb_specread.h). 
+ + Layout: + bits [32,64) version (same as fd_vinyl_line_ctl) + bit 25 EVICTING + bit 24 CHANCE + bits [0,24) ref + 1 (combined client + specread ref count) + + Specread pin: FETCH_AND_ADD(&ctl, 1UL), check old & EVICTING + Specread unpin: FETCH_AND_SUB(&ctl, 1UL) + CHANCE set: FETCH_AND_OR(&ctl, FD_ACCDB_LINE_CTL_CHANCE) + CHANCE clear: FETCH_AND_AND(&ctl, ~FD_ACCDB_LINE_CTL_CHANCE) + EVICTING set: CAS or FETCH_AND_OR on ctl + Version bump: CAS loop (preserves in-flight specread refs) */ + +#include "../../util/fd_util_base.h" + +#define FD_ACCDB_LINE_CTL_CHANCE (1UL << 24) +#define FD_ACCDB_LINE_CTL_EVICTING (1UL << 25) + +FD_PROTOTYPES_BEGIN + +FD_FN_CONST static inline ulong +fd_accdb_line_ctl( ulong ver, long ref ) { + return (ver << 32) | ((ulong)(ref + 1L)); +} + +FD_FN_CONST static inline ulong fd_accdb_line_ctl_ver( ulong ctl ) { return ctl >> 32; } +FD_FN_CONST static inline long fd_accdb_line_ctl_ref( ulong ctl ) { return ((long)(ctl & ((1UL<<24)-1UL))) - 1L; } + +FD_PROTOTYPES_END + +#endif /* HEADER_fd_src_discof_accdb_fd_accdb_line_ctl_h */ diff --git a/src/discof/accdb/fd_accdb_tile.c b/src/discof/accdb/fd_accdb_tile.c index 6bd722083de..593b70c38d8 100644 --- a/src/discof/accdb/fd_accdb_tile.c +++ b/src/discof/accdb/fd_accdb_tile.c @@ -8,11 +8,9 @@ - Sandboxing */ #define _GNU_SOURCE +#include "fd_accdb_tile_private.h" #include "../../disco/topo/fd_topo.h" #include "../../disco/metrics/fd_metrics.h" -#include "../../discof/restore/fd_snapct_tile.h" -#include "../../vinyl/fd_vinyl.h" -#include "../../vinyl/fd_vinyl_base.h" #include "../../vinyl/io/ur/fd_vinyl_io_ur.h" #include "../../util/pod/fd_pod_format.h" #include "../../util/io_uring/fd_io_uring_setup.h" @@ -28,29 +26,14 @@ #define NAME "accdb" #define MAX_INS 8 +#include "fd_accdb_tile_cache.c" +#include "fd_accdb_tile_root.c" + /* For io_ur backend, this controls the size of the write-back cache. This should be larger than the cumulative record size of all unique changed accounts in a slot. 
*/ #define IO_SPAD_MAX (128UL<<20) -#define FD_VINYL_CLIENT_MAX (1024UL) -#define FD_VINYL_REQ_MAX (1024UL) - -struct fd_vinyl_client { - fd_vinyl_rq_t * rq; /* Channel for requests from this client (could be shared by multiple vinyl instances) */ - fd_vinyl_cq_t * cq; /* Channel for completions from this client to this vinyl instance - (could be shared by multiple receivers of completions from this vinyl instance). */ - ulong burst_max; /* Max requests receive from this client at a time */ - ulong seq; /* Sequence number of the next request to receive in the rq */ - ulong link_id; /* Identifies requests from this client to this vinyl instance in the rq */ - ulong laddr0; /* A valid non-zero gaddr from this client maps to the vinyl instance's laddr laddr0 + gaddr ... */ - ulong laddr1; /* ... and thus is in (laddr0,laddr1). A zero gaddr maps to laddr NULL. */ - ulong quota_rem; /* Num of remaining acquisitions this client is allowed on this vinyl instance */ - ulong quota_max; /* Max quota */ -}; - -typedef struct fd_vinyl_client fd_vinyl_client_t; - /* MAP_REQ_GADDR maps a request global address req_gaddr to an array of cnt T's into the local address space as a T * pointer. 
If the result is not properly aligned or the entire range does not completely fall @@ -74,81 +57,6 @@ fd_vinyl_laddr( ulong req_gaddr, req_laddr0, 0UL ); } -struct fd_vinyl_tile { - - /* Vinyl objects */ - - fd_vinyl_t vinyl[1]; - void * io_mem; - - /* Tile architecture */ - - uint booted : 1; - uint shutdown : 1; - struct { - ulong state_expected; - ulong volatile const * state; - ulong volatile const * pair_cnt; - /* When booting from genesis only */ - struct { - ulong io_seed; - } from_genesis; - } boot; - - /* I/O */ - - int bstream_fd; - ulong bstream_file_sz; - - /* io_uring */ - - fd_io_uring_t ring[1]; - void * ioring_shmem; /* shared between kernel and user */ - - /* Clients */ - - fd_vinyl_client_t _client[ FD_VINYL_CLIENT_MAX ]; - ulong client_cnt; - ulong client_idx; - - /* Received requests */ - - fd_vinyl_req_t _req[ FD_VINYL_REQ_MAX ]; - ulong req_head; /* Requests [0,req_head) have been processed */ - ulong req_tail; /* Requests [req_head,req_tail) are pending */ - /* Requests [req_tail,ULONG_MAX) have not been received */ - ulong exec_max; - - /* accum_dead_cnt is the number of dead blocks that have been - written since the last partition block. - - accum_move_cnt is the number of move blocks that have been - written since this last partition block. - - accum_garbage_cnt / sz is the number of items / bytes garbage in - the bstream that have accumulated since the last time we compacted - the bstream. We use this to estimate the number of rounds of - compaction to do in async handling. */ - - ulong accum_dead_cnt; - ulong accum_garbage_cnt; - ulong accum_garbage_sz; - - /* Run loop state */ - - ulong seq_part; - - /* Periodic syncing */ - - long sync_next_ns; - - /* Vinyl limit on the number of pairs the meta map will accept. - Exceeding this limit will trigger a LOG_ERR. 
*/ - ulong pair_cnt_limit; -}; - -typedef struct fd_vinyl_tile fd_vinyl_tile_t; - /* Vinyl state object */ static ulong @@ -160,7 +68,6 @@ struct fd_accdb_tile_layout { ulong footprint; ulong io_off; ulong io_uring_shmem_off; - ulong vinyl_line_off; }; typedef struct fd_accdb_tile_layout fd_accdb_tile_layout_t; @@ -171,7 +78,7 @@ fd_accdb_tile_layout( fd_accdb_tile_layout_t * layout, memset( layout, 0, sizeof(fd_accdb_tile_layout_t) ); FD_SCRATCH_ALLOC_INIT( l, NULL ); - ulong ctx_off = (ulong)FD_SCRATCH_ALLOC_APPEND( l, alignof(fd_vinyl_tile_t), sizeof(fd_vinyl_tile_t) ); + ulong ctx_off = (ulong)FD_SCRATCH_ALLOC_APPEND( l, alignof(fd_accdb_tile_t), sizeof(fd_accdb_tile_t) ); FD_TEST( ctx_off==0UL ); switch( tile->accdb.io_type ) { @@ -189,8 +96,6 @@ fd_accdb_tile_layout( fd_accdb_tile_layout_t * layout, FD_LOG_CRIT(( "invalid tile->accdb.io_type %d", tile->accdb.io_type )); } - layout->vinyl_line_off = (ulong)FD_SCRATCH_ALLOC_APPEND( - l, alignof(fd_vinyl_line_t), sizeof(fd_vinyl_line_t)*tile->accdb.line_max ); layout->footprint = FD_SCRATCH_ALLOC_FINI( l, scratch_align() ); } @@ -216,7 +121,7 @@ populate_allowed_fds( fd_topo_t const * topo, void * scratch = fd_topo_obj_laddr( topo, tile->tile_obj_id ); FD_SCRATCH_ALLOC_INIT( l, scratch ); - fd_vinyl_tile_t * ctx = FD_SCRATCH_ALLOC_APPEND( l, alignof(fd_vinyl_tile_t), sizeof(fd_vinyl_tile_t) ); + fd_accdb_tile_t * ctx = FD_SCRATCH_ALLOC_APPEND( l, alignof(fd_accdb_tile_t), sizeof(fd_accdb_tile_t) ); out_fds[ out_cnt++ ] = ctx->bstream_fd; @@ -232,14 +137,14 @@ populate_allowed_seccomp( fd_topo_t const * topo, struct sock_filter * out ) { void * scratch = fd_topo_obj_laddr( topo, tile->tile_obj_id ); FD_SCRATCH_ALLOC_INIT( l, scratch ); - fd_vinyl_tile_t * ctx = FD_SCRATCH_ALLOC_APPEND( l, alignof(fd_vinyl_tile_t), sizeof(fd_vinyl_tile_t) ); + fd_accdb_tile_t * ctx = FD_SCRATCH_ALLOC_APPEND( l, alignof(fd_accdb_tile_t), sizeof(fd_accdb_tile_t) ); populate_sock_filter_policy_fd_accdb_tile( out_cnt, out, 
(uint)fd_log_private_logfile_fd(), (uint)ctx->bstream_fd, (uint)ctx->ring->ioring_fd ); return sock_filter_policy_fd_accdb_tile_instr_cnt; } static void -vinyl_io_uring_init( fd_vinyl_tile_t * ctx, +vinyl_io_uring_init( fd_accdb_tile_t * ctx, uint uring_depth, int dev_fd ) { fd_io_uring_params_t params[1]; @@ -286,10 +191,10 @@ privileged_init( fd_topo_t * topo, FD_LOG_ERR(( "invalid vinyl_line_max %lu", tile->accdb.line_max )); } - fd_vinyl_tile_t * ctx = fd_topo_obj_laddr( topo, tile->tile_obj_id ); + fd_accdb_tile_t * ctx = fd_topo_obj_laddr( topo, tile->tile_obj_id ); ulong ctx_laddr = (ulong)ctx; - memset( ctx, 0, sizeof(fd_vinyl_tile_t) ); + memset( ctx, 0, sizeof(fd_accdb_tile_t) ); ctx->bstream_fd = -1; ctx->ring->ioring_fd = -1; @@ -303,11 +208,11 @@ privileged_init( fd_topo_t * topo, ctx->ioring_shmem = (void *)( ctx_laddr + layout->io_uring_shmem_off ); } - fd_vinyl_line_t * _line = (void *)( ctx_laddr + layout->vinyl_line_off ); + fd_vinyl_line_t * _line = fd_topo_obj_laddr( topo, tile->accdb.line_obj_id ); vinyl->cnc = NULL; vinyl->io = NULL; - vinyl->line = (fd_vinyl_line_t *)_line; + vinyl->line = _line; vinyl->line_footprint = line_footprint; /* FIXME use O_DIRECT? 
*/ @@ -338,7 +243,7 @@ static void unprivileged_init( fd_topo_t * topo, fd_topo_tile_t * tile ) { - fd_vinyl_tile_t * ctx = fd_topo_obj_laddr( topo, tile->tile_obj_id ); + fd_accdb_tile_t * ctx = fd_topo_obj_laddr( topo, tile->tile_obj_id ); fd_vinyl_t * vinyl = ctx->vinyl; ctx->sync_next_ns = fd_log_wallclock(); @@ -381,7 +286,6 @@ unprivileged_init( fd_topo_t * topo, vinyl->gc_thresh = gc_thresh; vinyl->gc_eager = gc_eager; vinyl->style = FD_VINYL_BSTREAM_CTL_STYLE_RAW; - vinyl->line_idx_lru = 0U; vinyl->pair_cnt = 0UL; vinyl->garbage_sz = 0UL; @@ -392,13 +296,13 @@ unprivileged_init( fd_topo_t * topo, fd_vinyl_line_t * line = vinyl->line; for( ulong line_idx=0UL; line_idxclock_hand = 0U; + # undef TEST ulong snapwm_tile_idx = fd_topo_find_tile( topo, "snapwm", 0UL ); @@ -522,12 +426,46 @@ unprivileged_init( fd_topo_t * topo, } /* client join loop */ + /* Initialize rooting state */ + + ctx->root_txn = NULL; + ctx->root_rec = NULL; + ctx->root_txn_idx = 0UL; + ctx->root_target_xid = (fd_funk_txn_xid_t){ .ul = { ULONG_MAX, ULONG_MAX } }; + ctx->write_delay_slots = tile->accdb.write_delay_slots; + + /* Join funk for rooting operations */ + + ulong funk_obj_id = fd_pod_query_ulong( topo->props, "funk", ULONG_MAX ); + ulong funk_locks_obj_id = fd_pod_query_ulong( topo->props, "funk_locks", ULONG_MAX ); + if( funk_obj_id!=ULONG_MAX && funk_locks_obj_id!=ULONG_MAX ) { + FD_TEST( fd_funk_join( ctx->funk, + fd_topo_obj_laddr( topo, funk_obj_id ), + fd_topo_obj_laddr( topo, funk_locks_obj_id ) ) ); + } + + /* Discover the replay_accdb input link for root messages */ + + ctx->root_in_mem = NULL; + ctx->root_in_chunk0 = 0UL; + ctx->root_in_wmark = 0UL; + + for( ulong i=0UL; i<tile->in_cnt; i++ ) { + fd_topo_link_t const * in_link = &topo->links[ tile->in_link_id[ i ] ]; + if( !strcmp( in_link->name, "replay_accdb" ) ) { + ctx->root_in_mem = topo->workspaces[ topo->objs[ in_link->dcache_obj_id ].wksp_id ].wksp; + ctx->root_in_chunk0 = fd_dcache_compact_chunk0( ctx->root_in_mem,
in_link->dcache ); + ctx->root_in_wmark = fd_dcache_compact_wmark ( ctx->root_in_mem, in_link->dcache, in_link->mtu ); + break; + } + } + } /* during_housekeeping is called periodically (approx every STEM_LAZY ns) */ static void -during_housekeeping( fd_vinyl_tile_t * ctx ) { +during_housekeeping( fd_accdb_tile_t * ctx ) { fd_vinyl_t * vinyl = ctx->vinyl; @@ -558,6 +496,61 @@ during_housekeeping( fd_vinyl_tile_t * ctx ) { ctx->booted = 1; } + /* --- Root processing --- + Continue any in-progress batch, or start rooting the oldest + unrooted funk txn if write_delay_slots allows. */ + + if( FD_UNLIKELY( ctx->root_rec ) ) { + /* Batch in progress — do one batch */ + ctx->root_rec = fd_accdb_v2_root_batch( ctx, ctx->root_rec ); + if( !ctx->root_rec ) { + fd_accdb_txn_root_fini( ctx, ctx->root_txn, ctx->root_txn_idx ); + ctx->root_txn = NULL; + } + + } else if( FD_UNLIKELY( ctx->root_txn ) ) { + /* Previous root had no records left — clean up */ + fd_accdb_txn_root_fini( ctx, ctx->root_txn, ctx->root_txn_idx ); + ctx->root_txn = NULL; + + } else if( FD_LIKELY( ctx->root_target_xid.ul[0]!=ULONG_MAX ) ) { + /* Check if there's a child txn to root */ + fd_funk_t * funk = ctx->funk; + ulong child_idx = fd_funk_txn_idx( funk->shmem->child_head_cidx ); + + if( !fd_funk_txn_idx_is_null( child_idx ) ) { + fd_funk_txn_t * child = &funk->txn_pool->ele[ child_idx ]; + fd_funk_txn_xid_t const * child_xid = fd_funk_txn_xid( child ); + + ulong target_slot = ctx->root_target_xid.ul[0]; + ulong child_slot = child_xid->ul[0]; + + if( child_slot <= target_slot ) { + int genesis_override = !child_slot; + int delay_ok = genesis_override + || !ctx->write_delay_slots + || (target_slot - child_slot >= ctx->write_delay_slots); + + if( delay_ok ) { + ctx->root_txn = child; + ctx->root_txn_idx = child_idx; + ctx->root_rec = fd_accdb_txn_root_start( ctx, child ); + + if( ctx->root_rec ) { + ctx->root_rec = fd_accdb_v2_root_batch( ctx, ctx->root_rec ); + if( !ctx->root_rec ) { + 
fd_accdb_txn_root_fini( ctx, ctx->root_txn, ctx->root_txn_idx ); + ctx->root_txn = NULL; + } + } else { + fd_accdb_txn_root_fini( ctx, ctx->root_txn, ctx->root_txn_idx ); + ctx->root_txn = NULL; + } + } + } + } + } + /* If we've written enough to justify appending a parallel recovery partition, append one. */ @@ -607,21 +600,21 @@ during_housekeeping( fd_vinyl_tile_t * ctx ) { times for highest performance, etc) and unaccounted zero padding garbage to be absorbed when nothing else is going on. */ - int gc_eager = vinyl->gc_eager; - if( FD_LIKELY( gc_eager>=0 ) ) { + // int gc_eager = vinyl->gc_eager; + // if( FD_LIKELY( gc_eager>=0 ) ) { - /* Saturating wide left shift */ - ulong overflow = (ctx->accum_garbage_cnt >> (63-gc_eager) >> 1); /* sigh ... avoid wide shift UB */ - ulong compact_max = fd_ulong_max( fd_ulong_if( !overflow, ctx->accum_garbage_cnt << gc_eager, ULONG_MAX ), 1UL ); + // /* Saturating wide left shift */ + // ulong overflow = (ctx->accum_garbage_cnt >> (63-gc_eager) >> 1); /* sigh ... 
avoid wide shift UB */ + // ulong compact_max = fd_ulong_max( fd_ulong_if( !overflow, ctx->accum_garbage_cnt << gc_eager, ULONG_MAX ), 1UL ); - /**/ ctx->accum_garbage_cnt = 0UL; - vinyl->garbage_sz += ctx->accum_garbage_sz; ctx->accum_garbage_sz = 0UL; + // /**/ ctx->accum_garbage_cnt = 0UL; + // vinyl->garbage_sz += ctx->accum_garbage_sz; ctx->accum_garbage_sz = 0UL; - ulong garbage_pre = vinyl->garbage_sz; - fd_vinyl_compact( vinyl, compact_max ); - FD_MCNT_INC( ACCDB, CUM_GC_BYTES, garbage_pre - vinyl->garbage_sz ); + // ulong garbage_pre = vinyl->garbage_sz; + // fd_accdb_compact( vinyl, compact_max ); + // FD_MCNT_INC( ACCDB, CUM_GC_BYTES, garbage_pre - vinyl->garbage_sz ); - } + // } /* Update vinyl sync block (Required to reclaim bstream space freed by compaction) */ @@ -655,7 +648,7 @@ during_housekeeping( fd_vinyl_tile_t * ctx ) { /* If should_shutdown returns non-zero, the vinyl tile is shut down */ static int -should_shutdown( fd_vinyl_tile_t * ctx ) { +should_shutdown( fd_accdb_tile_t * ctx ) { if( FD_UNLIKELY( !ctx->booted ) ) return 0; if( FD_LIKELY( !ctx->shutdown ) ) return 0; @@ -699,7 +692,7 @@ should_shutdown( fd_vinyl_tile_t * ctx ) { } static void -metrics_write( fd_vinyl_tile_t * ctx ) { +metrics_write( fd_accdb_tile_t * ctx ) { if( FD_UNLIKELY( !ctx->booted ) ) return; fd_vinyl_t * vinyl = ctx->vinyl; fd_vinyl_io_t * io = vinyl->io; @@ -731,7 +724,7 @@ metrics_write( fd_vinyl_tile_t * ctx ) { /* before_credit runs every main loop iteration */ static void -before_credit( fd_vinyl_tile_t * ctx, +before_credit( fd_accdb_tile_t * ctx, fd_stem_context_t * stem, int * charge_busy ) { (void)stem; @@ -744,17 +737,11 @@ before_credit( fd_vinyl_tile_t * ctx, fd_vinyl_line_t * line = vinyl->line; fd_vinyl_data_t * data = vinyl->data; - ulong pair_max = vinyl->pair_max; - fd_vinyl_meta_ele_t * ele0 = meta->ele; ulong ele_max = meta->ele_max; ulong meta_seed = meta->seed; - ulong * lock = meta->lock; - int lock_shift = meta->lock_shift; - ulong 
data_laddr0 = (ulong)data->laddr0; - fd_vinyl_data_vol_t const * vol = data->vol; - ulong vol_cnt = data->vol_cnt; + void * data_laddr0 = data->laddr0; ulong line_cnt = vinyl->line_cnt; @@ -865,40 +852,15 @@ before_credit( fd_vinyl_tile_t * ctx, ulong fail_cnt = 0UL; ulong read_cnt = 0UL; - ulong append_cnt = 0UL; - ulong accum_cache_hit = 0UL; switch( req->type ) { - -# include "../../vinyl/fd_vinyl_case_acquire.c" -# include "../../vinyl/fd_vinyl_case_release.c" -# include "../../vinyl/fd_vinyl_case_erase.c" - /* FIXME support more request types */ - +# include "fd_accdb_case_acquire.c" default: FD_LOG_CRIT(( "unsupported request type %u", (uint)req->type )); comp_err = FD_VINYL_ERR_INVAL; break; } - FD_MCNT_INC( ACCDB, REQUEST_BATCHES, 1UL ); - switch( req->type ) { - case FD_VINYL_REQ_TYPE_ACQUIRE: - FD_MCNT_INC( ACCDB, REQUESTS_ACQUIRE, batch_cnt ); - FD_MCNT_INC( ACCDB, READ_OPS_SHARED_CACHE, accum_cache_hit ); - break; - case FD_VINYL_REQ_TYPE_RELEASE: - /* FIXME missing metrics: - - ReadBytes(SharedCache) - - WriteOps(SharedCache) - - WriteBytes(SharedCache) */ - FD_MCNT_INC( ACCDB, REQUESTS_RELEASE, batch_cnt ); - break; - case FD_VINYL_REQ_TYPE_ERASE: - FD_MCNT_INC( ACCDB, REQUESTS_ERASE, batch_cnt ); - break; - } - for( ; read_cnt; read_cnt-- ) { fd_vinyl_io_rd_t * _rd; /* avoid pointer escape */ fd_vinyl_io_poll( io, &_rd, FD_VINYL_IO_FLAG_BLOCKING ); @@ -929,8 +891,8 @@ before_credit( fd_vinyl_tile_t * ctx, ulong line_idx = obj->line_idx; - FD_CRIT( line_idxctl = fd_vinyl_bstream_ctl( FD_VINYL_BSTREAM_CTL_TYPE_PAIR, FD_VINYL_BSTREAM_CTL_STYLE_RAW, val_sz ); - phdr->key = cphdr->key; - phdr->info = cphdr->info; - } else { FD_LOG_CRIT(( "corrupt bstream record (seq=%lu cpair_style=%d)", seq, cpair_style )); } @@ -987,28 +933,10 @@ before_credit( fd_vinyl_tile_t * ctx, } - if( FD_UNLIKELY( append_cnt ) ) fd_vinyl_io_commit( io, FD_VINYL_IO_FLAG_BLOCKING ); - if( FD_LIKELY( comp_err<=0 ) ) fd_vinyl_cq_send( cq, comp, req_id, link_id, comp_err, batch_cnt, 
fail_cnt, quota_rem ); client->quota_rem = quota_rem; - /* Update metrics. Derive counters from vinyl locals - - append_cnt is incremented in these places: - - fd_vinyl_case_erase.c (fd_vinyl_io_append_dead, with accum_dead_cnt) - - fd_vinyl_case_move.c (fd_vinyl_io_append_move, with accum_move_cnt) - - fd_vinyl_case_move.c (fd_vinyl_io_append(pair)) - - fd_vinyl_case_release.c (fd_vinyl_io_append_pair_inplace) - - fd_vinyl_case_release.c (fd_vinyl_io_append_dead, with accum_dead_cnt) - - We can thus infer the number of pair blocks appended by - subtracting accum_* */ - - ulong const dead_cnt = accum_dead_cnt - ctx->accum_dead_cnt; - FD_MCNT_INC( ACCDB, BLOCKS_PAIR, append_cnt - dead_cnt ); - FD_MCNT_INC( ACCDB, BLOCKS_DEAD, dead_cnt ); - } ctx->accum_dead_cnt = accum_dead_cnt; @@ -1016,11 +944,49 @@ before_credit( fd_vinyl_tile_t * ctx, ctx->accum_garbage_sz = accum_garbage_sz; } +/* during_frag copies the root xid from the replay_accdb link dcache + into pending_xid (scratch). Committed in after_frag. */ + +static inline void +during_frag( fd_accdb_tile_t * ctx, + ulong in_idx FD_PARAM_UNUSED, + ulong seq FD_PARAM_UNUSED, + ulong sig FD_PARAM_UNUSED, + ulong chunk, + ulong sz, + ulong ctl FD_PARAM_UNUSED ) { + if( FD_UNLIKELY( sz!=sizeof(fd_funk_txn_xid_t) ) ) return; + if( FD_UNLIKELY( chunk<ctx->root_in_chunk0 || chunk>ctx->root_in_wmark ) ) { + FD_LOG_ERR(( "chunk %lu out of range [%lu,%lu]", chunk, ctx->root_in_chunk0, ctx->root_in_wmark )); + } + fd_funk_txn_xid_t const * xid = fd_chunk_to_laddr_const( ctx->root_in_mem, chunk ); + ctx->pending_xid = *xid; +} + +/* after_frag updates root_target_xid with the confirmed frag. + Always consumes immediately — root processing happens in + during_housekeeping using funk's child list.
*/ + +static inline void +after_frag( fd_accdb_tile_t * ctx, + ulong in_idx FD_PARAM_UNUSED, + ulong seq FD_PARAM_UNUSED, + ulong sig FD_PARAM_UNUSED, + ulong sz, + ulong tsorig FD_PARAM_UNUSED, + ulong tspub FD_PARAM_UNUSED, + fd_stem_context_t * stem FD_PARAM_UNUSED ) { + if( FD_UNLIKELY( sz!=sizeof(fd_funk_txn_xid_t) ) ) return; + ctx->root_target_xid = ctx->pending_xid; +} + #define STEM_BURST (1UL) #define STEM_LAZY (10000) /* housekeep every 10 us */ -#define STEM_CALLBACK_CONTEXT_TYPE fd_vinyl_tile_t +#define STEM_CALLBACK_CONTEXT_TYPE fd_accdb_tile_t #define STEM_CALLBACK_CONTEXT_ALIGN fd_vinyl_align() #define STEM_CALLBACK_BEFORE_CREDIT before_credit +#define STEM_CALLBACK_DURING_FRAG during_frag +#define STEM_CALLBACK_AFTER_FRAG after_frag #define STEM_CALLBACK_DURING_HOUSEKEEPING during_housekeeping #define STEM_CALLBACK_METRICS_WRITE metrics_write #define STEM_CALLBACK_SHOULD_SHUTDOWN should_shutdown diff --git a/src/discof/accdb/fd_accdb_tile_cache.c b/src/discof/accdb/fd_accdb_tile_cache.c new file mode 100644 index 00000000000..756e7413dc0 --- /dev/null +++ b/src/discof/accdb/fd_accdb_tile_cache.c @@ -0,0 +1,231 @@ +/* fd_accdb_compact is the accdb tile's version of fd_vinyl_compact. + + It is functionally identical except: + - Lines use obj_gaddr (ulong) instead of obj (pointer). + Resolved via fd_vinyl_data_laddr( gaddr, data->laddr0 ). + - Lines use fd_accdb_line_ctl_ref (24-bit ref with CHANCE/EVICTING + bits) instead of fd_vinyl_line_ctl_ref (32-bit ref). 
*/ + +FD_FN_UNUSED static void +fd_accdb_compact( fd_vinyl_t * vinyl, + ulong compact_max ) { + + fd_vinyl_io_t * io = vinyl->io; + ulong gc_thresh = vinyl->gc_thresh; + int gc_eager = vinyl->gc_eager; + int style = vinyl->style; + + ulong io_seed = fd_vinyl_io_seed ( io ); (void)io_seed; + ulong seq_past = fd_vinyl_io_seq_past ( io ); + ulong seq_present = fd_vinyl_io_seq_present( io ); + + if( FD_UNLIKELY( (!compact_max) | ((seq_present-seq_past)<=gc_thresh) | (gc_eager<0) ) ) return; + + fd_vinyl_meta_t * meta = vinyl->meta; + fd_vinyl_line_t * line = vinyl->line; + ulong line_cnt = vinyl->line_cnt; + ulong garbage_sz = vinyl->garbage_sz; + + fd_vinyl_meta_ele_t * ele0 = meta->ele; + ulong ele_max = meta->ele_max; + ulong meta_seed = meta->seed; + + fd_vinyl_data_t * data = vinyl->data; + + fd_vinyl_data_vol_t * vol = data->vol; (void)vol; + ulong vol_cnt = data->vol_cnt; (void)vol_cnt; + + void * data_laddr0 = data->laddr0; + + ulong seq = seq_past; + + for( ulong rem=compact_max; rem; rem-- ) { + + ulong past_sz_new = fd_vinyl_io_seq_future( io ) - seq; + if( FD_UNLIKELY( (past_sz_new <= gc_thresh ) | + (garbage_sz <= (past_sz_new >> gc_eager)) | + (fd_vinyl_seq_ge( seq, seq_present ) ) ) ) { + FD_CRIT( fd_vinyl_seq_le( seq, seq_present ), "corruption detected" ); + if( FD_UNLIKELY( fd_vinyl_seq_eq( seq, seq_present ) ) ) FD_CRIT( !garbage_sz, "corruption detected" ); + break; + } + + fd_vinyl_bstream_block_t block[1]; + + fd_vinyl_io_read_imm( io, seq, block, FD_VINYL_BSTREAM_BLOCK_SZ ); + + ulong ctl = block->ctl; + + int type = fd_vinyl_bstream_ctl_type( ctl ); + + switch( type ) { + + case FD_VINYL_BSTREAM_CTL_TYPE_PAIR: { + + int pair_style = fd_vinyl_bstream_ctl_style( ctl ); + ulong pair_val_esz = fd_vinyl_bstream_ctl_sz ( ctl ); + fd_vinyl_key_t const * pair_key = &block->phdr.key; + ulong pair_val_sz = (ulong)block->phdr.info.val_sz; + + ulong pair_sz = fd_vinyl_bstream_pair_sz( pair_val_esz ); + + int truncated = (pair_sz > (seq_present - seq)); /* 
Wrapping safe */ + int bad_esz = (pair_val_esz > FD_VINYL_VAL_MAX); + int bad_sz = (pair_val_sz > FD_VINYL_VAL_MAX); + + FD_CRIT( !(truncated | bad_esz | bad_sz), truncated ? "truncated pair" : + bad_esz ? "unexpected pair value encoded size" : + "pair value size too large" ); + +# if FD_PARANOID + fd_vinyl_bstream_block_t _ftr[1]; + fd_vinyl_bstream_block_t * ftr = _ftr; + + if( FD_UNLIKELY( pair_sz <= FD_VINYL_BSTREAM_BLOCK_SZ ) ) ftr = block; + else fd_vinyl_io_read_imm( io, seq + pair_sz - FD_VINYL_BSTREAM_BLOCK_SZ, ftr, FD_VINYL_BSTREAM_BLOCK_SZ ); + + FD_ALERT( !fd_vinyl_bstream_pair_test_fast( io_seed, seq, block, ftr ), "corruption detected" ); +# endif + + ulong pair_memo = fd_vinyl_key_memo( meta_seed, pair_key ); + + ulong _ele_idx; /* avoid pointer escape */ + int err = fd_vinyl_meta_query_fast( ele0, ele_max, pair_key, pair_memo, &_ele_idx ); + ulong ele_idx = _ele_idx; + + if( FD_LIKELY( !err ) ) { + + if( FD_LIKELY( fd_vinyl_meta_ele_in_bstream( &ele0[ ele_idx ] ) ) ) { + + ulong pair_seq = ele0[ ele_idx ].seq; + + if( FD_LIKELY( fd_vinyl_seq_eq( pair_seq, seq ) ) ) { + + FD_CRIT( !memcmp( &ele0[ ele_idx ].phdr, &block->phdr, sizeof(fd_vinyl_bstream_phdr_t) ), "corruption detected" ); + + int pair_style_new; + ulong pair_val_esz_new; + ulong pair_seq_new; + + int do_copy = 1; + + ulong line_idx = ele0[ ele_idx ].line_idx; + + if( FD_LIKELY( line_idx!=ULONG_MAX ) ) { /* Pair is in cache */ + + FD_CRIT( line_idxline_idx==line_idx, "corruption detected" ); + FD_CRIT ( !obj->rd_active, "corruption detected" ); + + ulong line_ctl = line[ line_idx ].ctl; + + if( FD_LIKELY( fd_accdb_line_ctl_ref( line_ctl )>=0L ) ) { /* Pair cached and not acquired for modify */ + + fd_vinyl_bstream_phdr_t * phdr = fd_vinyl_data_obj_phdr( obj ); + + FD_ALERT( !memcmp( phdr, &block->phdr, sizeof(fd_vinyl_bstream_phdr_t) ), "corruption detected" ); + + pair_seq_new = fd_vinyl_io_append_pair_inplace( io, style, phdr, &pair_style_new, &pair_val_esz_new ); + + do_copy = 0; + + } 
+ + } + + if( do_copy ) { /* Pair is either not in cache or acquired for modify, append from the bstream */ + + if( FD_LIKELY( (pair_style!=FD_VINYL_BSTREAM_CTL_STYLE_RAW) | + (style ==FD_VINYL_BSTREAM_CTL_STYLE_RAW) | + (pair_sz ==FD_VINYL_BSTREAM_BLOCK_SZ ) ) ) { + + pair_style_new = pair_style; + pair_val_esz_new = fd_vinyl_bstream_ctl_sz( ele0[ ele_idx ].phdr.ctl ); + pair_seq_new = fd_vinyl_io_copy( io, pair_seq, pair_sz ); + + } else { + + ulong cpair_max = fd_vinyl_bstream_pair_sz( (ulong)LZ4_COMPRESSBOUND( (int)pair_val_sz ) ); + ulong scratch_max = cpair_max + pair_sz; + + fd_vinyl_bstream_phdr_t * cphdr = (fd_vinyl_bstream_phdr_t *) + fd_vinyl_io_alloc( io, scratch_max, FD_VINYL_IO_FLAG_BLOCKING ); + + fd_vinyl_bstream_phdr_t * phdr = (fd_vinyl_bstream_phdr_t *)((ulong)cphdr + cpair_max); + + fd_vinyl_io_read_imm( io, seq, phdr, pair_sz ); + + fd_vinyl_io_trim( io, scratch_max ); + + pair_seq_new = fd_vinyl_io_append_pair_inplace( io, style, phdr, &pair_style_new, &pair_val_esz_new ); + + if( FD_UNLIKELY( pair_style_new==FD_VINYL_BSTREAM_CTL_STYLE_RAW ) ) io->spad_used += scratch_max; + + } + } + + ele0[ ele_idx ].phdr.ctl = fd_vinyl_bstream_ctl( FD_VINYL_BSTREAM_CTL_TYPE_PAIR, pair_style_new, pair_val_esz_new ); + ele0[ ele_idx ].seq = pair_seq_new; + + } else { + + FD_CRIT( fd_vinyl_seq_gt( pair_seq, seq ), "corruption detected" ); + + garbage_sz -= pair_sz; + + } + + } else { + + garbage_sz -= pair_sz; + + } + + } else { + + garbage_sz -= pair_sz; + + } + + seq += pair_sz; + break; + + } + + case FD_VINYL_BSTREAM_CTL_TYPE_DEAD: + case FD_VINYL_BSTREAM_CTL_TYPE_MOVE: + case FD_VINYL_BSTREAM_CTL_TYPE_PART: { + + FD_ALERT( !fd_vinyl_bstream_block_test( io_seed, block ), "corruption detected" ); + + garbage_sz -= FD_VINYL_BSTREAM_BLOCK_SZ; + seq += FD_VINYL_BSTREAM_BLOCK_SZ; + break; + + } + + case FD_VINYL_BSTREAM_CTL_TYPE_ZPAD: { + + FD_ALERT( !fd_vinyl_bstream_zpad_test( io_seed, seq, block ), "corruption detected" ); + + seq += 
FD_VINYL_BSTREAM_BLOCK_SZ; + break; + + } + + default: FD_LOG_CRIT(( "%016lx: unknown type (%x)", seq, (uint)type )); + + } + + } + + fd_vinyl_io_commit( io, FD_VINYL_IO_FLAG_BLOCKING ); + fd_vinyl_io_forget( io, seq ); + + vinyl->garbage_sz = garbage_sz; +} diff --git a/src/discof/accdb/fd_accdb_tile_private.h b/src/discof/accdb/fd_accdb_tile_private.h new file mode 100644 index 00000000000..623ae0ac95d --- /dev/null +++ b/src/discof/accdb/fd_accdb_tile_private.h @@ -0,0 +1,236 @@ +#ifndef HEADER_fd_src_discof_accdb_fd_accdb_tile_private_h +#define HEADER_fd_src_discof_accdb_fd_accdb_tile_private_h + +#include "../../vinyl/fd_vinyl.h" +#include "../../funk/fd_funk.h" +#include "../../util/io_uring/fd_io_uring.h" +#include "fd_accdb_line_ctl.h" + +/* fd_accdb_line_ctl_clear atomically bumps the version, clears + EVICTING and CHANCE, and sets ref to new_ref. Uses a CAS loop + to handle concurrent specreader pin/unpin safely. Any in-flight + specreader ADD/SUBs that race with the CAS simply cause a retry + (the specreader bails on EVICTING and SUBs back immediately). */ + +static inline void +fd_accdb_line_ctl_clear( fd_vinyl_line_t * line, + ulong line_idx, + long new_ref ) { + for(;;) { + ulong cur = FD_VOLATILE_CONST( line[ line_idx ].ctl ); + ulong new = fd_accdb_line_ctl( fd_accdb_line_ctl_ver( cur )+1UL, new_ref ); + if( FD_LIKELY( FD_ATOMIC_CAS( &line[ line_idx ].ctl, cur, new )==cur ) ) return; + FD_SPIN_PAUSE(); + } +} + +#define FD_VINYL_CLIENT_MAX (1024UL) +#define FD_VINYL_REQ_MAX (1024UL) + +struct fd_vinyl_client { + fd_vinyl_rq_t * rq; /* Channel for requests from this client (could be shared by multiple vinyl instances) */ + fd_vinyl_cq_t * cq; /* Channel for completions from this client to this vinyl instance + (could be shared by multiple receivers of completions from this vinyl instance). 
*/ + ulong burst_max; /* Max requests receive from this client at a time */ + ulong seq; /* Sequence number of the next request to receive in the rq */ + ulong link_id; /* Identifies requests from this client to this vinyl instance in the rq */ + ulong laddr0; /* A valid non-zero gaddr from this client maps to the vinyl instance's laddr laddr0 + gaddr ... */ + ulong laddr1; /* ... and thus is in (laddr0,laddr1). A zero gaddr maps to laddr NULL. */ + ulong quota_rem; /* Num of remaining acquisitions this client is allowed on this vinyl instance */ + ulong quota_max; /* Max quota */ +}; + +typedef struct fd_vinyl_client fd_vinyl_client_t; + +struct fd_accdb_tile { + + fd_funk_t funk[1]; + + /* Vinyl objects */ + + fd_vinyl_t vinyl[1]; + void * io_mem; + + /* Tile architecture */ + + uint booted : 1; + uint shutdown : 1; + struct { + ulong state_expected; + ulong volatile const * state; + ulong volatile const * pair_cnt; + /* When booting from genesis only */ + struct { + ulong io_seed; + } from_genesis; + } boot; + + /* I/O */ + + int bstream_fd; + ulong bstream_file_sz; + + /* io_uring */ + + fd_io_uring_t ring[1]; + void * ioring_shmem; /* shared between kernel and user */ + + /* Clients */ + + fd_vinyl_client_t _client[ FD_VINYL_CLIENT_MAX ]; + ulong client_cnt; + ulong client_idx; + + /* Received requests */ + + fd_vinyl_req_t _req[ FD_VINYL_REQ_MAX ]; + ulong req_head; /* Requests [0,req_head) have been processed */ + ulong req_tail; /* Requests [req_head,req_tail) are pending */ + /* Requests [req_tail,ULONG_MAX) have not been received */ + ulong exec_max; + + /* accum_dead_cnt is the number of dead blocks that have been + written since the last partition block. + + accum_move_cnt is the number of move blocks that have been + written since this last partition block. + + accum_garbage_cnt / sz is the number of items / bytes garbage in + the bstream that have accumulated since the last time we compacted + the bstream. 
We use this to estimate the number of rounds of + compaction to do in async handling. */ + + ulong accum_dead_cnt; + ulong accum_garbage_cnt; + ulong accum_garbage_sz; + + /* Run loop state */ + + ulong seq_part; + + /* Periodic syncing */ + + long sync_next_ns; + + /* Vinyl limit on the number of pairs the meta map will accept. + Exceeding this limit will trigger a LOG_ERR. */ + ulong pair_cnt_limit; + + uint clock_hand; /* CLOCK sweep position, in [0,line_cnt) */ + int root_populate_cache; /* If non-zero, root_batch copies rooted pairs into cache with least priority */ + + /* Rooting — the replay tile sends root target xids via stem link. + The accdb tile consumes them immediately (after_frag) and walks + funk's child list in during_housekeeping to find the oldest unrooted + txn, publishing it to vinyl subject to write_delay_slots. */ + + fd_funk_txn_t * root_txn; /* txn being rooted, NULL if idle */ + fd_funk_rec_t * root_rec; /* next rec head for root_batch, NULL if done */ + ulong root_txn_idx; /* index of root_txn in txn_pool */ + + fd_funk_txn_xid_t root_target_xid; /* newest root xid from replay; ul[0]==ULONG_MAX means none received yet */ + ulong write_delay_slots; + + /* Stem input link for root messages from replay */ + + fd_wksp_t * root_in_mem; + ulong root_in_chunk0; + ulong root_in_wmark; + + /* Scratch for during_frag → after_frag handoff */ + + fd_funk_txn_xid_t pending_xid; +}; + +typedef struct fd_accdb_tile fd_accdb_tile_t; + +FD_PROTOTYPES_BEGIN + +/* fd_accdb_clock_evict uses a CLOCK sweep to select and evict a + cache line. Scans from clock_hand (mod line_cnt), giving each + unreferenced line with chance==1 a "second chance" (clearing chance + to 0) and spinning until it finds an unreferenced line with + chance==0 that it can claim via CAS. Frees the data obj, + disconnects meta, and bumps the version inline. Returns the + evicted line_idx. 
*/ + +static inline ulong +fd_accdb_clock_evict( fd_accdb_tile_t * ctx, + fd_vinyl_line_t * line, + ulong line_cnt, + fd_vinyl_meta_ele_t * ele0, + ulong ele_max, + fd_vinyl_data_t * data ) { + uint hand = ctx->clock_hand; + + for(;;) { + + ulong hand_ctl = line[ hand ].ctl; + + if( FD_LIKELY( !fd_accdb_line_ctl_ref( hand_ctl ) ) ) { + + if( FD_UNLIKELY( hand_ctl & FD_ACCDB_LINE_CTL_CHANCE ) ) { + FD_ATOMIC_FETCH_AND_AND( &line[ hand ].ctl, + ~FD_ACCDB_LINE_CTL_CHANCE ); + hand = (uint)((hand+1U<(uint)line_cnt) ? hand+1U : 0U); + continue; + } + + /* Try to claim for eviction via CAS. CAS proves ref==0 at + this instant. */ + if( FD_LIKELY( FD_ATOMIC_CAS( &line[ hand ].ctl, + hand_ctl, + hand_ctl | FD_ACCDB_LINE_CTL_EVICTING )==hand_ctl ) ) { + + /* Drain any specread pins that raced with the EVICTING CAS. + A specread that did FETCH_AND_ADD after our CAS will see + EVICTING in old_ctl and immediately FETCH_AND_SUB back. + We must wait for that SUB to land before ctl_clear, which + would otherwise capture the transient +1 ref in its own + CAS and leave ref at -1 after the specread's SUB. */ + while( FD_UNLIKELY( fd_accdb_line_ctl_ref( + FD_VOLATILE_CONST( line[ hand ].ctl ) ) > 0L ) ) { + FD_SPIN_PAUSE(); + } + + break; + } + } + + hand = (uint)((hand+1U<(uint)line_cnt) ? hand+1U : 0U); + } + + ctx->clock_hand = (uint)((hand+1U<(uint)line_cnt) ? 
hand+1U : 0U); + + /* Evict: free data obj, disconnect meta */ + + void * data_laddr0 = data->laddr0; + ulong obj_gaddr = line[ hand ].obj_gaddr; + ulong ele_idx = line[ hand ].ele_idx; + + if( FD_LIKELY( obj_gaddr ) ) { + FD_LOG_ERR(( "evicting obj_gaddr=%lu", obj_gaddr )); + fd_vinyl_data_obj_t * obj = fd_vinyl_data_laddr( obj_gaddr, data_laddr0 ); + FD_CRIT( obj->line_idx==(ulong)hand, "corruption detected" ); + FD_CRIT( !obj->rd_active, "corruption detected" ); + fd_vinyl_data_free( data, obj ); + line[ hand ].obj_gaddr = 0UL; + } + + if( FD_LIKELY( ele_idxrec_pool->ele ); + ulong volatile * vl = &funk->rec_lock[ rec_idx ]; + ulong const ver_lock = FD_VOLATILE_CONST( *vl ); + ulong const ver = fd_funk_rec_ver_bits ( ver_lock ); + ulong const lock = fd_funk_rec_lock_bits( ver_lock ); + if( FD_UNLIKELY( lock ) ) { + /* Active readers — yield to caller */ + return ULONG_MAX; + } + ulong const new_ver = fd_funk_rec_ver_inc( ver ); + ulong const new_vl = fd_funk_rec_ver_lock( new_ver, FD_FUNK_REC_LOCK_MASK ); + if( FD_UNLIKELY( FD_ATOMIC_CAS( vl, ver_lock, new_vl )!=ver_lock ) ) { + /* CAS failed (race with another lock operation) — yield to caller */ + return ULONG_MAX; + } + return new_vl; +} + +static void +fd_funk_rec_admin_unlock( fd_funk_t const * funk, + fd_funk_rec_t * rec, + ulong ver_lock ) { + ulong rec_idx = (ulong)( rec - funk->rec_pool->ele ); + ulong volatile * vl = &funk->rec_lock[ rec_idx ]; + FD_VOLATILE( *vl ) = fd_funk_rec_ver_lock( fd_funk_rec_ver_bits( ver_lock ), 0UL ); +} + +/* funk_free_rec_locked frees a funk record that already has the admin + lock held (ver_lock from fd_funk_rec_admin_lock). 
*/ + +static void +funk_free_rec_locked( fd_funk_t * funk, + fd_funk_rec_t * rec, + ulong ver_lock ) { + memset( &rec->pair, 0, sizeof(fd_funk_xid_key_pair_t) ); + FD_COMPILER_MFENCE(); + rec->map_next = FD_FUNK_REC_IDX_NULL; + fd_funk_val_flush( rec, funk->alloc, funk->wksp ); + fd_funk_rec_admin_unlock( funk, rec, ver_lock ); + fd_funk_rec_pool_release( funk->rec_pool, rec, 1 ); +} + +/* funk_free_rec attempts to admin-lock and free a funk record. + Returns 0 on success, 1 if the lock could not be acquired (active + readers). On failure the caller should retry later. */ + +static int +funk_free_rec( fd_funk_t * funk, + fd_funk_rec_t * rec ) { + FD_COMPILER_MFENCE(); + ulong ver_lock = fd_funk_rec_admin_lock( funk, rec ); + if( FD_UNLIKELY( ver_lock==ULONG_MAX ) ) return 1; + funk_free_rec_locked( funk, rec, ver_lock ); + return 0; +} + +/* funk_gc_chain optimistically deletes all but the newest rooted + revisions of rec. This possibly deletes 'rec'. Returns rec if rec + is the only known rooted revision, otherwise returns NULL (if rec was + deleted). Note that due to edge cases, revisions that are not in the + oldest tracked slot, may not reliably get cleaned up. (The oldest + tracked slot always gets cleaned up, though.) 
*/ + +static fd_funk_rec_t * +funk_gc_chain( ulong root_slot, + fd_funk_t * funk, + fd_funk_rec_t * const rec ) { + + fd_funk_rec_t * rec_pool = funk->rec_pool->ele; + ulong rec_max = funk->rec_pool->ele_max; + ulong seed = funk->rec_map->map->seed; + ulong chain_cnt = funk->rec_map->map->chain_cnt; + + ulong hash = fd_funk_rec_map_key_hash( &rec->pair, seed ); + ulong chain_idx = (hash & (chain_cnt-1UL) ); + + /* Lock rec_map chain */ + + int lock_err = fd_funk_rec_map_iter_lock( funk->rec_map, &chain_idx, 1UL, FD_MAP_FLAG_BLOCKING ); + if( FD_UNLIKELY( lock_err!=FD_MAP_SUCCESS ) ) { + FD_LOG_CRIT(( "fd_funk_rec_map_iter_lock failed (%i-%s)", lock_err, fd_map_strerror( lock_err ) )); + } + + fd_funk_rec_map_shmem_private_chain_t * chain = + fd_funk_rec_map_shmem_private_chain( funk->rec_map->map, 0UL ) + chain_idx; + ulong ver = + fd_funk_rec_map_private_vcnt_ver( FD_VOLATILE_CONST( chain->ver_cnt ) ); + FD_CRIT( ver&1UL, "chain is not locked" ); + + /* Walk map chain */ + + fd_funk_rec_t * found_rec = NULL; + uint * pnext = &chain->head_cidx; + uint cur = *pnext; + ulong chain_len = 0UL; + ulong iter = 0UL; + while( cur!=FD_FUNK_REC_IDX_NULL ) { + if( FD_UNLIKELY( iter++ > rec_max ) ) FD_LOG_CRIT(( "cycle detected in rec_map chain %lu", chain_idx )); + + /* Is this node garbage? */ + + fd_funk_rec_t * node = &funk->rec_pool->ele[ cur ]; + if( FD_UNLIKELY( cur==node->map_next ) ) FD_LOG_CRIT(( "accdb corruption detected: cycle in rec_map chain %lu", chain_idx )); + cur = node->map_next; + if( !fd_funk_rec_key_eq( rec->pair.key, node->pair.key ) ) goto retain; + if( node->pair.xid->ul[0]>root_slot ) goto retain; + if( !found_rec ) { + found_rec = node; + goto retain; + } + + /* No longer need this node */ + + if( node->pair.xid->ul[0] > rec->pair.xid->ul[0] ) { + /* If this node is newer than the to-be-deleted slot, need to + remove it from the transaction's record list. 
*/ + uint neigh_prev = node->prev_idx; + uint neigh_next = node->next_idx; + if( neigh_prev==FD_FUNK_REC_IDX_NULL || + neigh_next==FD_FUNK_REC_IDX_NULL ) { + /* Node is first or last of transaction -- too bothersome to + remove it from the transaction's record list */ + goto retain; + } + rec_pool[ neigh_next ].prev_idx = neigh_prev; + rec_pool[ neigh_prev ].next_idx = neigh_next; + } + + /* Destroy this node (skip if lock is contended — will retry + on the next root batch) */ + + if( FD_UNLIKELY( funk_free_rec( funk, node ) ) ) goto retain; + *pnext = cur; + continue; + + retain: + pnext = &node->map_next; + chain_len++; + } + + /* Unlock rec_map chain */ + + FD_COMPILER_MFENCE(); + FD_VOLATILE( chain->ver_cnt ) = + fd_funk_rec_map_private_vcnt( ver+1UL, chain_len ); + FD_COMPILER_MFENCE(); + return found_rec==rec ? found_rec : NULL; +} + +/* accdb_invalidate_line sets the EVICTING flag on a cached line, + checks that all specread pins have drained, then frees the data obj, + disconnects the line from meta, and bumps the version via CAS. + Returns 0 on success, 1 if specread refs are still active (caller + should retry later). Caller must be the vinyl tile (single + writer). */ + +static int +accdb_invalidate_line( fd_vinyl_line_t * line, + fd_vinyl_meta_ele_t * ele0, + fd_vinyl_data_t * data, + ulong line_idx, + ulong ele_idx ) { + + /* Must not be acquired for modify by a vinyl client. Transient + specread pins (ref > 0) are OK — the EVICTING flag below will + cause them to bail. */ + FD_CRIT( fd_accdb_line_ctl_ref( line[ line_idx ].ctl ) >= 0L, + "cannot invalidate line acquired for modify" ); + + /* Set EVICTING — new specreaders will see it and bail */ + FD_ATOMIC_FETCH_AND_OR( &line[ line_idx ].ctl, + FD_ACCDB_LINE_CTL_EVICTING ); + + /* Check if existing specread refs have drained. If not, undo + EVICTING and yield to caller so accdb can service requests. 
*/ + if( FD_UNLIKELY( fd_accdb_line_ctl_ref( FD_VOLATILE_CONST( line[ line_idx ].ctl ) ) > 0L ) ) { + FD_ATOMIC_FETCH_AND_AND( &line[ line_idx ].ctl, + ~FD_ACCDB_LINE_CTL_EVICTING ); + return 1; + } + + /* Free data obj */ + ulong obj_gaddr = line[ line_idx ].obj_gaddr; + if( FD_LIKELY( obj_gaddr ) ) { + fd_vinyl_data_obj_t * obj = fd_vinyl_data_laddr( obj_gaddr, data->laddr0 ); + fd_vinyl_data_free( data, obj ); + line[ line_idx ].obj_gaddr = 0UL; + } + + /* Disconnect line <-> meta */ + ele0[ ele_idx ].line_idx = ULONG_MAX; + line[ line_idx ].ele_idx = ULONG_MAX; + + /* Bump version, clear EVICTING via CAS */ + fd_accdb_line_ctl_clear( line, line_idx, 0L ); + return 0; +} + +/* accdb_populate_line evicts a cache line via CLOCK sweep, allocates + a data object, copies the pair into it, and returns the new + line_idx. The line is inserted with least eviction priority (no + CHANCE bit) so the CLOCK sweep will reclaim it first. Returns + ULONG_MAX if the data allocation fails (the evicted line is left + disconnected). Caller must set ele0[ele_idx].line_idx to the + returned value. 
*/ + +static ulong +accdb_populate_line( fd_accdb_tile_t * ctx, + fd_vinyl_line_t * line, + ulong line_cnt, + fd_vinyl_meta_ele_t * ele0, + ulong ele_max, + fd_vinyl_data_t * data, + ulong ele_idx, + fd_vinyl_key_t const * key, + fd_vinyl_info_t const * info, + void const * val, + ulong val_sz ) { + + void * data_laddr0 = data->laddr0; + + ulong new_line_idx = fd_accdb_clock_evict( ctx, line, line_cnt, ele0, ele_max, data ); + + ulong szc = fd_vinyl_data_szc( val_sz ); + fd_vinyl_data_obj_t * obj = fd_vinyl_data_alloc( data, szc ); + if( FD_UNLIKELY( !obj ) ) return ULONG_MAX; + + line[ new_line_idx ].obj_gaddr = fd_vinyl_data_gaddr( obj, data_laddr0 ); + line[ new_line_idx ].ele_idx = ele_idx; + obj->line_idx = new_line_idx; + obj->rd_active = (short)0; + + fd_vinyl_bstream_phdr_t * phdr = fd_vinyl_data_obj_phdr( obj ); + phdr->ctl = fd_vinyl_bstream_ctl( FD_VINYL_BSTREAM_CTL_TYPE_PAIR, + FD_VINYL_BSTREAM_CTL_STYLE_RAW, + val_sz ); + phdr->key = *key; + phdr->info = *info; + fd_memcpy( fd_vinyl_data_obj_val( obj ), val, val_sz ); + + /* No CHANCE bit — least eviction priority */ + return new_line_idx; +} + +fd_funk_rec_t * +fd_accdb_v2_root_batch( fd_accdb_tile_t * accdb, + fd_funk_rec_t * rec0 ) { + fd_funk_t * funk = accdb->funk; + fd_wksp_t * funk_wksp = funk->wksp; /* shm workspace containing unrooted accounts */ + fd_funk_rec_t * rec_pool = funk->rec_pool->ele; /* funk rec arena */ + + fd_vinyl_t * vinyl = accdb->vinyl; + fd_vinyl_io_t * io = vinyl->io; + fd_vinyl_meta_t * meta = vinyl->meta; + fd_vinyl_line_t * line = vinyl->line; + fd_vinyl_data_t * data = vinyl->data; + + fd_vinyl_meta_ele_t * ele0 = meta->ele; + ulong ele_max = meta->ele_max; + ulong meta_seed = meta->seed; + ulong * lock = meta->lock; + int lock_shift = meta->lock_shift; + ulong line_cnt = vinyl->line_cnt; + + ulong append_cnt = 0UL; + ulong root_slot = funk->shmem->last_publish->ul[0]; + + /* Collect funk request batch */ + + fd_funk_rec_t * recs[ FD_ACCDB_ROOT_BATCH_MAX ]; + ulong 
rec_cnt; + + fd_funk_rec_t * next = rec0; + for( rec_cnt=0UL; next && rec_cntnext_idx ) ) { + next = NULL; + } else { + next = &rec_pool[ cur->next_idx ]; + } + cur->prev_idx = FD_FUNK_REC_IDX_NULL; + cur->next_idx = FD_FUNK_REC_IDX_NULL; + + if( funk_gc_chain( root_slot, funk, cur ) ) { + recs[ rec_cnt++ ] = cur; + } + } + + for( ulong i=0UL; ival_sz>=sizeof(fd_account_meta_t), "corrupt funk_rec" ); + + fd_vinyl_key_t const * key = + (fd_vinyl_key_t const *)fd_funk_rec_key( recs[ i ] ); + ulong memo = fd_vinyl_key_memo( meta_seed, key ); + + ulong ele_idx; + int found = fd_vinyl_meta_query_fast( ele0, ele_max, key, memo, + &ele_idx ); + + if( acct->lamports ) { + /* --- Append pair block --- */ + + ulong val_sz = (ulong)recs[ i ]->val_sz; + + fd_vinyl_info_t info; + memset( &info, 0, sizeof(fd_vinyl_info_t) ); + info.val_sz = (uint)val_sz; + + if( FD_LIKELY( !found ) ) { + /* Existing key — overwrite */ + + /* Invalidate cache if cached */ + ulong cur_line_idx = ele0[ ele_idx ].line_idx; + if( FD_LIKELY( cur_line_idx!=ULONG_MAX ) ) { + if( FD_UNLIKELY( accdb_invalidate_line( line, ele0, data, cur_line_idx, ele_idx ) ) ) + goto skip_rec; + } + + /* Garbage accounting for old pair */ + ulong val_esz_before = + fd_vinyl_bstream_ctl_sz( ele0[ ele_idx ].phdr.ctl ); + accdb->accum_garbage_cnt++; + accdb->accum_garbage_sz += + fd_vinyl_bstream_pair_sz( val_esz_before ); + + /* Append new pair to bstream */ + ulong seq = fd_vinyl_io_append_pair_raw( io, key, &info, + (void const *)acct ); + append_cnt++; + + /* Optionally copy into cache with least eviction priority */ + ulong new_line_idx = ULONG_MAX; + if( FD_LIKELY( accdb->root_populate_cache ) ) { + new_line_idx = accdb_populate_line( accdb, line, line_cnt, + ele0, ele_max, data, + ele_idx, key, &info, + (void const *)acct, val_sz ); + } + + /* Update meta (prepare/publish for existing element) */ + fd_vinyl_meta_prepare_fast( lock, lock_shift, ele_idx ); + + ele0[ ele_idx ].phdr.ctl = + fd_vinyl_bstream_ctl( 
FD_VINYL_BSTREAM_CTL_TYPE_PAIR, + FD_VINYL_BSTREAM_CTL_STYLE_RAW, + val_sz ); + ele0[ ele_idx ].phdr.info = info; + ele0[ ele_idx ].seq = seq; + ele0[ ele_idx ].line_idx = new_line_idx; + + fd_vinyl_meta_publish_fast( lock, lock_shift, ele_idx ); + + } else { + /* New key — insert */ + + /* Append to bstream first (need seq for meta) */ + ulong seq = fd_vinyl_io_append_pair_raw( io, key, &info, + (void const *)acct ); + append_cnt++; + + /* Optionally copy into cache with least eviction priority */ + ulong new_line_idx = ULONG_MAX; + if( FD_LIKELY( accdb->root_populate_cache ) ) { + new_line_idx = accdb_populate_line( accdb, line, line_cnt, + ele0, ele_max, data, + ele_idx, key, &info, + (void const *)acct, val_sz ); + } + + /* Insert into meta at the empty slot. Per meta.h safety tip: + "Inserting without doing a prepare is fine so long as + phdr.ctl becomes visible last." */ + + ele0[ ele_idx ].memo = memo; + ele0[ ele_idx ].phdr.key = *key; + ele0[ ele_idx ].phdr.info = info; + ele0[ ele_idx ].seq = seq; + ele0[ ele_idx ].line_idx = new_line_idx; + FD_COMPILER_MFENCE(); + ele0[ ele_idx ].phdr.ctl = + fd_vinyl_bstream_ctl( FD_VINYL_BSTREAM_CTL_TYPE_PAIR, + FD_VINYL_BSTREAM_CTL_STYLE_RAW, + val_sz ); + FD_COMPILER_MFENCE(); + + vinyl->pair_cnt++; + } + + } else { + /* --- Append erase block --- */ + + if( FD_LIKELY( !found ) ) { + /* Key exists in meta — erase it */ + + FD_CRIT( ele0[ ele_idx ].phdr.ctl!=ULONG_MAX, + "cannot erase key being created" ); + + /* Invalidate cache if cached */ + ulong cur_line_idx = ele0[ ele_idx ].line_idx; + if( FD_LIKELY( cur_line_idx!=ULONG_MAX ) ) { + if( FD_UNLIKELY( accdb_invalidate_line( line, ele0, data, cur_line_idx, ele_idx ) ) ) + goto skip_rec; + } + + /* Garbage: old pair + dead block itself */ + ulong val_esz_before = + fd_vinyl_bstream_ctl_sz( ele0[ ele_idx ].phdr.ctl ); + accdb->accum_garbage_cnt += 2UL; + accdb->accum_garbage_sz += + fd_vinyl_bstream_pair_sz( val_esz_before ) + + FD_VINYL_BSTREAM_BLOCK_SZ; + + 
fd_vinyl_io_append_dead( io, &ele0[ ele_idx ].phdr, NULL, 0UL ); + append_cnt++; + accdb->accum_dead_cnt++; + + /* Remove from meta (handles its own locking) */ + fd_vinyl_meta_remove_fast( ele0, ele_max, lock, lock_shift, + line, line_cnt, ele_idx ); + vinyl->pair_cnt--; + } + /* else: erase of non-existent key — no-op */ + } + continue; + + skip_rec: + /* Cache line has active specread refs — re-chain record for + next batch. The vinyl write will be harmlessly re-done. */ + FD_LOG_NOTICE(( "vinyl data contention" )); + recs[ i ]->next_idx = next ? (uint)(ulong)( next - rec_pool ) : FD_FUNK_REC_IDX_NULL; + next = recs[ i ]; + recs[ i ] = NULL; + } + + /* Commit result */ + if( FD_LIKELY( append_cnt ) ) { + fd_vinyl_io_commit( io, FD_VINYL_IO_FLAG_BLOCKING ); + } + + /* Remove funk records. Try admin lock first — if contended + (active readers), skip the record and re-chain it onto next for + the next batch. The vinyl write will be harmlessly re-done. */ + + for( ulong i=0UL; inext_idx = next ? (uint)(ulong)( next - rec_pool ) : FD_FUNK_REC_IDX_NULL; + next = recs[ i ]; + continue; + } + fd_funk_xid_key_pair_t pair = recs[ i ]->pair; + fd_funk_rec_query_t query[1]; + int rm_err = fd_funk_rec_map_remove( funk->rec_map, &pair, NULL, query, FD_MAP_FLAG_BLOCKING ); + if( FD_UNLIKELY( rm_err!=FD_MAP_SUCCESS ) ) FD_LOG_CRIT(( "fd_funk_rec_map_remove failed (%i-%s)", rm_err, fd_map_strerror( rm_err ) )); + funk_free_rec_locked( funk, recs[ i ], ver_lock ); + } + + return next; +} + +/* fd_accdb_txn_root_start prepares a funk transaction for rooting on + the accdb tile. This does: + 1. Reparent children of the txn to root (shmem child_head/tail) + 2. Mark last_publish atomically + 3. Drain users (rwlock_write + set state=PUBLISH) + 4. Detach rec list from txn + Returns the head of the detached record list, or NULL if the txn + has no records. Caller stores the returned head as root_rec and + the txn pool index as root_txn_idx for later use. 
*/ + +fd_funk_rec_t * +fd_accdb_txn_root_start( fd_accdb_tile_t * ctx, + fd_funk_txn_t * txn ) { + fd_funk_t * funk = ctx->funk; + + /* Phase 1: Reparent children to root */ + + funk->shmem->child_head_cidx = txn->child_head_cidx; + funk->shmem->child_tail_cidx = txn->child_tail_cidx; + ulong child_idx = fd_funk_txn_idx( txn->child_head_cidx ); + while( !fd_funk_txn_idx_is_null( child_idx ) ) { + funk->txn_pool->ele[ child_idx ].parent_cidx = fd_funk_txn_cidx( FD_FUNK_TXN_IDX_NULL ); + child_idx = fd_funk_txn_idx( funk->txn_pool->ele[ child_idx ].sibling_next_cidx ); + } + + /* Phase 2: Mark as last published */ + + fd_funk_txn_xid_t xid[1]; + fd_funk_txn_xid_copy( xid, fd_funk_txn_xid( txn ) ); + fd_funk_txn_xid_st_atomic( funk->shmem->last_publish, xid ); + FD_LOG_INFO(( "accdb tile root_start xid %lu:%lu", xid->ul[0], xid->ul[1] )); + + /* Phase 3: Drain users */ + + ulong txn_idx = (ulong)( txn - funk->txn_pool->ele ); + fd_rwlock_write( &funk->txn_lock[ txn_idx ] ); + FD_VOLATILE( txn->state ) = FD_FUNK_TXN_STATE_PUBLISH; + + /* Phase 4: Detach record list */ + + fd_funk_rec_t * head = NULL; + if( !fd_funk_rec_idx_is_null( txn->rec_head_idx ) ) { + head = &funk->rec_pool->ele[ txn->rec_head_idx ]; + } + txn->rec_head_idx = FD_FUNK_REC_IDX_NULL; + txn->rec_tail_idx = FD_FUNK_REC_IDX_NULL; + + return head; +} + +/* fd_accdb_txn_root_fini completes rooting of a funk transaction. + Called after all record batches have been migrated. Removes the + txn from the txn_map, releases the rwlock, and frees the txn. 
*/ + +void +fd_accdb_txn_root_fini( fd_accdb_tile_t * ctx, + fd_funk_txn_t * txn, + ulong txn_idx ) { + fd_funk_t * funk = ctx->funk; + + /* Phase 5: Remove txn from txn_map */ + + fd_funk_txn_xid_t xid[1]; + fd_funk_txn_xid_copy( xid, fd_funk_txn_xid( txn ) ); + fd_funk_txn_map_query_t query[1]; + int rm_err = fd_funk_txn_map_remove( funk->txn_map, xid, NULL, query, 0 ); + if( FD_UNLIKELY( rm_err!=FD_MAP_SUCCESS ) ) { + FD_LOG_CRIT(( "txn_map_remove failed xid=%lu:%lu: %i-%s", + xid->ul[0], xid->ul[1], rm_err, fd_map_strerror( rm_err ) )); + } + + /* Phase 6: Free txn */ + + fd_rwlock_unwrite( &funk->txn_lock[ txn_idx ] ); + FD_VOLATILE( txn->state ) = FD_FUNK_TXN_STATE_FREE; + txn->parent_cidx = UINT_MAX; + txn->sibling_prev_cidx = UINT_MAX; + txn->sibling_next_cidx = UINT_MAX; + txn->child_head_cidx = UINT_MAX; + txn->child_tail_cidx = UINT_MAX; + fd_funk_txn_pool_release( funk->txn_pool, txn, 1 ); + + FD_LOG_INFO(( "accdb tile root_fini xid %lu:%lu", xid->ul[0], xid->ul[1] )); +} diff --git a/src/discof/execle/fd_execle_tile.c b/src/discof/execle/fd_execle_tile.c index 10fbed15e21..55815aafa14 100644 --- a/src/discof/execle/fd_execle_tile.c +++ b/src/discof/execle/fd_execle_tile.c @@ -98,6 +98,14 @@ metrics_write( fd_execle_tile_t * ctx ) { FD_MCNT_ENUM_COPY( EXECLE, TRANSACTION_LANDED, ctx->metrics.txn_landed ); FD_MCNT_SET( EXECLE, COMPUTE_UNITS_TOTAL, ctx->runtime->metrics.cu_cum ); + + fd_accdb_user_t * accdb = ctx->accdb; + FD_MCNT_SET( EXECLE, ACCDB_LOOKUP_FUNK, accdb->base.lookup_funk ); + FD_MCNT_SET( EXECLE, ACCDB_LOOKUP_SPECRD, accdb->base.lookup_specrd ); + FD_MCNT_SET( EXECLE, ACCDB_LOOKUP_ACCDB, accdb->base.lookup_accdb ); + FD_MCNT_SET( EXECLE, ACCDB_DT_FUNK, (ulong)accdb->base.dt_funk ); + FD_MCNT_SET( EXECLE, ACCDB_DT_SPECRD, (ulong)accdb->base.dt_specrd ); + FD_MCNT_SET( EXECLE, ACCDB_DT_VINYL, (ulong)accdb->base.dt_vinyl ); } static int diff --git a/src/discof/execrp/fd_execrp_tile.c b/src/discof/execrp/fd_execrp_tile.c index 
67e03abc6af..96efb6bf18b 100644 --- a/src/discof/execrp/fd_execrp_tile.c +++ b/src/discof/execrp/fd_execrp_tile.c @@ -138,7 +138,13 @@ metrics_write( fd_execrp_tile_t * ctx ) { FD_MCNT_SET( EXECRP, VM_REGIME_INTERPRETER, exec_ticks ); fd_accdb_user_t * accdb = ctx->accdb; - FD_MCNT_SET( EXECRP, ACCDB_CREATED, accdb->base.created_cnt ); + FD_MCNT_SET( EXECRP, ACCDB_CREATED, accdb->base.created_cnt ); + FD_MCNT_SET( EXECRP, ACCDB_LOOKUP_FUNK, accdb->base.lookup_funk ); + FD_MCNT_SET( EXECRP, ACCDB_LOOKUP_SPECRD, accdb->base.lookup_specrd ); + FD_MCNT_SET( EXECRP, ACCDB_LOOKUP_ACCDB, accdb->base.lookup_accdb ); + FD_MCNT_SET( EXECRP, ACCDB_DT_FUNK, (ulong)accdb->base.dt_funk ); + FD_MCNT_SET( EXECRP, ACCDB_DT_SPECRD, (ulong)accdb->base.dt_specrd ); + FD_MCNT_SET( EXECRP, ACCDB_DT_VINYL, (ulong)accdb->base.dt_vinyl ); FD_STATIC_ASSERT( sizeof(runtime->metrics.txn_account_save)/sizeof(ulong)==FD_METRICS_ENUM_ACCOUNT_CHANGE_CNT, enum ); FD_MCNT_ENUM_COPY( EXECRP, TXN_ACCOUNT_CHANGES, runtime->metrics.txn_account_save ); diff --git a/src/discof/fd_accdb_topo.c b/src/discof/fd_accdb_topo.c index d1dad5ebf4b..72669742b1f 100644 --- a/src/discof/fd_accdb_topo.c +++ b/src/discof/fd_accdb_topo.c @@ -3,6 +3,7 @@ #include "../flamenco/accdb/fd_accdb_impl_v2.h" #include "../flamenco/progcache/fd_progcache_user.h" #include "../util/pod/fd_pod.h" +#include "../util/pod/fd_pod_format.h" void fd_accdb_init_from_topo( fd_accdb_user_t * accdb, @@ -22,14 +23,29 @@ fd_accdb_init_from_topo( fd_accdb_user_t * accdb, } else { fd_topo_obj_t const * vinyl_rq = fd_topo_find_tile_obj( topo, tile, "vinyl_rq" ); fd_topo_obj_t const * vinyl_req_pool = fd_topo_find_tile_obj( topo, tile, "vinyl_rpool" ); + fd_topo_obj_t const * vinyl_line = fd_topo_find_tile_obj( topo, tile, "vinyl_line" ); FD_TEST( fd_accdb_user_v2_init( accdb, fd_topo_obj_laddr( topo, funk_obj_id ), fd_topo_obj_laddr( topo, locks_obj_id ), fd_topo_obj_laddr( topo, vinyl_rq->id ), topo->workspaces[ vinyl_data->wksp_id ].wksp, 
fd_topo_obj_laddr( topo, vinyl_req_pool->id ), + fd_topo_obj_laddr( topo, vinyl_line->id ), vinyl_rq->id, max_depth ) ); + + /* Enable speculative reads if the tile has access to meta/ele */ + fd_topo_obj_t const * vinyl_meta = fd_topo_find_tile_obj( topo, tile, "vinyl_meta" ); + fd_topo_obj_t const * vinyl_ele = fd_topo_find_tile_obj( topo, tile, "vinyl_meta_e" ); + if( vinyl_meta && vinyl_ele && vinyl_line ) { + ulong line_cnt = fd_pod_queryf_ulong( topo->props, 0UL, + "obj.%lu.line_cnt", vinyl_line->id ); + fd_accdb_user_v2_init_cache( accdb, + fd_topo_obj_laddr( topo, vinyl_meta->id ), + fd_topo_obj_laddr( topo, vinyl_ele->id ), + fd_topo_obj_laddr( topo, vinyl_line->id ), + line_cnt ); + } } } diff --git a/src/discof/genesis/fd_genesi_tile.c b/src/discof/genesis/fd_genesi_tile.c index 2f301cd34b2..ba632a69660 100644 --- a/src/discof/genesis/fd_genesi_tile.c +++ b/src/discof/genesis/fd_genesi_tile.c @@ -476,18 +476,10 @@ unprivileged_init( fd_topo_t * topo, if( !vinyl_data ) { FD_TEST( fd_accdb_admin_v1_init( ctx->accdb_admin, fd_topo_obj_laddr( topo, funk_obj_id ), fd_topo_obj_laddr( topo, funk_locks_obj_id ) ) ); } else { - fd_topo_obj_t const * vinyl_rq = fd_topo_find_tile_obj( topo, tile, "vinyl_rq" ); - fd_topo_obj_t const * vinyl_req_pool = fd_topo_find_tile_obj( topo, tile, "vinyl_rpool" ); - FD_TEST( vinyl_rq ); - FD_TEST( vinyl_req_pool ); FD_TEST( fd_accdb_admin_v2_init( ctx->accdb_admin, fd_topo_obj_laddr( topo, funk_obj_id ), - fd_topo_obj_laddr( topo, funk_locks_obj_id ), - fd_topo_obj_laddr( topo, vinyl_rq->id ), - topo->workspaces[ vinyl_data->wksp_id ].wksp, - fd_topo_obj_laddr( topo, vinyl_req_pool->id ), - vinyl_rq->id, - tile->genesi.accdb_max_depth ) ); + fd_topo_obj_laddr( topo, funk_locks_obj_id ) ) ); + fd_accdb_admin_v2_max_depth_set( ctx->accdb_admin, tile->genesi.accdb_max_depth ); } fd_accdb_init_from_topo( ctx->accdb, topo, tile, tile->genesi.accdb_max_depth ); diff --git a/src/discof/replay/fd_replay_tile.c 
b/src/discof/replay/fd_replay_tile.c index d28507aeeaa..678854a1dbc 100644 --- a/src/discof/replay/fd_replay_tile.c +++ b/src/discof/replay/fd_replay_tile.c @@ -417,6 +417,8 @@ struct fd_replay_tile { fd_replay_out_link_t epoch_out[1]; + fd_replay_out_link_t accdb_out[1]; + /* The gui tile needs to reliably own a reference to the most recent completed active bank. Replay needs to know if the gui as a consumer is enabled so it can increment the bank's refcnt before @@ -548,17 +550,15 @@ metrics_write( fd_replay_tile_t * ctx ) { FD_MCNT_SET( REPLAY, PROGCACHE_ROOTED, ctx->progcache_admin->metrics.root_cnt ); FD_MCNT_SET( REPLAY, PROGCACHE_GC_ROOT, ctx->progcache_admin->metrics.gc_root_cnt ); - FD_MCNT_SET( REPLAY, ACCDB_CREATED, ctx->accdb->base.created_cnt ); - FD_MCNT_SET( REPLAY, ACCDB_REVERTED, ctx->accdb_admin->base.revert_cnt ); - FD_MCNT_SET( REPLAY, ACCDB_ROOTED, ctx->accdb_admin->base.root_cnt ); - FD_MCNT_SET( REPLAY, ACCDB_ROOTED_BYTES, ctx->accdb_admin->base.root_tot_sz ); - FD_MCNT_SET( REPLAY, ACCDB_GC_ROOT, ctx->accdb_admin->base.gc_root_cnt ); - FD_MCNT_SET( REPLAY, ACCDB_RECLAIMED, ctx->accdb_admin->base.reclaim_cnt ); - FD_MHIST_COPY( REPLAY, ROOT_SLOT_DURATION_SECONDS, ctx->metrics.root_slot_dur ); - FD_MHIST_COPY( REPLAY, ROOT_ACCOUNT_DURATION_SECONDS, ctx->metrics.root_account_dur ); - FD_MCNT_SET( REPLAY, ROOT_ELAPSED_SECONDS_DB, (ulong)ctx->accdb_admin->base.dt_vinyl ); - FD_MCNT_SET( REPLAY, ROOT_ELAPSED_SECONDS_COPY, (ulong)ctx->accdb_admin->base.dt_copy ); - FD_MCNT_SET( REPLAY, ROOT_ELAPSED_SECONDS_GC, (ulong)ctx->accdb_admin->base.dt_gc ); + FD_MCNT_SET( REPLAY, ACCDB_CREATED, ctx->accdb->base.created_cnt ); + FD_MCNT_SET( REPLAY, ACCDB_REVERTED, ctx->accdb_admin->base.revert_cnt ); + + FD_MCNT_SET( REPLAY, ACCDB_LOOKUP_FUNK, ctx->accdb->base.lookup_funk ); + FD_MCNT_SET( REPLAY, ACCDB_LOOKUP_SPECRD, ctx->accdb->base.lookup_specrd ); + FD_MCNT_SET( REPLAY, ACCDB_LOOKUP_ACCDB, ctx->accdb->base.lookup_accdb ); + FD_MCNT_SET( REPLAY, 
ACCDB_DT_FUNK, (ulong)ctx->accdb->base.dt_funk ); + FD_MCNT_SET( REPLAY, ACCDB_DT_SPECRD, (ulong)ctx->accdb->base.dt_specrd ); + FD_MCNT_SET( REPLAY, ACCDB_DT_VINYL, (ulong)ctx->accdb->base.dt_vinyl ); } static inline ulong @@ -1414,6 +1414,8 @@ store_xinsert( fd_store_t * store, } FD_STORE_XLOCK_END; } +static void accdb_advance_root( fd_replay_tile_t * ctx, fd_stem_context_t * stem, ulong slot, ulong bank_idx ); + static void boot_genesis( fd_replay_tile_t * ctx, fd_stem_context_t * stem, @@ -1435,7 +1437,7 @@ boot_genesis( fd_replay_tile_t * ctx, fd_funk_txn_xid_t target_xid = { .ul = { 0UL, 0UL } }; fd_accdb_attach_child( ctx->accdb_admin, &root_xid, &target_xid ); fd_runtime_read_genesis( ctx->banks, bank, ctx->accdb, &xid, NULL, &meta->genesis_hash, &meta->lthash, ctx->genesis, genesis_blob, &ctx->runtime_stack ); - fd_accdb_advance_root( ctx->accdb_admin, &target_xid ); + accdb_advance_root( ctx, stem, target_xid.ul[0], target_xid.ul[1] ); static const fd_txncache_fork_id_t txncache_root = { .val = USHORT_MAX }; bank->data->txncache_fork_id = fd_txncache_attach_child( ctx->txncache, txncache_root ); @@ -2071,9 +2073,10 @@ accdb_root_op_total( fd_replay_tile_t const * ctx ) { } static void -accdb_advance_root( fd_replay_tile_t * ctx, - ulong slot, - ulong bank_idx ) { +accdb_advance_root( fd_replay_tile_t * ctx, + fd_stem_context_t * stem, + ulong slot, + ulong bank_idx ) { fd_funk_txn_xid_t xid = { .ul[0] = slot, .ul[1] = bank_idx }; FD_LOG_DEBUG(( "advancing root to slot=%lu", slot )); @@ -2085,11 +2088,25 @@ accdb_advance_root( fd_replay_tile_t * ctx, fd_histf_sample( ctx->metrics.root_slot_dur, (ulong)root_accounts_dt ); fd_histf_sample( ctx->metrics.root_account_dur, (ulong)root_accounts_dt / (ulong)fd_long_max( rooted_accounts, 1L ) ); + /* Send root request to accdb tile via stem link. + sig carries the slot so the accdb tile can peek at the mcache + to determine write_delay_slots deferral without consuming. 
*/ + if( FD_LIKELY( ctx->accdb_out->idx!=ULONG_MAX ) ) { + fd_funk_txn_xid_t * msg = fd_chunk_to_laddr( ctx->accdb_out->mem, ctx->accdb_out->chunk ); + *msg = xid; + fd_stem_publish( stem, ctx->accdb_out->idx, slot, ctx->accdb_out->chunk, + sizeof(fd_funk_txn_xid_t), 0UL, 0UL, + fd_frag_meta_ts_comp( fd_tickcount() ) ); + ctx->accdb_out->chunk = fd_dcache_compact_next( ctx->accdb_out->chunk, + sizeof(fd_funk_txn_xid_t), ctx->accdb_out->chunk0, ctx->accdb_out->wmark ); + } + fd_progcache_txn_advance_root( ctx->progcache_admin, &xid ); } static int -advance_published_root( fd_replay_tile_t * ctx ) { +advance_published_root( fd_replay_tile_t * ctx, + fd_stem_context_t * stem ) { fd_block_id_ele_t * block_id_ele = fd_block_id_map_ele_query( ctx->block_id_map, &ctx->consensus_root, NULL, ctx->block_id_arr ); if( FD_UNLIKELY( !block_id_ele ) ) { @@ -2121,7 +2138,7 @@ advance_published_root( fd_replay_tile_t * ctx ) { fd_block_id_ele_t * advanceable_root_ele = &ctx->block_id_arr[ advanceable_root_idx ]; ulong advanceable_root_slot = fd_bank_slot_get( bank ); - accdb_advance_root( ctx, advanceable_root_slot, bank->data->idx ); + accdb_advance_root( ctx, stem, advanceable_root_slot, bank->data->idx ); fd_txncache_advance_root( ctx->txncache, bank->data->txncache_fork_id ); fd_sched_advance_root( ctx->sched, advanceable_root_idx ); @@ -2178,7 +2195,7 @@ after_credit( fd_replay_tile_t * ctx, /* If the published_root is not caught up to the consensus root, then we should try to advance the published root. 
*/ - if( FD_UNLIKELY( ctx->consensus_root_bank_idx!=ctx->published_root_bank_idx && advance_published_root( ctx ) ) ) { + if( FD_UNLIKELY( ctx->consensus_root_bank_idx!=ctx->published_root_bank_idx && advance_published_root( ctx, stem ) ) ) { *charge_busy = 1; *opt_poll_in = 0; return; @@ -2819,17 +2836,11 @@ unprivileged_init( fd_topo_t * topo, fd_topo_obj_laddr( topo, funk_obj_id ), fd_topo_obj_laddr( topo, funk_locks_obj_id ) ) ); } else { - fd_topo_obj_t const * vinyl_rq = fd_topo_find_tile_obj( topo, tile, "vinyl_rq" ); - fd_topo_obj_t const * vinyl_req_pool = fd_topo_find_tile_obj( topo, tile, "vinyl_rpool" ); FD_TEST( fd_accdb_admin_v2_init( ctx->accdb_admin, fd_topo_obj_laddr( topo, funk_obj_id ), - fd_topo_obj_laddr( topo, funk_locks_obj_id ), - fd_topo_obj_laddr( topo, vinyl_rq->id ), - topo->workspaces[ vinyl_data->wksp_id ].wksp, - fd_topo_obj_laddr( topo, vinyl_req_pool->id ), - vinyl_rq->id, - max_depth ) ); + fd_topo_obj_laddr( topo, funk_locks_obj_id ) ) ); fd_accdb_admin_v2_delay_set( ctx->accdb_admin, tile->replay.write_delay_slots ); + fd_accdb_admin_v2_max_depth_set( ctx->accdb_admin, max_depth ); } fd_accdb_init_from_topo( ctx->accdb, topo, tile, max_depth ); @@ -2941,9 +2952,10 @@ unprivileged_init( fd_topo_t * topo, else FD_LOG_ERR(( "unexpected input link name %s", link->name )); } - *ctx->epoch_out = out1( topo, tile, "replay_epoch" ); FD_TEST( ctx->epoch_out->idx!=ULONG_MAX ); - *ctx->replay_out = out1( topo, tile, "replay_out" ); FD_TEST( ctx->replay_out->idx!=ULONG_MAX ); - *ctx->exec_out = out1( topo, tile, "replay_execrp" ); FD_TEST( ctx->exec_out->idx!=ULONG_MAX ); + *ctx->epoch_out = out1( topo, tile, "replay_epoch" ); FD_TEST( ctx->epoch_out->idx!=ULONG_MAX ); + *ctx->replay_out = out1( topo, tile, "replay_out" ); FD_TEST( ctx->replay_out->idx!=ULONG_MAX ); + *ctx->exec_out = out1( topo, tile, "replay_execrp" ); FD_TEST( ctx->exec_out->idx!=ULONG_MAX ); + *ctx->accdb_out = out1( topo, tile, "replay_accdb" ); /* idx==ULONG_MAX when 
vinyl disabled */ ctx->gui_enabled = fd_topo_find_tile( topo, "gui", 0UL )!=ULONG_MAX; ctx->rpc_enabled = fd_topo_find_tile( topo, "rpc", 0UL )!=ULONG_MAX; @@ -2989,11 +3001,6 @@ unprivileged_init( fd_topo_t * topo, fd_histf_join( fd_histf_new( ctx->metrics.store_query_work, FD_MHIST_SECONDS_MIN( REPLAY, STORE_QUERY_WORK ), FD_MHIST_SECONDS_MAX( REPLAY, STORE_QUERY_WORK ) ) ); - fd_histf_join( fd_histf_new( ctx->metrics.root_slot_dur, FD_MHIST_SECONDS_MIN( REPLAY, ROOT_SLOT_DURATION_SECONDS ), - FD_MHIST_SECONDS_MAX( REPLAY, ROOT_SLOT_DURATION_SECONDS ) ) ); - fd_histf_join( fd_histf_new( ctx->metrics.root_account_dur, FD_MHIST_SECONDS_MIN( REPLAY, ROOT_ACCOUNT_DURATION_SECONDS ), - FD_MHIST_SECONDS_MAX( REPLAY, ROOT_ACCOUNT_DURATION_SECONDS ) ) ); - /* Ensure precompiles are available, crash fast otherwise */ fd_precompiles(); diff --git a/src/flamenco/accdb/Local.mk b/src/flamenco/accdb/Local.mk index 026aa13971e..6a968332d8d 100644 --- a/src/flamenco/accdb/Local.mk +++ b/src/flamenco/accdb/Local.mk @@ -29,7 +29,6 @@ $(call add-objs,fd_accdb_admin_v1 fd_accdb_impl_v1,fd_flamenco) ifdef FD_HAS_ATOMIC $(call add-hdrs,fd_accdb_admin_v2.h fd_accdb_impl_v2.h) $(call add-objs,fd_accdb_admin_v2,fd_flamenco) -$(call add-objs,fd_accdb_admin_v2_root,fd_flamenco) $(call add-objs,fd_accdb_impl_v2,fd_flamenco) $(call add-hdrs,fd_vinyl_req_pool.h) $(call add-objs,fd_vinyl_req_pool,fd_flamenco) @@ -38,14 +37,8 @@ endif # Debug APIs $(call add-hdrs,fd_accdb_fsck.h) $(call add-objs,fd_accdb_fsck_funk fd_accdb_fsck_vinyl,fd_flamenco) -ifdef FD_HAS_LZ4 -$(call make-bin,fd_accdb_ctl,fd_accdb_ctl,fd_vinyl fd_tango fd_ballet fd_util) -endif ifdef FD_HAS_ATOMIC $(call make-unit-test,test_accdb_v1,test_accdb_v1,fd_flamenco fd_funk fd_ballet fd_util) $(call run-unit-test,test_accdb_v1) -ifdef FD_HAS_LZ4 -$(call make-unit-test,test_accdb_v2,test_accdb_v2,fd_flamenco fd_vinyl fd_funk fd_tango fd_ballet fd_util) -endif endif diff --git a/src/flamenco/accdb/fd_accdb_admin_v2.c 
b/src/flamenco/accdb/fd_accdb_admin_v2.c index 30c20cb07f1..538354cc2e9 100644 --- a/src/flamenco/accdb/fd_accdb_admin_v2.c +++ b/src/flamenco/accdb/fd_accdb_admin_v2.c @@ -1,4 +1,4 @@ -#include "fd_accdb_admin_v2_private.h" +#include "fd_accdb_admin_v2.h" FD_STATIC_ASSERT( alignof(fd_accdb_admin_v2_t)<=alignof(fd_accdb_admin_t), layout ); FD_STATIC_ASSERT( sizeof (fd_accdb_admin_v2_t)<=sizeof(fd_accdb_admin_t), layout ); @@ -6,37 +6,13 @@ FD_STATIC_ASSERT( sizeof (fd_accdb_admin_v2_t)<=sizeof(fd_accdb_admin_t), layou fd_accdb_admin_t * fd_accdb_admin_v2_init( fd_accdb_admin_t * accdb_, void * shfunk, - void * shlocks, - void * vinyl_rq, - void * vinyl_data, - void * vinyl_req_pool, - ulong vinyl_link_id, - ulong max_depth ) { + void * shlocks ) { /* Call superclass constructor */ if( FD_UNLIKELY( !fd_accdb_admin_v1_init( accdb_, shfunk, shlocks ) ) ) { return NULL; } - if( FD_UNLIKELY( !vinyl_data ) ) { - FD_LOG_WARNING(( "NULL vinyl_data" )); - return NULL; - } - - fd_vinyl_rq_t * rq = fd_vinyl_rq_join( vinyl_rq ); - fd_vinyl_req_pool_t * req_pool = fd_vinyl_req_pool_join( vinyl_req_pool ); - if( FD_UNLIKELY( !rq || !req_pool ) ) { - /* component joins log warning if this is reached */ - FD_LOG_WARNING(( "Failed to initialize database client" )); - return NULL; - } fd_accdb_admin_v2_t * accdb = fd_type_pun( accdb_ ); - accdb->root_lineage->max_depth = max_depth; - accdb->vinyl_req_id = 0UL; - accdb->vinyl_rq = rq; - accdb->vinyl_link_id = vinyl_link_id; - accdb->vinyl_data_wksp = vinyl_data; - accdb->vinyl_req_wksp = fd_wksp_containing( req_pool ); - accdb->vinyl_req_pool = req_pool; accdb->base.accdb_type = FD_ACCDB_TYPE_V2; accdb->base.vt = &fd_accdb_admin_v2_vt; return accdb_; @@ -57,8 +33,6 @@ void fd_accdb_admin_v2_fini( fd_accdb_admin_t * admin_ ) { fd_accdb_admin_v2_t * admin = downcast( admin_ ); - fd_vinyl_rq_leave( admin->vinyl_rq ); - /* superclass destructor */ admin->base.accdb_type = FD_ACCDB_TYPE_V1; fd_accdb_admin_v1_fini( admin_ ); @@ -73,11 
+47,58 @@ void fd_accdb_v2_attach_child( fd_accdb_admin_t * admin_, fd_funk_txn_xid_t const * xid_parent, fd_funk_txn_xid_t const * xid_new ) { - fd_accdb_admin_v1_t * db = downcast( admin_ )->v1; + fd_accdb_admin_v2_t * accdb = downcast( admin_ ); + fd_accdb_admin_v1_t * db = accdb->v1; + fd_funk_t * funk = db->funk; + + /* Ensure fork depth stays within limits. This thread is the only + one that appends to the fork graph. Other threads may concurrently + remove from the graph (by advancing root), which can only decrease + the depth. Therefore we can safely spin until there is room. */ + + ulong max_depth = accdb->max_depth; + if( FD_LIKELY( max_depth ) ) { + for(;;) { + /* Compute depth of the new child = 1 (for the child itself) + + number of ancestors from parent to root. */ + + ulong depth = 1UL; + + if( !fd_funk_txn_xid_eq( xid_parent, funk->shmem->last_publish ) ) { + /* Parent is not root -- walk the parent chain */ + + fd_funk_txn_map_query_t query[1]; + int err; + for(;;) { + err = fd_funk_txn_map_query_try( funk->txn_map, xid_parent, NULL, query, 0 ); + if( FD_LIKELY( err!=FD_MAP_ERR_AGAIN ) ) break; + FD_SPIN_PAUSE(); + } + + if( FD_LIKELY( err==FD_MAP_SUCCESS ) ) { + fd_funk_txn_t const * txn = fd_funk_txn_map_query_ele( query ); + depth++; /* count parent */ + + ulong parent_idx = fd_funk_txn_idx( txn->parent_cidx ); + while( !fd_funk_txn_idx_is_null( parent_idx ) ) { + txn = &funk->txn_pool->ele[ parent_idx ]; + depth++; + parent_idx = fd_funk_txn_idx( txn->parent_cidx ); + } + } + /* If err==FD_MAP_ERR_KEY, parent was concurrently rooted. + depth stays at 1, which is always within limits. 
*/ + } + + if( FD_LIKELY( depthul[0], xid_new ->ul[1], xid_parent->ul[0], xid_parent->ul[1] )); - fd_funk_txn_prepare( db->funk, xid_parent, xid_new ); + fd_funk_txn_prepare( funk, xid_parent, xid_new ); } void @@ -86,94 +107,12 @@ fd_accdb_v2_cancel( fd_accdb_admin_t * admin, fd_accdb_v1_cancel( admin, xid ); } -static void -publish_recs( fd_accdb_admin_v2_t * admin, - fd_funk_txn_t * txn ) { - fd_funk_rec_t * rec_pool = admin->v1->funk->rec_pool->ele; - fd_funk_rec_t * head = !fd_funk_rec_idx_is_null( txn->rec_head_idx ) ? - &rec_pool[ txn->rec_head_idx ] : NULL; - txn->rec_head_idx = FD_FUNK_REC_IDX_NULL; - txn->rec_tail_idx = FD_FUNK_REC_IDX_NULL; - while( head ) { - head = fd_accdb_v2_root_batch( admin, head ); - } -} - -static void -txn_unregister( fd_funk_t * funk, - fd_funk_txn_t * txn ) { - ulong child_idx = fd_funk_txn_idx( txn->child_head_cidx ); - while( FD_UNLIKELY( !fd_funk_txn_idx_is_null( child_idx ) ) ) { - funk->txn_pool->ele[ child_idx ].parent_cidx = fd_funk_txn_cidx( FD_FUNK_TXN_IDX_NULL ); - child_idx = fd_funk_txn_idx( funk->txn_pool->ele[ child_idx ].sibling_next_cidx ); - } - - fd_funk_txn_xid_t xid[1]; fd_funk_txn_xid_copy( xid, fd_funk_txn_xid( txn ) ); - fd_funk_txn_map_query_t query[1]; - int remove_err = fd_funk_txn_map_remove( funk->txn_map, xid, NULL, query, 0 ); - if( FD_UNLIKELY( remove_err!=FD_MAP_SUCCESS ) ) { - FD_LOG_CRIT(( "fd_accdb_publish failed: fd_funk_txn_map_remove failed: %i-%s", remove_err, fd_map_strerror( remove_err ) )); - } -} - -static void -txn_free( fd_funk_t * funk, - fd_funk_txn_t * txn ) { - FD_VOLATILE( txn->state ) = FD_FUNK_TXN_STATE_FREE; - txn->parent_cidx = UINT_MAX; - txn->sibling_prev_cidx = UINT_MAX; - txn->sibling_next_cidx = UINT_MAX; - txn->child_head_cidx = UINT_MAX; - txn->child_tail_cidx = UINT_MAX; - fd_funk_txn_pool_release( funk->txn_pool, txn, 1 ); -} - -static void -fd_accdb_txn_publish_one( fd_accdb_admin_v2_t * accdb, - fd_funk_txn_t * txn ) { - fd_funk_t * funk = accdb->v1->funk; - - /* 
Children of transaction are now children of root */ - funk->shmem->child_head_cidx = txn->child_head_cidx; - funk->shmem->child_tail_cidx = txn->child_tail_cidx; - - /* Phase 1: Mark transaction as "last published" */ - - fd_funk_txn_xid_t xid[1]; fd_funk_txn_xid_copy( xid, fd_funk_txn_xid( txn ) ); - if( FD_UNLIKELY( !fd_funk_txn_idx_is_null( fd_funk_txn_idx( txn->parent_cidx ) ) ) ) { - FD_LOG_CRIT(( "fd_accdb_txn_advance_root: parent of txn %lu:%lu is not root", xid->ul[0], xid->ul[1] )); - } - fd_funk_txn_xid_st_atomic( funk->shmem->last_publish, xid ); - FD_LOG_INFO(( "accdb txn laddr=%p xid %lu:%lu: publish", (void *)txn, txn->xid.ul[0], txn->xid.ul[1] )); - - /* Phase 2: Drain users from transaction */ - - ulong txn_idx = (ulong)( txn - funk->txn_pool->ele ); - fd_rwlock_write( &funk->txn_lock[ txn_idx ] ); - FD_VOLATILE( txn->state ) = FD_FUNK_TXN_STATE_PUBLISH; - - /* Phase 3: Move records from funk to vinyl */ - - publish_recs( accdb, txn ); - - /* Phase 4: Unregister transaction */ - - txn_unregister( funk, txn ); - - /* Phase 5: Free transaction object */ - - fd_rwlock_unwrite( &funk->txn_lock[ txn_idx ] ); - txn_free( funk, txn ); -} - void fd_accdb_v2_advance_root( fd_accdb_admin_t * accdb_, fd_funk_txn_xid_t const * xid ) { fd_accdb_admin_v2_t * accdb = downcast( accdb_ ); fd_funk_t * funk = accdb->v1->funk; - fd_accdb_lineage_set_fork( accdb->root_lineage, funk, xid ); - /* Assume no concurrent access to txn_map */ fd_funk_txn_map_query_t query[1]; @@ -190,23 +129,8 @@ fd_accdb_v2_advance_root( fd_accdb_admin_t * accdb_, fd_accdb_txn_cancel_siblings( accdb->v1, txn ); - fd_accdb_lineage_t * lineage = accdb->root_lineage; - fd_funk_txn_xid_t oldest_xid = lineage->fork[ lineage->fork_depth-1UL ]; - if( fd_funk_txn_xid_eq_root( &oldest_xid ) && lineage->fork_depth>1UL ) { - oldest_xid = lineage->fork[ lineage->fork_depth-2UL ]; - } - - ulong delay = xid->ul[0] - oldest_xid.ul[0]; - /* genesis_override is necessary when bootstrapping from genesis, - 
without requiring fd_accdb_admin_v2_delay_set to accept 0. */ - int genesis_override = !xid->ul[0]; - if( delay >= accdb->slot_delay || genesis_override ) { - FD_LOG_INFO(( "accdb xid %lu:%lu: pruning", - oldest_xid.ul[0], oldest_xid.ul[1] )); - fd_funk_txn_t * oldest = &funk->txn_pool->ele[ funk->shmem->child_head_cidx ]; - FD_TEST( fd_funk_txn_xid_eq( &oldest_xid, &oldest->xid ) ); - fd_accdb_txn_publish_one( accdb, oldest ); - } + /* Root message is sent to the accdb tile by the replay tile via + the replay_accdb stem link (see fd_replay_tile.c). */ } void @@ -217,6 +141,13 @@ fd_accdb_admin_v2_delay_set( fd_accdb_admin_t * accdb_, accdb->slot_delay = slot_delay; } +void +fd_accdb_admin_v2_max_depth_set( fd_accdb_admin_t * accdb_, + ulong max_depth ) { + fd_accdb_admin_v2_t * accdb = downcast( accdb_ ); + accdb->max_depth = max_depth; +} + fd_accdb_admin_vt_t const fd_accdb_admin_v2_vt = { .fini = fd_accdb_admin_v2_fini, .root_get = fd_accdb_v2_root_get, diff --git a/src/flamenco/accdb/fd_accdb_admin_v2.h b/src/flamenco/accdb/fd_accdb_admin_v2.h index f211afe020c..5da2e3deec6 100644 --- a/src/flamenco/accdb/fd_accdb_admin_v2.h +++ b/src/flamenco/accdb/fd_accdb_admin_v2.h @@ -5,6 +5,25 @@ account database. */ #include "fd_accdb_admin.h" +#include "fd_accdb_admin_v1.h" +#include "../../tango/fd_tango_base.h" + +struct fd_accdb_admin_v2 { + union { + fd_accdb_admin_base_t base; + fd_accdb_admin_v1_t v1[1]; + }; + + ulong slot_delay; + ulong max_depth; /* Max fork depth (distance from any tip to root). + attach_child spins if adding a child would exceed this. 
*/ + + fd_frag_meta_t * mcache; + ulong depth; + ulong seq; +}; + +typedef struct fd_accdb_admin_v2 fd_accdb_admin_v2_t; FD_PROTOTYPES_BEGIN @@ -13,17 +32,16 @@ extern fd_accdb_admin_vt_t const fd_accdb_admin_v2_vt; fd_accdb_admin_t * fd_accdb_admin_v2_init( fd_accdb_admin_t * admin_, void * shfunk, - void * shlocks, - void * vinyl_rq, - void * vinyl_data, - void * vinyl_req_pool, - ulong vinyl_link_id, - ulong max_depth ); + void * shlocks ); void fd_accdb_admin_v2_delay_set( fd_accdb_admin_t * admin, ulong slot_delay ); +void +fd_accdb_admin_v2_max_depth_set( fd_accdb_admin_t * admin, + ulong max_depth ); + void fd_accdb_admin_v2_fini( fd_accdb_admin_t * ljoin ); diff --git a/src/flamenco/accdb/fd_accdb_admin_v2_private.h b/src/flamenco/accdb/fd_accdb_admin_v2_private.h deleted file mode 100644 index 6151000589d..00000000000 --- a/src/flamenco/accdb/fd_accdb_admin_v2_private.h +++ /dev/null @@ -1,62 +0,0 @@ -#ifndef HEADER_fd_src_flamenco_accdb_fd_accdb_admin_v2_private_h -#define HEADER_fd_src_flamenco_accdb_fd_accdb_admin_v2_private_h - -#include "fd_accdb_admin_v2.h" -#include "fd_accdb_admin_v1.h" -#include "fd_vinyl_req_pool.h" - -/* FD_ACCDB_ROOT_BATCH_MAX controls how many accounts to write in - batches to the vinyl DB server. */ - -#define FD_ACCDB_ROOT_BATCH_MAX (128UL) - -struct fd_accdb_admin_v2 { - union { - fd_accdb_admin_base_t base; - fd_accdb_admin_v1_t v1[1]; - }; - - fd_accdb_lineage_t root_lineage[1]; - ulong slot_delay; - - /* Vinyl client */ - ulong vinyl_req_id; - fd_vinyl_rq_t * vinyl_rq; - ulong vinyl_link_id; - fd_wksp_t * vinyl_data_wksp; - fd_wksp_t * vinyl_req_wksp; - fd_vinyl_req_pool_t * vinyl_req_pool; -}; - -typedef struct fd_accdb_admin_v2 fd_accdb_admin_v2_t; - -FD_PROTOTYPES_BEGIN - -/* fd_accdb_v2_root_batch "roots" a batch of funk accounts. - - rec0 is the head of the batch linked list to root (NULL is fine). - Up to FD_ACCDB_ROOT_BATCH_MAX records starting at rec0 are migrated - to vinyl. This frees rec0 and subsequent items. 
Returns the next - record in the linked list that is not yet rooted. - - It is assumed that the rec0 linked list is not owned by a funk_txn at - this point. (The funk_txn that used to own rec0 has child_head and - child_tail set to sentinel.) - - Each record is considered as follows: - - If another newer revision of this record exists that was already - marked as rooted, this record is thrown away. - - Otherwise, the record is moved to vinyl. - - The move to vinyl is done in a thread-safe manner (writes to vinyl - first, then once the write is globally visible, removes from funk). - - Updates the following metrics: root_cnt, reclaim_cnt. */ - -fd_funk_rec_t * -fd_accdb_v2_root_batch( fd_accdb_admin_v2_t * admin, - fd_funk_rec_t * rec0 ); - -FD_PROTOTYPES_END - -#endif /* HEADER_fd_src_flamenco_accdb_fd_accdb_admin_v2_private_h */ diff --git a/src/flamenco/accdb/fd_accdb_admin_v2_root.c b/src/flamenco/accdb/fd_accdb_admin_v2_root.c deleted file mode 100644 index 0430d025446..00000000000 --- a/src/flamenco/accdb/fd_accdb_admin_v2_root.c +++ /dev/null @@ -1,379 +0,0 @@ -#include "fd_accdb_admin_v2_private.h" -#include "../fd_flamenco_base.h" -#include "../runtime/fd_runtime_const.h" /* FD_RUNTIME_ACC_SZ_MAX */ -#include "../../vinyl/data/fd_vinyl_data.h" - -/*********************************************************************** - - fd_accdb_admin_v2_root.c contains the account rooting algorithm. - - This algorithm is designed to amortize vinyl I/O latency by - processing accounts in batches. 
- - For each batch of accounts, it does the following logic: - - - ACQUIRE batch request for account updates - - ERASE batch request for account deletions - - Spin wait for ACQUIRE completion - - Copy back modified accounts - - RELEASE batch request for account updates - - Spin wait for ACQUIRE, ERASE completions - - Free records from funk - -***********************************************************************/ - -/* vinyl_spin_wait waits for completion of a vinyl request and asserts - that all requests completed successfully. */ - -static void -vinyl_spin_wait( fd_vinyl_comp_t const * comp, - fd_vinyl_key_t const * key0, - schar const * err0, - ulong cnt, - char const * req_type_cstr ) { - - /* FIXME use a load-acquire here, such that later loads are ordered - past this load */ - while( FD_VOLATILE_CONST( comp->seq )!=1UL ) FD_SPIN_PAUSE(); - FD_COMPILER_MFENCE(); - int comp_err = FD_VOLATILE_CONST( comp->err ); - if( FD_UNLIKELY( comp_err!=FD_VINYL_SUCCESS ) ) { - FD_LOG_CRIT(( "vinyl tile rejected my %s request (%i-%s)", - req_type_cstr, comp_err, fd_vinyl_strerror( comp_err ) )); - } - - for( ulong i=0UL; irec_pool->ele ); - ulong volatile * vl = &funk->rec_lock[ rec_idx ]; - for(;;) { - ulong const ver_lock = FD_VOLATILE_CONST( *vl ); - ulong const ver = fd_funk_rec_ver_bits ( ver_lock ); - ulong const lock = fd_funk_rec_lock_bits( ver_lock ); - if( FD_UNLIKELY( lock ) ) { - /* Spin while there are active readers */ - /* FIXME kill client after spinning for 30 seconds to prevent silent deadlock */ - FD_SPIN_PAUSE(); - continue; - } - ulong const new_ver = fd_funk_rec_ver_inc( ver ); - ulong const new_vl = fd_funk_rec_ver_lock( new_ver, FD_FUNK_REC_LOCK_MASK ); - if( FD_UNLIKELY( FD_ATOMIC_CAS( vl, ver_lock, new_vl )!=ver_lock ) ) { - FD_SPIN_PAUSE(); - continue; - } - return new_vl; - } -} - -static void -fd_funk_rec_admin_unlock( fd_funk_t const * funk, - fd_funk_rec_t * rec, - ulong ver_lock ) { - ulong rec_idx = (ulong)( rec - funk->rec_pool->ele ); - 
ulong volatile * vl = &funk->rec_lock[ rec_idx ]; - FD_VOLATILE( *vl ) = fd_funk_rec_ver_lock( fd_funk_rec_ver_bits( ver_lock ), 0UL ); -} - -static void -funk_free_rec( fd_funk_t * funk, - fd_funk_rec_t * rec ) { - /* Acquire admin lock (kick out readers) - - Note: At this point, well-behaving external readers will abandon a - read-lock attempt if they observe this active write lock. (An - admin lock always implies the record is about to die) */ - - FD_COMPILER_MFENCE(); - ulong ver_lock = fd_funk_rec_admin_lock( funk, rec ); - - /* Free record */ - - memset( &rec->pair, 0, sizeof(fd_funk_xid_key_pair_t) ); - FD_COMPILER_MFENCE(); - rec->map_next = FD_FUNK_REC_IDX_NULL; - fd_funk_val_flush( rec, funk->alloc, funk->wksp ); - fd_funk_rec_admin_unlock( funk, rec, ver_lock ); - fd_funk_rec_pool_release( funk->rec_pool, rec, 1 ); -} - -/* funk_gc_chain optimistically deletes all but the newest rooted - revisions of rec. This possibly deletes 'rec'. Returns rec if rec - is the only known rooted revision, otherwise returns NULL (if rec was - deleted). Note that due to edge cases, revisions that are not in the - oldest tracked slot, may not reliably get cleaned up. (The oldest - tracked slot always gets cleaned up, though.) 
*/ - -static fd_funk_rec_t * -funk_gc_chain( fd_accdb_admin_v2_t * const admin, - fd_funk_rec_t * const rec ) { - - fd_accdb_lineage_t * lineage = admin->root_lineage; - fd_funk_t * funk = admin->v1->funk; - fd_funk_rec_t * rec_pool = funk->rec_pool->ele; - ulong rec_max = funk->rec_pool->ele_max; - ulong seed = funk->rec_map->map->seed; - ulong chain_cnt = funk->rec_map->map->chain_cnt; - ulong root_slot = lineage->fork[0].ul[0]; - - ulong hash = fd_funk_rec_map_key_hash( &rec->pair, seed ); - ulong chain_idx = (hash & (chain_cnt-1UL) ); - - /* Lock rec_map chain */ - - int lock_err = fd_funk_rec_map_iter_lock( funk->rec_map, &chain_idx, 1UL, FD_MAP_FLAG_BLOCKING ); - if( FD_UNLIKELY( lock_err!=FD_MAP_SUCCESS ) ) { - FD_LOG_CRIT(( "fd_funk_rec_map_iter_lock failed (%i-%s)", lock_err, fd_map_strerror( lock_err ) )); - } - - fd_funk_rec_map_shmem_private_chain_t * chain = - fd_funk_rec_map_shmem_private_chain( funk->rec_map->map, 0UL ) + chain_idx; - ulong ver = - fd_funk_rec_map_private_vcnt_ver( FD_VOLATILE_CONST( chain->ver_cnt ) ); - FD_CRIT( ver&1UL, "chain is not locked" ); - - /* Walk map chain */ - - fd_funk_rec_t * found_rec = NULL; - uint * pnext = &chain->head_cidx; - uint cur = *pnext; - ulong chain_len = 0UL; - ulong iter = 0UL; - while( cur!=FD_FUNK_REC_IDX_NULL ) { - if( FD_UNLIKELY( iter++ > rec_max ) ) FD_LOG_CRIT(( "cycle detected in rec_map chain %lu", chain_idx )); - - /* Is this node garbage? 
*/ - - fd_funk_rec_t * node = &funk->rec_pool->ele[ cur ]; - if( FD_UNLIKELY( cur==node->map_next ) ) FD_LOG_CRIT(( "accdb corruption detected: cycle in rec_map chain %lu", chain_idx )); - cur = node->map_next; - if( !fd_funk_rec_key_eq( rec->pair.key, node->pair.key ) ) goto retain; - if( node->pair.xid->ul[0]>root_slot ) goto retain; - if( !found_rec ) { - found_rec = node; - goto retain; - } - - /* No longer need this node */ - - if( node->pair.xid->ul[0] > rec->pair.xid->ul[0] ) { - /* If this node is newer than the to-be-deleted slot, need to - remove it from the transaction's record list. */ - uint neigh_prev = node->prev_idx; - uint neigh_next = node->next_idx; - if( neigh_prev==FD_FUNK_REC_IDX_NULL || - neigh_next==FD_FUNK_REC_IDX_NULL ) { - /* Node is first or last of transaction -- too bothersome to - remove it from the transaction's record list */ - goto retain; - } - rec_pool[ neigh_next ].prev_idx = neigh_prev; - rec_pool[ neigh_prev ].next_idx = neigh_next; - } - - /* Destroy this node */ - - funk_free_rec( funk, node ); - *pnext = cur; - continue; - - retain: - pnext = &node->map_next; - chain_len++; - } - - /* Unlock rec_map chain */ - - FD_COMPILER_MFENCE(); - FD_VOLATILE( chain->ver_cnt ) = - fd_funk_rec_map_private_vcnt( ver+1UL, chain_len ); - FD_COMPILER_MFENCE(); - return found_rec==rec ? 
found_rec : NULL; -} - -/* Main algorithm */ - -fd_funk_rec_t * -fd_accdb_v2_root_batch( fd_accdb_admin_v2_t * admin, - fd_funk_rec_t * rec0 ) { - long t_start = fd_tickcount(); - - fd_funk_t * funk = admin->v1->funk; /* unrooted DB */ - fd_wksp_t * funk_wksp = funk->wksp; /* shm workspace containing unrooted accounts */ - fd_funk_rec_t * rec_pool = funk->rec_pool->ele; /* funk rec arena */ - fd_vinyl_rq_t * rq = admin->vinyl_rq; /* "request queue "*/ - fd_vinyl_req_pool_t * req_pool = admin->vinyl_req_pool; /* "request pool" */ - fd_wksp_t * req_wksp = admin->vinyl_req_wksp; /* shm workspace containing request buffer */ - fd_wksp_t * data_wksp = admin->vinyl_data_wksp; /* shm workspace containing vinyl data cache */ - ulong link_id = admin->vinyl_link_id; /* vinyl client ID */ - - /* Collect funk request batch */ - - fd_funk_rec_t * recs[ FD_ACCDB_ROOT_BATCH_MAX ]; - ulong rec_cnt; - - fd_funk_rec_t * next = rec0; - for( rec_cnt=0UL; next && rec_cntnext_idx ) ) { - next = NULL; - } else { - next = &rec_pool[ cur->next_idx ]; - } - cur->prev_idx = FD_FUNK_REC_IDX_NULL; - cur->next_idx = FD_FUNK_REC_IDX_NULL; - - if( funk_gc_chain( admin, cur ) ) { - recs[ rec_cnt++ ] = cur; - } - } - - /* Partition batch into ACQUIRE (updates) and ERASE (deletions) */ - - ulong acq_cnt = 0UL; - ulong del_cnt; - for( ulong i=0UL; ival_sz>=sizeof(fd_account_meta_t), "corrupt funk_rec" ); - if( meta->lamports ) { - fd_funk_rec_t * tmp = recs[ i ]; - recs[ i ] = recs[ acq_cnt ]; - recs[ acq_cnt ] = tmp; - acq_cnt++; - } - } - del_cnt = rec_cnt - acq_cnt; - - /* Create ACQUIRE and ERASE batch requests */ - - ulong del_batch = fd_vinyl_req_pool_acquire( req_pool ); /* ERASE */ - ulong acq_batch = fd_vinyl_req_pool_acquire( req_pool ); /* ACQUIRE */ - fd_vinyl_key_t * acq_key0 = fd_vinyl_req_batch_key( req_pool, acq_batch ); - fd_vinyl_key_t * del_key0 = fd_vinyl_req_batch_key( req_pool, del_batch ); - - for( ulong i=0UL; ipair.key, 32UL ); - } - for( ulong i=0UL; ipair.key, 32UL ); - } - 
- /* Send off ACQUIRE and ERASE requests */ - - fd_vinyl_comp_t * acq_comp = fd_vinyl_req_batch_comp ( req_pool, acq_batch ); - fd_vinyl_comp_t * del_comp = fd_vinyl_req_batch_comp ( req_pool, del_batch ); - schar * acq_err0 = fd_vinyl_req_batch_err ( req_pool, acq_batch ); - schar * del_err0 = fd_vinyl_req_batch_err ( req_pool, del_batch ); - ulong * acq_val_gaddr0 = fd_vinyl_req_batch_val_gaddr( req_pool, acq_batch ); - - memset( acq_comp, 0, sizeof(fd_vinyl_comp_t) ); - memset( del_comp, 0, sizeof(fd_vinyl_comp_t) ); - for( ulong i=0UL; idlen; - FD_CRIT( data_sz<=FD_RUNTIME_ACC_SZ_MAX, "oversize account record" ); - - ulong val_sz = sizeof(fd_account_meta_t) + data_sz; - acq_val_gaddr0[ i ] = val_sz; - admin->base.root_tot_sz += val_sz; - } - - fd_vinyl_req_send_batch( - rq, req_pool, req_wksp, - admin->vinyl_req_id++, link_id, - FD_VINYL_REQ_TYPE_ACQUIRE, - FD_VINYL_REQ_FLAG_MODIFY | - FD_VINYL_REQ_FLAG_IGNORE | - FD_VINYL_REQ_FLAG_CREATE, - acq_batch, acq_cnt - ); - fd_vinyl_req_send_batch( - rq, req_pool, req_wksp, - admin->vinyl_req_id++, link_id, - FD_VINYL_REQ_TYPE_ERASE, - 0UL, - del_batch, del_cnt - ); - - /* Spin for ACQUIRE completion */ - - vinyl_spin_wait( acq_comp, acq_key0, acq_err0, acq_cnt, "ACQUIRE" ); - long t_acquire = fd_tickcount(); - - /* Copy back modified accounts */ - - for( ulong i=0UL; idlen; - ulong val_sz = sizeof(fd_account_meta_t) + data_sz; - FD_CRIT( data_sz<=FD_RUNTIME_ACC_SZ_MAX, "oversize account record" ); - - fd_account_meta_t * dst_meta = fd_wksp_laddr_fast( data_wksp, acq_val_gaddr0[ i ] ); - fd_vinyl_info_t * val_info = fd_vinyl_data_info( dst_meta ); - - fd_memcpy( dst_meta, src_meta, val_sz ); - val_info->val_sz = (uint)val_sz; - } - - /* Send off RELEASE batch request (reuse acq_batch) */ - - memset( acq_comp, 0, sizeof(fd_vinyl_comp_t) ); - for( ulong i=0UL; ivinyl_req_id++, link_id, - FD_VINYL_REQ_TYPE_RELEASE, - FD_VINYL_REQ_FLAG_MODIFY, - acq_batch, acq_cnt - ); - long t_copy = fd_tickcount(); - - /* Spin for 
ERASE, RELEASE completions */ - - vinyl_spin_wait( del_comp, del_key0, del_err0, del_cnt, "ERASE" ); - fd_vinyl_req_pool_release( req_pool, del_batch ); - - vinyl_spin_wait( acq_comp, acq_key0, acq_err0, acq_cnt, "RELEASE" ); - fd_vinyl_req_pool_release( req_pool, acq_batch ); - long t_release = fd_tickcount(); - - /* Remove funk records */ - - for( ulong i=0UL; ipair; - fd_funk_rec_query_t query[1]; - int rm_err = fd_funk_rec_map_remove( funk->rec_map, &pair, NULL, query, FD_MAP_FLAG_BLOCKING ); - if( FD_UNLIKELY( rm_err!=FD_MAP_SUCCESS ) ) FD_LOG_CRIT(( "fd_funk_rec_map_remove failed (%i-%s)", rm_err, fd_map_strerror( rm_err ) )); - funk_free_rec( funk, recs[ i ] ); - } - long t_gc = fd_tickcount(); - - /* Update metrics */ - - admin->base.root_cnt += (uint)acq_cnt; - admin->base.reclaim_cnt += (uint)del_cnt; - admin->base.dt_vinyl += ( t_acquire - t_start ) + ( t_release - t_copy ); - admin->base.dt_copy += ( t_copy - t_acquire ); - admin->base.dt_gc += ( t_gc - t_release ); - - return next; -} diff --git a/src/flamenco/accdb/fd_accdb_base.h b/src/flamenco/accdb/fd_accdb_base.h index a4c44a81deb..6bbebcf645f 100644 --- a/src/flamenco/accdb/fd_accdb_base.h +++ b/src/flamenco/accdb/fd_accdb_base.h @@ -13,6 +13,7 @@ typedef struct fd_accdb_user fd_accdb_user_t; #define FD_ACCDB_TYPE_V0 (80U) /* minimal single chain */ #define FD_ACCDB_TYPE_V1 (1U) /* funk */ #define FD_ACCDB_TYPE_V2 (2U) /* read-only vinyl + read-write funk */ +#define FD_ACCDB_TYPE_V2S (3U) /* vinyl speculative read (pinned) */ #define FD_ACCDB_REF_INVAL 0 /* not a valid reference */ #define FD_ACCDB_REF_RO 1 /* read only */ diff --git a/src/flamenco/accdb/fd_accdb_ctl.c b/src/flamenco/accdb/fd_accdb_ctl.c deleted file mode 100644 index dc5deb35477..00000000000 --- a/src/flamenco/accdb/fd_accdb_ctl.c +++ /dev/null @@ -1,771 +0,0 @@ -/* fd_accdb_ctl.c is a command-line debugging tool for interacting with - a Firedancer account database. 
*/ - -#include "../../vinyl/fd_vinyl.h" -#include "../../flamenco/fd_flamenco_base.h" -#include "../../ballet/base58/fd_base58.h" -#include "../../util/cstr/fd_cstr.h" -#include "../../util/pod/fd_pod.h" -#include -#include /* offsetof */ -#include - -/* req_info contains various request metadata R/W mapped into the vinyl - tile. */ - -struct req_info { - fd_vinyl_key_t key[1]; - ulong val_gaddr[1]; - schar err[1]; - fd_vinyl_comp_t comp[1]; -}; - -typedef struct req_info req_info_t; - -/* The client class contains local handles to client-related vinyl - objects. */ - -struct client { - fd_vinyl_rq_t * rq; - fd_vinyl_cq_t * cq; - ulong req_id; - ulong link_id; - - fd_vinyl_meta_t * meta; - - req_info_t * req_info; - ulong req_info_gaddr; - fd_wksp_t * val_wksp; - fd_wksp_t * client_wksp; - - /* Vinyl client status */ - ulong quota_rem; - ulong cq_seq; -}; - -typedef struct client client_t; - -static char const bin2hex[ 16 ] = { '0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f' }; - -static void -hexdump( uchar const * data, - uint sz ) { - ulong sz_align = fd_ulong_align_dn( sz, 16UL ); - uint i; - for( i=0U; i>4 ] ); - p = fd_cstr_append_char( p, bin2hex[ data[ i+j ]&15 ] ); - p = fd_cstr_append_char( p, ' ' ); - } - p = fd_cstr_append_char( p, ' ' ); - for( ulong j=0UL; j<16UL; j++ ) { - int c = data[ i+j ]; - p = fd_cstr_append_char( p, fd_char_if( fd_isalnum( c ) | fd_ispunct( c ) | (c==' '), (char)c, '.' 
) ); - } - p = fd_cstr_append_char( p, '\n' ); - ulong len = (ulong)( p-line ); - fd_cstr_fini( p ); - fwrite( line, 1UL, len, stdout ); - } - if( sz ) { - char line[ 80 ]; - char * p = fd_cstr_init( line ); - p = fd_cstr_append_uint_as_hex( p, '0', i, 7UL ); - p = fd_cstr_append_text( p, ": ", 3UL ); - for( ; i>4 ] ); - p = fd_cstr_append_char( p, bin2hex[ data[ i ]&15 ] ); - p = fd_cstr_append_char( p, ' ' ); - } - p = fd_cstr_append_char( p, '\n' ); - ulong len = (ulong)( p-line ); - fd_cstr_fini( p ); - fwrite( line, 1UL, len, stdout ); - } - fflush( stdout ); -} - -static void -client_query( client_t * client, - char ** arg, - ulong arg_cnt ) { - req_info_t * req_info = client->req_info; - if( FD_UNLIKELY( arg_cnt!=1UL ) ) { - puts( "ERR(query): invalid query command, usage is \"query \"" ); - return; - } - char const * acc_addr_b58 = arg[0]; - fd_vinyl_key_t * acc_key = req_info->key; - if( FD_UNLIKELY( !fd_base58_decode_32( acc_addr_b58, acc_key->uc ) ) ) { - puts( "ERR(query): invalid account address" ); - return; - } - - /* Send an acquire request */ - - req_info->comp->seq = 0UL; - req_info->val_gaddr[0] = FD_VINYL_VAL_MAX; - fd_vinyl_rq_send( - client->rq, - client->req_id++, - client->link_id, - FD_VINYL_REQ_TYPE_ACQUIRE, /* type */ - 0UL, /* flags */ - 1UL, - /* key_gaddr */ client->req_info_gaddr + offsetof( req_info_t, key ), - /* val_gaddr_gaddr */ client->req_info_gaddr + offsetof( req_info_t, val_gaddr ), - /* err_gaddr */ client->req_info_gaddr + offsetof( req_info_t, err ), - /* comp_gaddr */ client->req_info_gaddr + offsetof( req_info_t, comp ) - ); - - /* Poll direct completion for acquire (not via CQ) */ - - fd_vinyl_comp_t * comp = req_info->comp; - while( FD_VOLATILE_CONST( comp->seq )!=1UL ) FD_SPIN_PAUSE(); - int acquire_err = req_info->err[0]; - if( acquire_err==FD_VINYL_SUCCESS ) { - fd_account_meta_t const * val = fd_wksp_laddr_fast( client->val_wksp, req_info->val_gaddr[0] ); - void const * data = (void const *)( val+1 ); - - 
FD_BASE58_ENCODE_32_BYTES( val->owner, owner_b58 ); - printf( - "\n" - "Public Key: %s\n" - "Balance: %lu.%lu SOL\n" - "Owner: %s\n" - "Executable: %s\n" - "Length: %u (0x%x) bytes\n", - acc_addr_b58, - val->lamports / 1000000000UL, - val->lamports % 1000000000UL, - owner_b58, - val->executable ? "true" : "false", - val->dlen, - val->dlen - ); - hexdump( data, val->dlen ); - - /* Send a release request */ - - req_info->comp->seq = 0UL; - req_info->val_gaddr[0] = FD_VINYL_VAL_MAX; - fd_vinyl_rq_send( - client->rq, - client->req_id++, - client->link_id, - FD_VINYL_REQ_TYPE_RELEASE, /* type */ - 0UL, /* flags */ - 1UL, - 0UL, - /* val_gaddr_gaddr */ client->req_info_gaddr + offsetof( req_info_t, val_gaddr ), - /* err_gaddr */ client->req_info_gaddr + offsetof( req_info_t, err ), - /* comp_gaddr */ client->req_info_gaddr + offsetof( req_info_t, comp ) - ); - - /* Poll direct completion for release (not via CQ) */ - - while( FD_VOLATILE_CONST( comp->seq )!=1UL ) FD_SPIN_PAUSE(); - FD_TEST( req_info->err[0]==FD_VINYL_SUCCESS ); - - puts( "" ); - } else if( acquire_err==FD_VINYL_ERR_KEY ) { - printf( - "\n" - "Public Key: %s\n" - "Account does not exist\n" - "\n", - acc_addr_b58 - ); - } else { - FD_LOG_ERR(( "Vinyl acquire request failed (err %i-%s)", acquire_err, fd_vinyl_strerror( acquire_err ) )); - } -} - -typedef struct batch_req batch_req_t; -struct batch_req { - batch_req_t * prev; - batch_req_t * next; - - ulong key_off; - ulong err_off; - ulong val_gaddr_off; - - ulong req_id; -}; - -static ulong -batch_req_align( void ) { - return fd_ulong_max( alignof(batch_req_t), alignof(fd_vinyl_key_t) ); -} - -static ulong -batch_req_footprint( ulong depth ) { - ulong l = FD_LAYOUT_INIT; - l = FD_LAYOUT_APPEND( l, alignof(batch_req_t), sizeof(batch_req_t) ); - l = FD_LAYOUT_APPEND( l, alignof(fd_vinyl_key_t), depth*sizeof(fd_vinyl_key_t) ); - l = FD_LAYOUT_APPEND( l, alignof(schar), depth*sizeof(schar) ); - l = FD_LAYOUT_APPEND( l, alignof(ulong), depth*sizeof(ulong) ); - 
return FD_LAYOUT_FINI( l, batch_req_align() ); -} - -static batch_req_t * -batch_req_new( void * mem, - ulong depth ) { - FD_SCRATCH_ALLOC_INIT( l, mem ); - batch_req_t * req = FD_SCRATCH_ALLOC_APPEND( l, alignof(batch_req_t), sizeof(batch_req_t) ); - fd_vinyl_key_t * key = FD_SCRATCH_ALLOC_APPEND( l, alignof(fd_vinyl_key_t), depth*sizeof(fd_vinyl_key_t) ); - schar * err = FD_SCRATCH_ALLOC_APPEND( l, alignof(schar), depth*sizeof(schar) ); - ulong * val_gaddr = FD_SCRATCH_ALLOC_APPEND( l, alignof(ulong), depth*sizeof(ulong) ); - FD_SCRATCH_ALLOC_FINI( l, batch_req_align() ); - - *req = (batch_req_t) { - .prev = NULL, - .next = NULL, - - .key_off = (ulong)key - (ulong)mem, - .err_off = (ulong)err - (ulong)mem, - .val_gaddr_off = (ulong)val_gaddr - (ulong)mem - }; - return req; -} - -static inline fd_vinyl_key_t * -batch_req_key( batch_req_t * req ) { - return (fd_vinyl_key_t *)( (ulong)req + req->key_off ); -} - -static inline schar * -batch_req_err( batch_req_t * req ) { - return (schar *)( (ulong)req + req->err_off ); -} - -static inline ulong * -batch_req_val_gaddr( batch_req_t * req ) { - return (ulong *)( (ulong)req + req->val_gaddr_off ); -} - -struct bench_query_rand { - batch_req_t * req_free; /* free entries */ - batch_req_t * req_wait_lo; /* list of entries awaiting completion */ - batch_req_t * req_wait_hi; - ulong batch_depth; - - ulong iter_rem; - fd_vinyl_key_t * sample; - ulong sample_idx; - ulong sample_max; - - ulong found_cnt; - ulong miss_cnt; -}; -typedef struct bench_query_rand bench_query_rand_t; - -/* bqr_free_push adds a wait queue entry to the free stack. */ - -static void -bqr_free_push( bench_query_rand_t * bqr, - batch_req_t * req ) { - req->prev = NULL; - req->next = bqr->req_free; - if( bqr->req_free ) bqr->req_free->prev = req; - bqr->req_free = req; -} - -/* bqr_free_pop removes a wait queue entry from the free stack (alloc). 
*/ - -static batch_req_t * -bqr_free_pop( bench_query_rand_t * bqr ) { - batch_req_t * req = bqr->req_free; - bqr->req_free = req->next; - if( bqr->req_free ) bqr->req_free->prev = NULL; - req->prev = req->next = NULL; - return req; -} - -/* bqr_wait_push adds a new wait queue entry. */ - -static void -bqr_wait_push( bench_query_rand_t * bqr, - batch_req_t * req ) { - req->prev = bqr->req_wait_hi; - req->next = NULL; - if( bqr->req_wait_hi ) bqr->req_wait_hi->next = req; - bqr->req_wait_hi = req; - if( !bqr->req_wait_lo ) bqr->req_wait_lo = req; -} - -/* bqr_wait_pop removes the oldest wait queue entry. */ - -static batch_req_t * -bqr_wait_pop( bench_query_rand_t * bqr ) { - batch_req_t * req = bqr->req_wait_lo; - bqr->req_wait_lo = req->next; - req->prev = req->next = NULL; - if( bqr->req_wait_lo ) bqr->req_wait_lo->prev = NULL; - else bqr->req_wait_hi = NULL; - return req; -} - -/* bqr_req_release sends a batch RELEASE request for a batch of values. - Completions arriving for RELEASE will replenish quota. */ - -static void -bqr_req_release( client_t * client, - bench_query_rand_t * bqr, - batch_req_t * req, - uint cnt ) { - FD_CRIT( !req->prev && !req->next, "attempt to release a request that is already free or still pending" ); - - schar * err = batch_req_err( req ); - for( uint i=0U; ireq_id++, 63 ); - ulong link_id = client->link_id; - int type = FD_VINYL_REQ_TYPE_RELEASE; - ulong flags = 0UL; - ulong batch_cnt = (ulong)cnt; - ulong val_gaddr_gaddr = fd_wksp_gaddr_fast( client->client_wksp, batch_req_val_gaddr( req ) ); - ulong err_gaddr = fd_wksp_gaddr_fast( client->client_wksp, err ); - fd_vinyl_rq_send( client->rq, req_id, link_id, type, flags, batch_cnt, 0UL, val_gaddr_gaddr, err_gaddr, 0UL ); - - req->req_id = req_id; - bqr_wait_push( bqr, req ); -} - -/* bqr_handle_cq handles an ACQUIRE or RELEASE completion. 
*/ - -static void -bqr_handle_cq( client_t * client, - bench_query_rand_t * bqr, - fd_vinyl_comp_t * comp ) { - FD_CRIT( bqr->req_wait_lo, "received completion even though no request is pending" ); - batch_req_t * req = bqr_wait_pop( bqr ); - FD_CRIT( req->req_id==comp->req_id, "received completion for unexpected req_id" ); - FD_CRIT( comp->batch_cnt<=bqr->batch_depth, "corrupt comp->batch_cnt" ); - - /* The high bit of the request ID indicates whether this was an - ACQUIRE or RELEASE request. */ - int const is_release = fd_ulong_extract_bit( comp->req_id, 63 ); - - fd_vinyl_key_t * key = batch_req_key( req ); - ulong * val_gaddr = batch_req_val_gaddr( req ); - schar * err = batch_req_err( req ); - - if( !is_release ) { - - uint j=0U; - for( uint i=0U; ibatch_cnt; i++ ) { - int e = err[ i ]; - if( FD_UNLIKELY( e!=FD_VINYL_SUCCESS && e!=FD_VINYL_ERR_KEY ) ) { - FD_LOG_CRIT(( "Unexpected vinyl error %i-%s", e, fd_vinyl_strerror( e ) )); - } - if( e==FD_VINYL_SUCCESS ) { - bqr->found_cnt++; - key [ j ] = key[ i ]; - val_gaddr[ j ] = val_gaddr[ i ]; - j++; - } else { - bqr->miss_cnt++; - client->quota_rem++; - } - } - - if( j ) bqr_req_release( client, bqr, req, j ); - else bqr_free_push( bqr, req ); - - } else { - - schar * err = batch_req_err( req ); - uint cnt = comp->batch_cnt; - for( uint i=0U; iquota_rem += comp->batch_cnt; - bqr_free_push( bqr, req ); - - } - -} - -/* bqr_drain_cq drains all completion queue entries. */ - -static void -bqr_drain_cq( client_t * client, - bench_query_rand_t * bqr ) { - for(;;) { - fd_vinyl_comp_t comp[1]; - long cq_err = fd_vinyl_cq_recv( client->cq, client->cq_seq, comp ); - if( FD_UNLIKELY( cq_err<0 ) ) { - FD_LOG_CRIT(( "Vinyl completion queue overrun detected" )); - } - if( cq_err>0 ) break; - bqr_handle_cq( client, bqr, comp ); - client->cq_seq++; - } -} - -/* bqr_req_acquire sends a batch of ACQUIRE requests. 
*/ - -static void -bqr_req_acquire( client_t * client, - bench_query_rand_t * bqr ) { - FD_CRIT( bqr->req_free, "attempt to acquire a request when none are free" ); - batch_req_t * req = bqr_free_pop( bqr ); - ulong cnt = bqr->batch_depth; - if( FD_UNLIKELY( cnt>bqr->iter_rem ) ) cnt = bqr->iter_rem; - - /* Prepare request descriptor */ - fd_vinyl_key_t * key = batch_req_key ( req ); - schar * err = batch_req_err ( req ); - ulong * val_gaddr = batch_req_val_gaddr( req ); - for( ulong i=0UL; isample_idx; - key [ i ] = bqr->sample[ idx ]; - err [ i ] = 0; - val_gaddr[ i ] = 0UL; - bqr->sample_idx++; - if( bqr->sample_idx>=bqr->sample_max ) bqr->sample_idx = 0UL; - } - - /* Send request */ - ulong req_id = fd_ulong_clear_bit( client->req_id++, 63 ); - ulong link_id = client->link_id; - int type = FD_VINYL_REQ_TYPE_ACQUIRE; - ulong flags = 0UL; - ulong key_gaddr = fd_wksp_gaddr_fast( client->client_wksp, batch_req_key ( req ) ); - ulong val_gaddr_gaddr = fd_wksp_gaddr_fast( client->client_wksp, batch_req_val_gaddr( req ) ); - ulong err_gaddr = fd_wksp_gaddr_fast( client->client_wksp, batch_req_err ( req ) ); - fd_vinyl_rq_send( client->rq, req_id, link_id, type, flags, cnt, key_gaddr, val_gaddr_gaddr, err_gaddr, 0UL ); - - /* Update quotas */ - bqr->iter_rem -= cnt; - client->quota_rem -= cnt; - - req->req_id = req_id; - bqr_wait_push( bqr, req ); -} - -/* bench_query_rand_poll sends as many random read requests to vinyl as - possible. Returns 1 if there is more work to do, 0 if the benchmark - is done. */ - -static int -bench_query_rand_poll( client_t * client, - bench_query_rand_t * bqr ) { - if( bqr->req_wait_lo ) { - bqr_drain_cq( client, bqr ); - } - while( bqr->req_free && bqr->iter_rem ) { - bqr_req_acquire( client, bqr ); - } - return (!!bqr->req_wait_lo) | (!!bqr->iter_rem); -} - -/* client_bench_query_rand runs a random read benchmark against vinyl. - Assumes that RQ and CQ are clean and quota_rem==quota_max. 
*/ - -static void -client_bench_query_rand( client_t * client, - int * pargc, - char *** pargv ) { - - /* Prepare a random query benchmark - - 1. Randomly sample keys into an array (--keys) - 2. Inject random keys at a configurable rate (--miss) to exercise - index query misses - 3. Loop through the sampled keys array until (--iter) queries have - been submitted, while doing batches of (--batch) keys at a time - - The benchmark loop is pipelined/asynchronous. The client will - submit request batches until it is blocked by quota, RQ, or CQ. */ - - ulong batch_depth = fd_env_strip_cmdline_ulong( pargc, pargv, "--batch", NULL, 1UL ); - ulong key_cnt = fd_env_strip_cmdline_ulong( pargc, pargv, "--keys", NULL, 262144UL ); - ulong const iter_cnt = fd_env_strip_cmdline_ulong( pargc, pargv, "--iter", NULL, 1048576UL ); - ulong const seed = fd_env_strip_cmdline_ulong( pargc, pargv, "--seed", NULL, (ulong)fd_tickcount() ); - float const miss_r = fd_env_strip_cmdline_float( pargc, pargv, "--miss", NULL, 0.1f ); - - batch_depth = fd_ulong_max( batch_depth, 1UL ); - key_cnt = fd_ulong_min( key_cnt, UINT_MAX ); - - fd_rng_t _rng[1]; fd_rng_t * rng = fd_rng_join( fd_rng_new( _rng, (uint)fd_ulong_hash( seed ), 0UL ) ); - - fd_vinyl_meta_t * meta = client->meta; - ulong const ele_max = fd_vinyl_meta_ele_max ( meta ); - ulong const probe_max = fd_vinyl_meta_probe_max( meta ); - - /* Allocate a huge page backed scratch memory region to back keys */ - - ulong sample_fp = fd_ulong_align_up( key_cnt*sizeof(fd_vinyl_key_t), FD_SHMEM_HUGE_PAGE_SZ ); - ulong sample_page_sz = FD_SHMEM_NORMAL_PAGE_SZ; - ulong sample_page_cnt = sample_fp>>FD_SHMEM_NORMAL_LG_PAGE_SZ; - fd_vinyl_key_t * sample = fd_shmem_acquire( sample_page_sz, sample_page_cnt, fd_log_cpu_id() ); - ulong sample_cnt = 0UL; - if( FD_UNLIKELY( !sample ) ) { - FD_LOG_WARNING(( "Cannot acquire scratch memory to hold %lu vinyl keys (out of memory). 
Aborting benchmark", key_cnt )); - return; - } - - /* Determine pipeline depth */ - - ulong const rq_ele_depth = fd_vinyl_rq_req_cnt ( client->rq )*batch_depth; - ulong const cq_ele_depth = fd_vinyl_cq_comp_cnt( client->cq )*batch_depth; - ulong const quota_max = fd_ulong_min( client->quota_rem, fd_ulong_min( rq_ele_depth, cq_ele_depth ) ); - ulong const batch_max = ( quota_max + batch_depth - 1UL ) / batch_depth; - - /* Allocate request queue entries */ - - ulong req_footprint = batch_req_footprint( batch_depth ); - ulong req_batch_footprint = batch_max*req_footprint; - ulong req_laddr = (ulong)fd_wksp_alloc_laddr( client->client_wksp, batch_req_align(), req_batch_footprint, 1UL ); - if( FD_UNLIKELY( !req_laddr ) ) { - FD_LOG_WARNING(( "Vinyl client wksp is too small to hold requests. Aborting benchmark" )); - fd_shmem_release( sample, sample_page_sz, sample_page_cnt ); - return; - } - for( ulong batch_idx=0UL, - batch_cur=req_laddr; - batch_idxprev = batch_idx>0UL ? (batch_req_t *)( batch_cur - req_footprint ) : NULL; - req->next = batch_idx+1ULele + meta_idx; - if( FD_LIKELY( fd_vinyl_meta_ele_in_use( ele ) ) ) { - sample[ i ] = ele->phdr.key; - sample_cnt++; - break; - } - meta_idx = (meta_idx+1UL) % ele_max; - } - if( !probe_rem ) { /* no key found (low hashmap utilization) ... 
*/ - for( uint j=0U; j<32U; j+=4U ) FD_STORE( uint, sample[ i ].uc+j, fd_rng_uint( rng ) ); - } - - } - dt += fd_log_wallclock(); - -# if FD_HAS_DOUBLE - FD_LOG_NOTICE(( "Sampled %lu keys in %gs (miss ratio %g)", - key_cnt, (double)dt/1e9, (double)( key_cnt-sample_cnt )/(double)key_cnt )); -# else - FD_LOG_NOTICE(( "Sampled %lu keys in %ldns (%lu missed)", - key_cnt, dt, key_cnt-sample_cnt )); -# endif - - /* Run benchmark */ - - bench_query_rand_t bqr = { - .req_free = req_free, - .req_wait_lo = NULL, - .req_wait_hi = NULL, - .batch_depth = batch_depth, - .iter_rem = iter_cnt, - .sample = sample, - .sample_idx = 0UL, - .sample_max = key_cnt - }; - dt = -fd_log_wallclock(); - while( bench_query_rand_poll( client, &bqr ) ); - dt += fd_log_wallclock(); - -# if FD_HAS_DOUBLE - FD_LOG_NOTICE(( "Completed %lu queries (%lu found, %lu missed) in %gs (%g q/s)", - iter_cnt, bqr.found_cnt, bqr.miss_cnt, - (double)dt/1e9, - (double)iter_cnt / ( (double)dt/1e9 ) )); -# else - FD_LOG_NOTICE(( "Completed %lu queries (%lu found, %lu missed) in %ldns", - iter_cnt, bqr.found_cnt, bqr.miss_cnt, dt )); -# endif - - /* Clean up */ - - fd_rng_delete( fd_rng_leave( rng ) ); - - fd_wksp_free_laddr( (void *)req_laddr ); - - fd_shmem_release( sample, sample_page_sz, sample_page_cnt ); -} - -static int -client_cmd( client_t * client, - char ** tok, - ulong tok_cnt ) { - if( FD_UNLIKELY( !tok_cnt ) ) return 1; - char const * cmd = tok[0]; - if( !strcmp( cmd, "query" ) ) { - client_query( client, tok+1, tok_cnt-1 ); - } else if( !strcmp( cmd, "bench-query-rand" ) ) { - int argc = (int)tok_cnt; - client_bench_query_rand( client, &argc, &tok ); - } else if( !strcmp( cmd, "quit" ) || !strcmp( cmd, "exit" ) ) { - return 0; - } else { - printf( "ERR: unknown command `%s`\n", cmd ); - } - return 1; -} - -static void -repl( client_t * client ) { - char line[ 4096 ] = {0}; -# define TOK_MAX 16 - char * tokens[ 16 ] = {0}; - for(;;) { - fputs( "accdb> ", stdout ); - fflush( stdout ); - - /* Read 
command */ - if( fgets( line, sizeof(line), stdin )==NULL ) { - putc( '\n', stdout ); - break; - } - line[ strcspn( line, "\n" ) ] = '\0'; - line[ sizeof(line)-1 ] = '\0'; - - /* Interpret command */ - ulong tok_cnt = fd_cstr_tokenize( tokens, TOK_MAX, line, ' ' ); - if( !client_cmd( client, tokens, tok_cnt ) ) break; - } -# undef TOK_MAX -} - -int -main( int argc, - char ** argv ) { - fd_boot( &argc, &argv ); - - char const * cfg_gaddr = fd_env_strip_cmdline_cstr ( &argc, &argv, "--cfg", NULL, NULL ); - char const * wksp_name = fd_env_strip_cmdline_cstr ( &argc, &argv, "--wksp", NULL, NULL ); - ulong const burst_max = fd_env_strip_cmdline_ulong( &argc, &argv, "--burst-max", NULL, 1UL ); - ulong const quota_max = fd_env_strip_cmdline_ulong( &argc, &argv, "--quota-max", NULL, 2UL ); - if( FD_UNLIKELY( !cfg_gaddr ) ) FD_LOG_ERR(( "Missing required argument --cfg" )); - if( FD_UNLIKELY( !wksp_name ) ) FD_LOG_ERR(( "Missing required argument --wksp" )); - - argc--; argv++; - - /* Join server shared memory structures */ - - uchar * pod = fd_pod_join( fd_wksp_map( cfg_gaddr ) ); - if( FD_UNLIKELY( !pod ) ) FD_LOG_ERR(( "Invalid --cfg pod" )); - - void * _cnc = fd_wksp_pod_map( pod, "cnc" ); - void * _meta = fd_wksp_pod_map( pod, "meta" ); - void * _ele = fd_wksp_pod_map( pod, "ele" ); - void * _obj = fd_wksp_pod_map( pod, "obj" ); - - fd_cnc_t * cnc = fd_cnc_join( _cnc ); FD_TEST( cnc ); - fd_vinyl_meta_t meta[1]; - FD_TEST( fd_vinyl_meta_join( meta, _meta, _ele ) ); - - ulong vinyl_status = fd_cnc_signal_query( cnc ); - if( FD_UNLIKELY( vinyl_status!=FD_CNC_SIGNAL_RUN ) ) { - char status_cstr[ FD_CNC_SIGNAL_CSTR_BUF_MAX ]; - FD_LOG_ERR(( "Vinyl tile not running (status %lu-%s)", vinyl_status, fd_cnc_signal_cstr( vinyl_status, status_cstr ) )); - } - - /* Allocate client structures */ - - fd_wksp_t * wksp = fd_wksp_attach( wksp_name ); - FD_TEST( wksp ); - - ulong const rq_max = 32UL; - ulong const cq_max = 32UL; - void * _rq = fd_wksp_alloc_laddr( wksp, 
fd_vinyl_rq_align(), fd_vinyl_rq_footprint( rq_max ), 1UL ); - void * _cq = fd_wksp_alloc_laddr( wksp, fd_vinyl_cq_align(), fd_vinyl_cq_footprint( cq_max ), 1UL ); - fd_vinyl_rq_t * rq = fd_vinyl_rq_join( fd_vinyl_rq_new( _rq, rq_max ) ); - fd_vinyl_cq_t * cq = fd_vinyl_cq_join( fd_vinyl_cq_new( _cq, cq_max ) ); - if( FD_UNLIKELY( !rq || !cq ) ) { - FD_LOG_WARNING(( "Failed to allocate request/completion queues" )); - goto dealloc2; - } - - ulong req_info_gaddr = fd_wksp_alloc( wksp, alignof(req_info_t), sizeof(req_info_t), 1UL ); - if( FD_UNLIKELY( !req_info_gaddr ) ) { - FD_LOG_WARNING(( "Failed to pre-allocate request metadata" )); - goto dealloc1; - } - req_info_t * req_info = fd_wksp_laddr_fast( wksp, req_info_gaddr ); - - /* Run client */ - - ulong const link_id = 0UL; - int join_err = fd_vinyl_client_join( cnc, rq, cq, wksp, link_id, burst_max, quota_max ); - if( FD_UNLIKELY( join_err ) ) FD_LOG_ERR(( "Failed to join vinyl client to server (err %i-%s)", join_err, fd_vinyl_strerror( join_err ) )); - - FD_LOG_NOTICE(( "Attached client" )); - - client_t client = { - .rq = rq, - .cq = cq, - .req_id = 0UL, - .link_id = link_id, - - .meta = meta, - - .req_info = req_info, - .req_info_gaddr = req_info_gaddr, - .val_wksp = fd_wksp_containing( _obj ), - .client_wksp = wksp, - - .quota_rem = quota_max, - .cq_seq = fd_vinyl_cq_seq( cq ) - }; - - if( argc>0 ) { - client_cmd( &client, argv, (ulong)argc ); - } else { - repl( &client ); - } - - FD_LOG_NOTICE(( "Detaching client" )); - - int leave_err = fd_vinyl_client_leave( cnc, link_id ); - if( FD_UNLIKELY( leave_err ) ) FD_LOG_ERR(( "Failed to leave vinyl client from server (err %i-%s)", leave_err, fd_vinyl_strerror( leave_err ) )); - -dealloc1: - fd_wksp_free( wksp, req_info_gaddr ); - -dealloc2: - fd_wksp_free_laddr( fd_vinyl_rq_delete( fd_vinyl_rq_leave( rq ) ) ); - fd_wksp_free_laddr( fd_vinyl_cq_delete( fd_vinyl_cq_leave( cq ) ) ); - - fd_wksp_unmap( fd_cnc_leave( cnc ) ); - fd_vinyl_meta_leave( meta ); - 
fd_wksp_unmap( _meta ); - fd_wksp_unmap( _ele ); - fd_wksp_unmap( _obj ); - fd_wksp_detach( wksp ); - - fd_halt(); - return 0; -} diff --git a/src/flamenco/accdb/fd_accdb_impl_v2.c b/src/flamenco/accdb/fd_accdb_impl_v2.c index 3c49d56f8de..daa8e1f42cf 100644 --- a/src/flamenco/accdb/fd_accdb_impl_v2.c +++ b/src/flamenco/accdb/fd_accdb_impl_v2.c @@ -1,6 +1,9 @@ #include "fd_accdb_impl_v2.h" #include "fd_accdb_funk.h" +#include "fd_accdb_specread.h" #include "fd_vinyl_req_pool.h" +#include "../../vinyl/data/fd_vinyl_data.h" +#include "../../ballet/base58/fd_base58.h" #include FD_STATIC_ASSERT( alignof(fd_accdb_user_v2_t)<=alignof(fd_accdb_user_t), layout ); @@ -244,11 +247,11 @@ funk_open_ref( fd_accdb_user_v2_t * accdb, /* Traverse chain for candidate */ fd_funk_rec_t * rec = NULL; int err; - for(;;) { + for( ulong backoff=1UL; ; ) { err = funk_rec_acquire( accdb, chain_idx, key, &rec, is_write ); if( FD_LIKELY( err!=ACQUIRE_FAILED ) ) break; - FD_SPIN_PAUSE(); - /* FIXME backoff */ + for( ulong i=0UL; iaddress, address, 32UL ); @@ -292,17 +295,84 @@ fd_accdb_user_v2_open_ro_multi( fd_accdb_user_t * accdb, fd_accdb_lineage_set_fork( v2->lineage, v2->funk, xid ); ulong addr_laddr = (ulong)addr0; + long t0 = fd_tickcount(); for( ulong i=0UL; ibase.ro_active++; + v2->base.lookup_funk++; } else { fd_accdb_ro_init_empty( &ro0[i], addr_i ); } } + v2->base.dt_funk += fd_tickcount() - t0; + + /* Speculative cache reads — attempt pin-based direct reads for + accounts not found in funk. On success, the caller gets a + zero-copy pointer into the vinyl data cache. + + Holding specread pins across the ACQUIRE spin-wait below can + deadlock (root invalidation spin-drains pins while blocking + request processing). Therefore specread is only useful when it + can resolve ALL remaining accounts, avoiding the ACQUIRE + entirely. If any specread misses, we unpin everything and fall + through to ACQUIRE for the whole batch. 
*/ + + t0 = fd_tickcount(); + if( v2->vinyl_line_cnt ) { + + /* Pin pass — attempt specread for every non-funk account. + Track how many we still need so we can detect partial + coverage without a second scan. */ + + ulong spec_need = 0UL; + ulong spec_hit = 0UL; + for( ulong i=0UL; iaccdb_type!=FD_ACCDB_TYPE_NONE ) continue; + spec_need++; + void const * addr_i = (void const *)( (ulong)addr_laddr + i*32UL ); + fd_vinyl_key_t vkey[1]; + fd_vinyl_key_init( vkey, addr_i, 32UL ); + fd_account_meta_t const * meta; + ulong spec_line_idx; + int specerr = fd_accdb_specread_pin( v2->vinyl_meta, + v2->vinyl_line, v2->vinyl_line_cnt, v2->vinyl_specrd_wksp, + vkey, &meta, &spec_line_idx ); + if( specerr==FD_VINYL_SUCCESS ) { + spec_hit++; + ro0[i] = (fd_accdb_ro_t){0}; + memcpy( ro0[i].ref->address, addr_i, 32UL ); + ro0[i].ref->accdb_type = FD_ACCDB_TYPE_V2S; + ro0[i].ref->ref_type = FD_ACCDB_REF_RO; + ro0[i].ref->user_data = spec_line_idx; + ro0[i].meta = meta; + } else if( specerr==FD_VINYL_ERR_KEY ) { + fd_accdb_ro_init_empty( &ro0[i], addr_i ); + ro0[i].ref->user_data2 = 1; + } + } - /* For the accounts that were not found in funk, open vinyl records */ - + // if( spec_hitaccdb_type!=FD_ACCDB_TYPE_V2S ) continue; + // fd_accdb_specread_unpin( v2->vinyl_line, ro0[i].ref->user_data ); + // ro0[i].ref->accdb_type = FD_ACCDB_TYPE_NONE; + // ro0[i].ref->user_data = 0UL; + // ro0[i].meta = NULL; + // } + // } else { + // /* Full coverage — commit all specread results */ + v2->base.ro_active += spec_hit; + v2->base.lookup_specrd += spec_hit; + // } + } + v2->base.dt_specrd += fd_tickcount() - t0; + + /* For the accounts that were not found in funk or specread, + open vinyl records via rq/cq */ + + t0 = fd_tickcount(); ulong batch_idx = fd_vinyl_req_pool_acquire( req_pool ); /* req_pool_release called before returning */ fd_vinyl_comp_t * comp = fd_vinyl_req_batch_comp ( req_pool, batch_idx ); @@ -314,10 +384,12 @@ fd_accdb_user_v2_open_ro_multi( fd_accdb_user_t * accdb, ulong 
req_cnt = 0UL; for( ulong i=0UL; iaccdb_type!=FD_ACCDB_TYPE_NONE ) continue; - /* At this point, addr0[i] not found in funk, load from vinyl */ + if( ro0[i].ref->accdb_type!=FD_ACCDB_TYPE_NONE || + ro0[i].ref->user_data2!=0 ) { + continue; + } void const * addr_i = (void const *)( (ulong)addr0 + i*32UL ); - + FD_BASE58_ENCODE_32_BYTES( addr_i, addr_b58 ); fd_vinyl_key_init( req_key0+req_cnt, addr_i, 32UL ); req_err0 [ req_cnt ] = 0; req_val_gaddr0[ req_cnt ] = 0UL; @@ -326,9 +398,12 @@ fd_accdb_user_v2_open_ro_multi( fd_accdb_user_t * accdb, if( !req_cnt ) { /* All records were found in funk, bail early */ fd_vinyl_req_pool_release( req_pool, batch_idx ); + v2->base.dt_vinyl += fd_tickcount() - t0; return; } + v2->base.lookup_accdb += req_cnt; + /* Send read-only "ACQUIRE" batch to vinyl and wait for response */ ulong req_id = v2->vinyl_req_id++; @@ -347,7 +422,10 @@ fd_accdb_user_v2_open_ro_multi( fd_accdb_user_t * accdb, req_cnt = 0UL; for( ulong i=0UL; iaccdb_type!=FD_ACCDB_TYPE_NONE ) continue; + if( ro0[i].ref->accdb_type!=FD_ACCDB_TYPE_NONE || + ro0[i].ref->user_data2!=0 ) { + continue; + } void const * addr_i = (void const *)( (ulong)addr0 + i*32UL ); int req_err = FD_VOLATILE_CONST( req_err0 [ req_cnt ] ); @@ -365,12 +443,15 @@ fd_accdb_user_v2_open_ro_multi( fd_accdb_user_t * accdb, ro->ref->ref_type = FD_ACCDB_REF_RO; ro->meta = meta; } else if( FD_UNLIKELY( req_err!=FD_VINYL_ERR_KEY ) ) { - FD_LOG_CRIT(( "vinyl tile ACQUIRE request failed: %i-%s", req_err, fd_vinyl_strerror( req_err ) )); + FD_LOG_CRIT(( "vinyl tile ACQUIRE request failed: %i-%s (idx=%lu cnt=%lu)", + req_err, fd_vinyl_strerror( req_err ), + i, cnt )); } req_cnt++; } fd_vinyl_req_pool_release( req_pool, batch_idx ); + v2->base.dt_vinyl += fd_tickcount() - t0; /* At this point, ownership of vinyl records transitions to caller. 
(Released using close_ro_multi) */ @@ -416,14 +497,94 @@ fd_accdb_user_v2_open_rw_multi( fd_accdb_user_t * accdb, finishes) */ ulong addr_laddr = (ulong)addr0; + long t0 = fd_tickcount(); for( ulong i=0UL; ibase.dt_funk += fd_tickcount() - t0; + + /* Speculative cache reads — for accounts not found in funk, attempt + pin-based direct reads from the vinyl cache. On success, copy the + account into a new writable funk record and unpin immediately. + The pin is short-lived (held only for the memcpy) so there is no + deadlock risk with root invalidation. On miss/contention, fall + through to the rq/cq ACQUIRE path below. */ + + t0 = fd_tickcount(); + if( v2->vinyl_line_cnt ) { + for( ulong i=0UL; iref_type!=FD_ACCDB_REF_INVAL ) continue; + void const * addr_i = (void const *)( (ulong)addr_laddr + i*32UL ); + fd_vinyl_key_t vkey[1]; + fd_vinyl_key_init( vkey, addr_i, 32UL ); + fd_account_meta_t const * src_meta; + ulong spec_line_idx; + int specerr = fd_accdb_specread_pin( v2->vinyl_meta, + v2->vinyl_line, v2->vinyl_line_cnt, v2->vinyl_specrd_wksp, + vkey, &src_meta, &spec_line_idx ); + if( specerr==FD_VINYL_ERR_KEY ) goto tombstone; + if( specerr!=FD_VINYL_SUCCESS ) continue; /* fall through to ACQUIRE */ + + uchar const * src_data = (uchar *)( src_meta+1 ); + + if( FD_UNLIKELY( src_meta->lamports==0UL ) ) { + /* Tombstone — unpin and handle via create-or-skip */ + fd_accdb_specread_unpin( v2->vinyl_line, spec_line_idx ); +tombstone: + if( flag_create ) { + fd_accdb_funk_create( v2->funk, &rw0[i], txn, addr_i, data_max0[i] ); + fd_funk_rec_write_lock_uncontended( v2->funk, (fd_funk_rec_t *)rw0[i].ref->user_data ); + accdb->base.rw_active++; + } else { + memset( &rw0[i], 0, sizeof(fd_accdb_ref_t) ); + /* Mark as handled so it doesn't leak into ACQUIRE batch + (ref_type!=INVAL skips batch builder) and promotion loop + (user_data2!=0 skips RW branch). accdb_type remains + NONE so the caller sees a not-found result. 
*/ + rw0[i].ref->ref_type = FD_ACCDB_REF_RW; + rw0[i].ref->user_data2 = ULONG_MAX; + } + continue; + } + + ulong acc_orig_sz = src_meta->dlen; + ulong val_sz_min = sizeof(fd_account_meta_t)+fd_ulong_max( data_max0[i], acc_orig_sz ); + ulong acc_sz = flag_truncate ? 0UL : acc_orig_sz; + ulong val_sz = sizeof(fd_account_meta_t)+acc_sz; + ulong val_max = 0UL; + void * val = fd_alloc_malloc_at_least( funk->alloc, 16UL, val_sz_min, &val_max ); + if( FD_UNLIKELY( !val ) ) { + FD_LOG_CRIT(( "Failed to modify account: out of memory allocating %lu bytes", acc_orig_sz )); + } + + fd_account_meta_t * dst_meta = val; + uchar * dst_data = (uchar *)( dst_meta+1 ); + ulong data_max_actual = val_max - sizeof(fd_account_meta_t); + if( flag_truncate ) fd_accdb_funk_copy_truncated( dst_meta, src_meta ); + else fd_accdb_funk_copy_account ( dst_meta, dst_data, src_meta, src_data ); + + /* Unpin immediately — data has been copied to funk */ + fd_accdb_specread_unpin( v2->vinyl_line, spec_line_idx ); + + if( acc_orig_szuser_data ); + accdb->base.rw_active++; + accdb->base.created_cnt++; + } + } + v2->base.dt_specrd += fd_tickcount() - t0; - /* For the accounts that were not found in funk, create writable funk - records from elements in vinyl. */ + /* For the accounts that were not found in funk or specread, create + writable funk records from elements in vinyl. 
*/ + t0 = fd_tickcount(); ulong batch_idx = fd_vinyl_req_pool_acquire( req_pool ); /* req_pool_release called before returning */ fd_vinyl_comp_t * comp = fd_vinyl_req_batch_comp ( req_pool, batch_idx ); @@ -459,9 +620,11 @@ fd_accdb_user_v2_open_rw_multi( fd_accdb_user_t * accdb, FD_LOG_CRIT(( "vinyl tile rejected my ACQUIRE request: %i-%s", comp_err, fd_vinyl_strerror( comp_err ) )); } } + v2->base.dt_vinyl += fd_tickcount() - t0; /* Promote any found accounts to writable accounts */ + ulong vinyl_cnt = req_cnt; req_cnt = 0UL; for( ulong i=0UL; iref->ref_type==FD_ACCDB_REF_RW ) { + + /* Entries already created by specread have user_data2 set to + the txn pointer (by fd_accdb_funk_prep_create). Funk + write-locked entries from funk_open_ref have user_data2==0. */ + if( rw->ref->user_data2 ) continue; + /* Mutable record found, modify in-place */ if( FD_UNLIKELY( !flag_create && fd_accdb_ref_lamports( rw->ro )==0UL ) ) { @@ -553,7 +722,6 @@ fd_accdb_user_v2_open_rw_multi( fd_accdb_user_t * accdb, } else { memset( rw, 0, sizeof(fd_accdb_ref_t) ); } - req_cnt++; continue; } @@ -595,22 +763,16 @@ fd_accdb_user_v2_open_rw_multi( fd_accdb_user_t * accdb, accdb->base.created_cnt++; } - /* Send "RELEASE" batch (reuse val_gaddr values), - and wait for response */ - - if( req_cnt ) { - ulong req_id = v2->vinyl_req_id++; - memset( fd_vinyl_req_batch_comp( req_pool, batch_idx ), 0, sizeof(fd_vinyl_comp_t) ); - fd_vinyl_req_send_batch( rq, req_pool, req_wksp, req_id, link_id, FD_VINYL_REQ_TYPE_RELEASE, 0UL, batch_idx, req_cnt ); - - while( FD_VOLATILE_CONST( comp->seq )!=1UL ) FD_SPIN_PAUSE(); - FD_COMPILER_MFENCE(); - int comp_err = FD_VOLATILE_CONST( comp->err ); - if( FD_UNLIKELY( comp_err!=FD_VINYL_SUCCESS ) ) { - FD_LOG_CRIT(( "vinyl tile rejected my RELEASE request: %i-%s", comp_err, fd_vinyl_strerror( comp_err ) )); - } + /* Release vinyl records: decrement ref count directly in shared + memory. The data was copied to funk so we no longer need the + vinyl cache entries. 
*/ + for( ulong i=0UL; ivinyl_line[ obj->line_idx ].ctl, 1UL ); } - fd_vinyl_req_pool_release( req_pool, batch_idx ); } @@ -641,56 +803,28 @@ void fd_accdb_user_v2_close_ref_multi( fd_accdb_user_t * accdb, fd_accdb_ref_t * ref0, ulong cnt ) { - fd_accdb_user_v2_t * v2 = (fd_accdb_user_v2_t *)accdb; - fd_vinyl_rq_t * rq = v2->vinyl_rq; /* "request queue "*/ - fd_vinyl_req_pool_t * req_pool = v2->vinyl_req_pool; /* "request pool" */ - fd_wksp_t * req_wksp = v2->vinyl_req_wksp; /* shm workspace containing request buffer */ - fd_wksp_t * data_wksp = v2->vinyl_data_wksp; /* shm workspace containing vinyl data cache */ - ulong link_id = v2->vinyl_link_id; /* vinyl client ID */ + fd_accdb_user_v2_t * v2 = (fd_accdb_user_v2_t *)accdb; - if( FD_UNLIKELY( cnt>fd_vinyl_req_batch_key_max( req_pool ) ) ) { - FD_LOG_CRIT(( "close_ref_multi cnt %lu exceeds vinyl request batch max %lu", - cnt, fd_vinyl_req_batch_key_max( req_pool ) )); + /* Release specread pins (V2S) */ + for( ulong i=0UL; iaccdb_type!=FD_ACCDB_TYPE_V2S ) continue; + fd_accdb_specread_unpin( v2->vinyl_line, ref->user_data ); + accdb->base.ro_active--; + memset( ref, 0, sizeof(fd_accdb_ref_t) ); } - /* First, release all references to vinyl records - (This is a prefetch friendly / fast loop) */ - - ulong batch_idx = fd_vinyl_req_pool_acquire( req_pool ); - /* req_pool_release called before returning */ - fd_vinyl_comp_t * comp = fd_vinyl_req_batch_comp ( req_pool, batch_idx ); - schar * req_err0 = fd_vinyl_req_batch_err ( req_pool, batch_idx ); - ulong * req_val_gaddr0 = fd_vinyl_req_batch_val_gaddr( req_pool, batch_idx ); - - ulong ro_close_cnt = 0UL; - ulong rw_close_cnt = 0UL; - ulong req_cnt = 0UL; + /* Release vinyl records acquired via rq/cq: decrement ref count directly */ for( ulong i=0UL; iaccdb_type!=FD_ACCDB_TYPE_V2 ) continue; - ref->ref_type==FD_ACCDB_REF_RO ? 
ro_close_cnt++ : rw_close_cnt++; - req_err0 [ req_cnt ] = 0; - req_val_gaddr0[ req_cnt ] = fd_wksp_gaddr_fast( data_wksp, (void *)ref->meta_laddr ); + fd_vinyl_data_obj_t * obj = fd_vinyl_data_obj( (void *)ref->meta_laddr ); + FD_ATOMIC_FETCH_AND_SUB( &v2->vinyl_line[ obj->line_idx ].ctl, 1UL ); + accdb->base.ro_active--; memset( ref, 0, sizeof(fd_accdb_ref_t) ); - req_cnt++; - } - if( req_cnt ) { - if( FD_UNLIKELY( ro_close_cnt > accdb->base.ro_active ) ) { - FD_LOG_CRIT(( "attempted to close more accdb_ro (%lu) than are open (%lu)", - ro_close_cnt, accdb->base.ro_active )); - } - if( FD_UNLIKELY( rw_close_cnt > accdb->base.rw_active ) ) { - FD_LOG_CRIT(( "attempted to close more accdb_rw (%lu) than are open (%lu)", - rw_close_cnt, accdb->base.rw_active )); - } - ulong req_id = v2->vinyl_req_id++; - memset( fd_vinyl_req_batch_comp( req_pool, batch_idx ), 0, sizeof(fd_vinyl_comp_t) ); - fd_vinyl_req_send_batch( rq, req_pool, req_wksp, req_id, link_id, FD_VINYL_REQ_TYPE_RELEASE, 0UL, batch_idx, req_cnt ); } - /* While our vinyl request is inflight, release funk records - (This does expensive DRAM accesses, which are convenient to do when - we are waiting for the database to asynchronously respond) */ + /* Release funk records */ for( ulong i=0UL; iseq )!=1UL ) FD_SPIN_PAUSE(); - FD_COMPILER_MFENCE(); - int comp_err = FD_VOLATILE_CONST( comp->err ); - if( FD_UNLIKELY( comp_err!=FD_VINYL_SUCCESS ) ) { - FD_LOG_CRIT(( "vinyl tile rejected my RELEASE request: %i-%s", comp_err, fd_vinyl_strerror( comp_err ) )); - } - for( ulong i=0UL; ibase.ro_active -= ro_close_cnt; - accdb->base.rw_active -= rw_close_cnt; - } - - fd_vinyl_req_pool_release( req_pool, batch_idx ); } ulong @@ -780,6 +892,7 @@ fd_accdb_user_v2_init( fd_accdb_user_t * accdb_, void * vinyl_rq, void * vinyl_data, void * vinyl_req_pool, + void * vinyl_line, ulong vinyl_link_id, ulong max_depth ) { if( FD_UNLIKELY( !accdb_ ) ) { @@ -817,11 +930,33 @@ fd_accdb_user_v2_init( fd_accdb_user_t * accdb_, 
accdb->vinyl_data_wksp = vinyl_data; accdb->vinyl_req_wksp = fd_wksp_containing( req_pool ); accdb->vinyl_req_pool = req_pool; + accdb->vinyl_line = vinyl_line; accdb->base.accdb_type = FD_ACCDB_TYPE_V2; accdb->base.vt = &fd_accdb_user_v2_vt; return accdb_; } +void +fd_accdb_user_v2_init_cache( fd_accdb_user_t * accdb_, + void * vinyl_shmeta, + void * vinyl_shele, + void * vinyl_shline, + ulong vinyl_line_cnt ) { + fd_accdb_user_v2_t * v2 = fd_type_pun( accdb_ ); + + if( FD_UNLIKELY( !vinyl_shmeta || !vinyl_shele || !vinyl_shline || !vinyl_line_cnt ) ) { + /* Specread disabled */ + v2->vinyl_line_cnt = 0UL; + v2->vinyl_specrd_wksp = NULL; + return; + } + + FD_TEST( fd_vinyl_meta_join( v2->vinyl_meta, vinyl_shmeta, vinyl_shele ) ); + v2->vinyl_line = (fd_vinyl_line_t *)vinyl_shline; + v2->vinyl_line_cnt = vinyl_line_cnt; + v2->vinyl_specrd_wksp = v2->vinyl_data_wksp; /* same workspace */ +} + void fd_accdb_user_v2_fini( fd_accdb_user_t * accdb ) { fd_accdb_user_v2_t * user = (fd_accdb_user_v2_t *)accdb; diff --git a/src/flamenco/accdb/fd_accdb_impl_v2.h b/src/flamenco/accdb/fd_accdb_impl_v2.h index 60286d67073..fc02721901c 100644 --- a/src/flamenco/accdb/fd_accdb_impl_v2.h +++ b/src/flamenco/accdb/fd_accdb_impl_v2.h @@ -13,6 +13,7 @@ #include "../../vinyl/cq/fd_vinyl_cq.h" #include "../../vinyl/rq/fd_vinyl_rq.h" +#include "../../vinyl/line/fd_vinyl_line.h" #include "fd_accdb_user.h" #include "fd_accdb_lineage.h" #include "fd_vinyl_req_pool.h" @@ -47,6 +48,12 @@ struct fd_accdb_user_v2 { fd_wksp_t * vinyl_data_wksp; fd_wksp_t * vinyl_req_wksp; fd_vinyl_req_pool_t * vinyl_req_pool; + fd_vinyl_line_t * vinyl_line; /* vinyl cache line array (shared memory) */ + + /* Speculative read (specread) state — populated by init_cache */ + fd_vinyl_meta_t vinyl_meta[1]; /* local join of meta map */ + ulong vinyl_line_cnt; /* number of cache lines */ + fd_wksp_t * vinyl_specrd_wksp; /* data workspace for gaddr resolution */ }; typedef struct fd_accdb_user_v2 fd_accdb_user_v2_t; @@ 
-62,9 +69,24 @@ fd_accdb_user_v2_init( fd_accdb_user_t * ljoin, void * vinyl_rq, void * vinyl_data, void * vinyl_req_pool, + void * vinyl_line, ulong vinyl_link_id, ulong max_depth ); +/* fd_accdb_user_v2_init_cache enables speculative reads on an + already-initialized v2 accdb client. vinyl_shmeta / vinyl_shele / + vinyl_shline point to the shared meta map, element pool, and line + array created by the accdb tile. vinyl_line_cnt is the number of + cache lines. If vinyl_shmeta is NULL, specread is disabled (the + client only uses rq/cq). */ + +void +fd_accdb_user_v2_init_cache( fd_accdb_user_t * ljoin, + void * vinyl_shmeta, + void * vinyl_shele, + void * vinyl_shline, + ulong vinyl_line_cnt ); + FD_PROTOTYPES_END #endif /* HEADER_fd_src_flamenco_accdb_fd_accdb_impl_v2_h */ diff --git a/src/flamenco/accdb/fd_accdb_specread.h b/src/flamenco/accdb/fd_accdb_specread.h new file mode 100644 index 00000000000..c094f39e032 --- /dev/null +++ b/src/flamenco/accdb/fd_accdb_specread.h @@ -0,0 +1,136 @@ +#ifndef HEADER_fd_src_flamenco_accdb_fd_accdb_specread_h +#define HEADER_fd_src_flamenco_accdb_fd_accdb_specread_h + +/* fd_accdb_specread.h provides pin-based speculative reads of rooted + account data from the vinyl cache. + + A specread client is a tile (replay, exec, ...) that has read-only + access to the vinyl meta map, element pool, and line array, plus the + data workspace. It reads cached account data directly from shared + memory, bypassing the rq/cq round-trip. On cache miss or contention, + the client falls back to the normal vinyl ACQUIRE path. + + Pin protocol: + + 1. fd_vinyl_meta_query_try → ele_idx, line_idx + 2. Validate meta seqlock via fd_vinyl_meta_query_test + 3. FETCH_AND_ADD(&line[line_idx].ctl, 1) to pin + 4. Bail if EVICTING, ref < 0, or cross-validation fails + 5. Resolve obj_gaddr, check rd_active == 0 + 6. Point caller at val data (zero-copy) + 7. 
On close: FETCH_AND_SUB(&line[line_idx].ctl, 1) to unpin */ + +#include "../../vinyl/line/fd_vinyl_line.h" /* includes meta + data */ +#include "../../discof/accdb/fd_accdb_line_ctl.h" +#include "../fd_flamenco_base.h" /* fd_account_meta_t */ + +FD_PROTOTYPES_BEGIN + +/* fd_accdb_specread_pin attempts to pin a cached account and return a + direct pointer to its metadata. + + meta is the caller's join of the vinyl meta map. line points to the + cache line array and line_cnt is the number of entries in it. + data_wksp is the workspace used to turn a line's obj_gaddr into a + local pointer. key is the account to look up. + + On success (FD_VINYL_SUCCESS): *out_meta points to the + fd_account_meta_t in the data cache, *out_line_idx gives the pinned + line. The caller MUST call fd_accdb_specread_unpin when done. + + On failure: FD_VINYL_ERR_KEY (key not in meta) or FD_VINYL_ERR_AGAIN + (transient: eviction in progress, I/O pending, seqlock contention). + No pin is held; caller should fall back to rq/cq ACQUIRE. + *out_meta and *out_line_idx are left untouched on failure. */ + +static inline int +fd_accdb_specread_pin( fd_vinyl_meta_t * meta, + fd_vinyl_line_t * line, + ulong line_cnt, + fd_wksp_t * data_wksp, + fd_vinyl_key_t const * key, + fd_account_meta_t const ** out_meta, + ulong * out_line_idx ) { + + /* 1. Lockfree query of the meta map for key */ + + fd_vinyl_meta_query_t query[1]; + int err = fd_vinyl_meta_query_try( meta, key, NULL, query, 0 /* non-blocking */ ); + if( FD_UNLIKELY( err ) ) return err; /* ERR_KEY or ERR_AGAIN */ + + fd_vinyl_meta_ele_t const * ele = fd_vinyl_meta_query_ele_const( query ); + + /* Speculatively copy the fields of interest. They are only trusted + once fd_vinyl_meta_query_test below reports success. ele_idx is + the element's offset from the base of the shared element array. */ + + ulong ctl = ele->phdr.ctl; + ulong line_idx = ele->line_idx; + ulong ele_idx = (ulong)( ele - (fd_vinyl_meta_ele_t const *)fd_vinyl_meta_shele_const( meta ) ); + + /* 2. Validate meta seqlock — detect torn reads */ + + if( FD_UNLIKELY( fd_vinyl_meta_query_test( query ) ) ) return FD_VINYL_ERR_AGAIN; + + /* Key not in bstream or being created? Either ctl value (0 or + ULONG_MAX) is reported as a transient miss. */ + + if( FD_UNLIKELY( !ctl || ctl==ULONG_MAX ) ) return FD_VINYL_ERR_AGAIN; + + /* 3. Validate line_idx in range (key might not be cached) */ + + if( FD_UNLIKELY( line_idx>=line_cnt ) ) return FD_VINYL_ERR_AGAIN; + + /* 4.
Pin: atomically increment ref count. The increment happens + before the line state is inspected; every bail-out below undoes + it with a matching decrement before returning. */ + + ulong old_ctl = FD_ATOMIC_FETCH_AND_ADD( &line[ line_idx ].ctl, 1UL ); + + /* 5. If EVICTING set or ref was negative (acquired for modify), + undo the pin immediately. old_ctl is presumably the value from + before our increment (fetch-and-add) — TODO confirm. */ + + if( FD_UNLIKELY( (old_ctl & FD_ACCDB_LINE_CTL_EVICTING) || + fd_accdb_line_ctl_ref( old_ctl ) < 0L ) ) { + FD_ATOMIC_FETCH_AND_SUB( &line[ line_idx ].ctl, 1UL ); + return FD_VINYL_ERR_AGAIN; + } + + /* 6. Resolve obj_gaddr. A zero obj_gaddr is treated as a miss. + NOTE(review): this load and the loads in steps 7 and 9 are + plain reads of shared memory — confirm the atomic increment + above provides the ordering needed against the writer. */ + + ulong obj_gaddr = line[ line_idx ].obj_gaddr; + if( FD_UNLIKELY( !obj_gaddr ) ) { + FD_ATOMIC_FETCH_AND_SUB( &line[ line_idx ].ctl, 1UL ); + return FD_VINYL_ERR_AGAIN; + } + + /* 7. Cross-validate: line still maps to same meta element. + NOTE(review): only the line's ele_idx is compared; the key + itself is not re-read after the pin — confirm that suffices. */ + + if( FD_UNLIKELY( line[ line_idx ].ele_idx != ele_idx ) ) { + FD_ATOMIC_FETCH_AND_SUB( &line[ line_idx ].ctl, 1UL ); + return FD_VINYL_ERR_AGAIN; + } + + /* 8. Resolve to local address */ + + fd_vinyl_data_obj_t * obj = (fd_vinyl_data_obj_t *) + fd_wksp_laddr_fast( data_wksp, obj_gaddr ); + + /* 9. Check I/O not in progress (rd_active must read as zero) */ + + if( FD_UNLIKELY( obj->rd_active ) ) { + FD_ATOMIC_FETCH_AND_SUB( &line[ line_idx ].ctl, 1UL ); + return FD_VINYL_ERR_AGAIN; + } + + /* 10. Success — return pointer directly into cache (zero-copy). + fd_vinyl_data_obj_val returns the start of the val payload, + which for accdb is fd_account_meta_t. The pin taken in step 4 + is still held on this path. */ + + *out_meta = (fd_account_meta_t const *)fd_vinyl_data_obj_val( obj ); + *out_line_idx = line_idx; + return FD_VINYL_SUCCESS; +} + +/* fd_accdb_specread_unpin releases a pin acquired by + fd_accdb_specread_pin. Must be called exactly once per successful + pin. 
*/ + +static inline void +fd_accdb_specread_unpin( fd_vinyl_line_t * line, + ulong line_idx ) { + FD_ATOMIC_FETCH_AND_SUB( &line[ line_idx ].ctl, 1UL ); +} + +FD_PROTOTYPES_END + +#endif /* HEADER_fd_src_flamenco_accdb_fd_accdb_specread_h */ diff --git a/src/flamenco/accdb/fd_accdb_user.h b/src/flamenco/accdb/fd_accdb_user.h index a38eae75616..94271545a76 100644 --- a/src/flamenco/accdb/fd_accdb_user.h +++ b/src/flamenco/accdb/fd_accdb_user.h @@ -123,6 +123,16 @@ struct fd_accdb_user_base { ulong rw_active; ulong ro_active; ulong created_cnt; + + /* Cache hit tracking for open_ro_multi */ + ulong lookup_funk; /* Account found in funk (in-memory fork store) */ + ulong lookup_specrd; /* Account found via speculative read (vinyl cache) */ + ulong lookup_accdb; /* Account requested from accdb tile (vinyl rq/cq) */ + + /* Cumulative tickcount spent in each lookup regime */ + long dt_funk; /* Time spent in funk lookups */ + long dt_specrd; /* Time spent in specread pin/unpin */ + long dt_vinyl; /* Time spent waiting for vinyl rq/cq */ }; typedef struct fd_accdb_user_base fd_accdb_user_base_t; diff --git a/src/flamenco/accdb/test_accdb_v2.c b/src/flamenco/accdb/test_accdb_v2.c deleted file mode 100644 index 71c12792b31..00000000000 --- a/src/flamenco/accdb/test_accdb_v2.c +++ /dev/null @@ -1,670 +0,0 @@ -#include "fd_accdb_base.h" -#include "fd_accdb_admin_v1.h" -#include "fd_accdb_impl_v2.h" -#include "fd_accdb_admin.h" -#include "fd_accdb_sync.h" -#include "fd_accdb_pipe.h" -#include "../../vinyl/fd_vinyl.h" - -#define WKSP_TAG (1UL) - -static uchar const s_key_a[ 32 ] = { 1 }; /* a: present in vinyl, account exists */ -static uchar const s_key_b[ 32 ] = { 2 }; /* b: present in vinyl, tombstone */ -static uchar const s_key_c[ 32 ] = { 3 }; /* c: present in funk, account exists */ -static uchar const s_key_d[ 32 ] = { 4 }; /* d: present in funk, tombstone*/ -static uchar const s_key_e[ 32 ] = { 5 }; /* e: not found */ - -static int -fd_vinyl_tile( int argc, - char ** 
argv ) { - (void)argc; - fd_vinyl_exec( (fd_vinyl_t *)argv ); - return 0; -} - -static void -add_account_vinyl( fd_accdb_user_t * accdb_, - uchar const * key, - ulong lamports ) { - fd_accdb_user_v2_t * accdb = (fd_accdb_user_v2_t *)accdb_; - - /* Start write */ - ulong batch_idx = fd_vinyl_req_pool_acquire ( accdb->vinyl_req_pool ); - fd_vinyl_key_t * req_key = fd_vinyl_req_batch_key ( accdb->vinyl_req_pool, batch_idx ); - ulong * req_val_gaddr = fd_vinyl_req_batch_val_gaddr( accdb->vinyl_req_pool, batch_idx ); - schar * req_err = fd_vinyl_req_batch_err ( accdb->vinyl_req_pool, batch_idx ); - fd_vinyl_comp_t * comp = fd_vinyl_req_batch_comp ( accdb->vinyl_req_pool, batch_idx ); - fd_vinyl_key_init( req_key, key, 32UL ); - ulong val_max = sizeof(fd_account_meta_t) + 32UL; - *req_val_gaddr = val_max; - memset( comp, 0, sizeof(fd_vinyl_comp_t) ); - fd_vinyl_req_send_batch( - accdb->vinyl_rq, - accdb->vinyl_req_pool, - accdb->vinyl_req_wksp, - accdb->vinyl_req_id++, - accdb->vinyl_link_id, - FD_VINYL_REQ_TYPE_ACQUIRE, - FD_VINYL_REQ_FLAG_MODIFY | FD_VINYL_REQ_FLAG_CREATE | FD_VINYL_REQ_FLAG_EXCL, - batch_idx, - 1UL /* batch_cnt */ - ); - while( FD_VOLATILE_CONST( comp->seq )!=1UL ) FD_SPIN_PAUSE(); - FD_COMPILER_MFENCE(); - int comp_err = FD_VOLATILE_CONST( comp->err ); - if( FD_UNLIKELY( comp_err!=FD_VINYL_SUCCESS ) ) { - FD_LOG_CRIT(( "vinyl tile rejected my ACQUIRE request: %i-%s", comp_err, fd_vinyl_strerror( comp_err ) )); - } - int err = FD_VOLATILE_CONST( req_err[0] ); - if( FD_UNLIKELY( err!=FD_VINYL_SUCCESS ) ) { - FD_LOG_CRIT(( "vinyl tile ACQUIRE request failed: %i-%s", err, fd_vinyl_strerror( err ) )); - } - - ulong val_gaddr = FD_VOLATILE_CONST( req_val_gaddr[0] ); - void * val = fd_wksp_laddr_fast( accdb->vinyl_data_wksp, val_gaddr ); - fd_vinyl_info_t * info = fd_vinyl_data_info( val ); - fd_account_meta_t * meta = val; - uchar * data = (uchar *)( meta+1 ); - - memset( meta, 0, val_max ); - meta->lamports = lamports; - meta->dlen = 32U; - memcpy( data, 
key, 32UL ); - info->val_sz = (uint)val_max; - - /* Finish write */ - memset( comp, 0, sizeof(fd_vinyl_comp_t) ); - req_val_gaddr[0] = val_gaddr; - fd_vinyl_req_send_batch( - accdb->vinyl_rq, - accdb->vinyl_req_pool, - accdb->vinyl_req_wksp, - accdb->vinyl_req_id++, - accdb->vinyl_link_id, - FD_VINYL_REQ_TYPE_RELEASE, - FD_VINYL_REQ_FLAG_MODIFY, - batch_idx, - 1UL /* batch_cnt */ - ); - while( FD_VOLATILE_CONST( comp->seq )!=1UL ) FD_SPIN_PAUSE(); - FD_COMPILER_MFENCE(); - comp_err = FD_VOLATILE_CONST( comp->err ); - if( FD_UNLIKELY( comp_err!=FD_VINYL_SUCCESS ) ) { - FD_LOG_CRIT(( "vinyl tile rejected my RELEASE request: %i-%s", comp_err, fd_vinyl_strerror( comp_err ) )); - } - err = FD_VOLATILE_CONST( req_err[0] ); - if( FD_UNLIKELY( err!=FD_VINYL_SUCCESS ) ) { - FD_LOG_CRIT(( "vinyl tile RELEASE request failed: %i-%s", err, fd_vinyl_strerror( err ) )); - } - - fd_vinyl_req_pool_release( accdb->vinyl_req_pool, batch_idx ); -} - -static void -add_account_funk( fd_accdb_user_t * accdb_, - uchar const * key, - ulong lamports ) { - fd_accdb_user_v2_t * accdb = (fd_accdb_user_v2_t *)accdb_; - fd_funk_t * funk = accdb->funk; - - fd_funk_rec_map_t * rec_map = funk->rec_map; - fd_funk_rec_pool_t * rec_pool = funk->rec_pool; - - fd_funk_rec_t * rec = fd_funk_rec_pool_acquire( rec_pool, NULL, 1, NULL ); - FD_TEST( rec ); - ulong rec_idx = (ulong)( rec - rec_pool->ele ); - *rec = (fd_funk_rec_t) { - .next_idx = UINT_MAX, - .prev_idx = UINT_MAX - }; - accdb->funk->rec_lock[ rec_idx ] = fd_funk_rec_ver_lock( 1UL, 0UL ); - fd_funk_txn_xid_set_root( rec->pair.xid ); - memcpy( rec->pair.key->uc, key, 32UL ); - - ulong val_sz = sizeof(fd_account_meta_t) + 32UL; - fd_account_meta_t * meta = fd_funk_val_truncate( rec, funk->alloc, funk->wksp, 16UL, val_sz, NULL ); - FD_TEST( meta ); memset( meta, 0, val_sz ); - uchar * data = (uchar *)( meta+1 ); - - meta->lamports = lamports; - meta->dlen = 32U; - memcpy( data, key, 32UL ); - - FD_TEST( fd_funk_rec_map_insert( rec_map, rec, 0 
)==FD_MAP_SUCCESS ); -} - -static fd_funk_rec_t * -ref_funk_rec( fd_accdb_ref_t const * ref ) { - return (fd_funk_rec_t *)ref->user_data; -} - -static ulong -ref_ver_lock( fd_funk_t const * funk, - fd_funk_rec_t const * rec ) { - ulong rec_idx = (ulong)( rec - funk->rec_pool->ele ); - return funk->rec_lock[ rec_idx ]; -} - -static void -test_account_creation( fd_accdb_user_t * accdb, - fd_funk_txn_xid_t const * xid2, - void const * addr, - ulong lamports ) { - fd_accdb_rw_t rw[1]; - fd_accdb_ro_t ro[1]; - fd_funk_t * funk = ((fd_accdb_user_v2_t *)accdb)->funk; - FD_TEST( accdb->base.ro_active==0 && accdb->base.rw_active==0 ); - - fd_funk_rec_t * rec; - - FD_TEST( fd_accdb_open_rw( accdb, rw, xid2, addr, 16UL, FD_ACCDB_FLAG_CREATE ) ); - FD_TEST( accdb->base.ro_active==0 && accdb->base.rw_active==1 ); - rec = ref_funk_rec( rw->ref ); - FD_TEST( fd_funk_rec_ver_alive( fd_funk_rec_ver_bits( ref_ver_lock( funk, rec ) ) )==1 ); - FD_TEST( fd_funk_rec_lock_bits( ref_ver_lock( funk, rec ) )==FD_FUNK_REC_LOCK_MASK ); /* write locked */ - fd_accdb_ref_lamports_set( rw, lamports ); - fd_accdb_close_rw( accdb, rw ); - FD_TEST( fd_funk_rec_ver_alive( fd_funk_rec_ver_bits( ref_ver_lock( funk, rec ) ) )==1 ); - FD_TEST( fd_funk_rec_lock_bits( ref_ver_lock( funk, rec ) )==0 ); - FD_TEST( accdb->base.ro_active==0 && accdb->base.rw_active==0 ); - - FD_TEST( fd_accdb_open_ro( accdb, ro, xid2, addr ) ); - FD_TEST( accdb->base.ro_active==1 && accdb->base.rw_active==0 ); - rec = ref_funk_rec( ro->ref ); - FD_TEST( fd_funk_rec_ver_alive( fd_funk_rec_ver_bits( ref_ver_lock( funk, rec ) ) )==1 ); - FD_TEST( fd_funk_rec_lock_bits( ref_ver_lock( funk, rec ) )==1UL ); /* read lock */ - FD_TEST( fd_accdb_ref_lamports( ro )==lamports ); - fd_accdb_close_ro( accdb, ro ); - FD_TEST( fd_funk_rec_ver_alive( fd_funk_rec_ver_bits( ref_ver_lock( funk, rec ) ) )==1 ); - FD_TEST( fd_funk_rec_lock_bits( ref_ver_lock( funk, rec ) )==0 ); - FD_TEST( accdb->base.ro_active==0 && accdb->base.rw_active==0 ); 
- - FD_TEST( fd_accdb_open_rw( accdb, rw, xid2, addr, 16UL, 0 ) ); - rec = ref_funk_rec( rw->ref ); - FD_TEST( fd_funk_rec_ver_alive( fd_funk_rec_ver_bits( ref_ver_lock( funk, rec ) ) )==1 ); - FD_TEST( fd_funk_rec_lock_bits( ref_ver_lock( funk, rec ) )==FD_FUNK_REC_LOCK_MASK ); /* write locked */ - fd_accdb_ref_lamports_set( rw, 0UL ); /* delete */ - fd_accdb_close_rw( accdb, rw ); - FD_TEST( fd_funk_rec_ver_alive( fd_funk_rec_ver_bits( ref_ver_lock( funk, rec ) ) )==1 ); - FD_TEST( fd_funk_rec_lock_bits( ref_ver_lock( funk, rec ) )==0UL ); - - FD_TEST( !fd_accdb_open_rw( accdb, rw, xid2, addr, 16UL, 0 ) ); - FD_TEST( accdb->base.ro_active==0 && accdb->base.rw_active==0 ); - - FD_TEST( !fd_accdb_open_ro( accdb, ro, xid2, addr ) ); - FD_TEST( accdb->base.ro_active==0 && accdb->base.rw_active==0 ); -} - - -/* test_truncate verifies open_rw behavior with the TRUNCATE flag set. - - test_truncate_create: Account does not exist, create new (flags+=CREATE) - test_truncate_nonexist: Account does not exist, return NULL - test_truncate_inplace: Account exists and is mutable, truncate in-place - test_truncate_copy: Account exists and is immutable, create new and copy meta */ - -static void -test_truncate_create( fd_accdb_admin_t * admin, - fd_accdb_user_t * accdb ) { - fd_funk_txn_xid_t root = fd_accdb_root_get( admin ); - fd_funk_txn_xid_t xid = { .ul={ 1UL, 0UL } }; - fd_accdb_attach_child( admin, &root, &xid ); - - fd_funk_rec_key_t key = { .ul={ 42UL } }; - fd_accdb_rw_t rw[1]; - FD_TEST( fd_accdb_open_rw( accdb, rw, &xid, &key, 56UL, FD_ACCDB_FLAG_CREATE|FD_ACCDB_FLAG_TRUNCATE ) ); - FD_TEST( rw->ref->ref_type==FD_ACCDB_REF_RW ); - fd_funk_rec_t * rec = (void *)rw->ref->user_data; - FD_TEST( rec->val_sz == sizeof(fd_account_meta_t) ); - FD_TEST( rec->val_max >= sizeof(fd_account_meta_t)+56UL ); - FD_TEST( rw->meta->dlen == 0UL ); - fd_accdb_close_rw( accdb, rw ); - - fd_accdb_cancel( admin, &xid ); -} - -static void -test_truncate_nonexist( fd_accdb_admin_t * admin, - 
fd_accdb_user_t * accdb ) { - fd_funk_txn_xid_t root = fd_accdb_root_get( admin ); - fd_funk_txn_xid_t xid = { .ul={ 2UL, 0UL } }; - fd_accdb_attach_child( admin, &root, &xid ); - - fd_funk_rec_key_t key = { .ul={ 42UL } }; - fd_accdb_rw_t rw[1]; - FD_TEST( !fd_accdb_open_rw( accdb, rw, &xid, &key, 42UL, FD_ACCDB_FLAG_TRUNCATE ) ); - - fd_accdb_close_rw( accdb, rw ); -} - -static void -test_truncate_inplace( fd_accdb_admin_t * admin, - fd_accdb_user_t * accdb ) { - fd_funk_txn_xid_t root = fd_accdb_root_get( admin ); - fd_funk_txn_xid_t xid = { .ul={ 3UL, 0UL } }; - fd_accdb_attach_child( admin, &root, &xid ); - - fd_funk_rec_key_t key = { .ul={ 42UL } }; - fd_accdb_rw_t rw[1]; - ulong data_sz_0 = 56UL; - FD_TEST( fd_accdb_open_rw( accdb, rw, &xid, &key, data_sz_0, FD_ACCDB_FLAG_CREATE ) ); - FD_TEST( rw->ref->ref_type==FD_ACCDB_REF_RW ); - fd_accdb_ref_lamports_set( rw, 32UL ); - fd_accdb_ref_data_set( accdb, rw, "hello", 5UL ); - fd_funk_rec_t * rec = (void *)rw->ref->user_data; - FD_TEST( rec->val_sz == sizeof(fd_account_meta_t)+5UL ); - FD_TEST( rec->val_max >= sizeof(fd_account_meta_t)+data_sz_0 ); - FD_TEST( rw->meta->dlen == 5UL ); - fd_accdb_close_rw( accdb, rw ); - - ulong data_sz_1 = 256UL; - FD_TEST( fd_accdb_open_rw( accdb, rw, &xid, &key, data_sz_1, FD_ACCDB_FLAG_TRUNCATE ) ); - FD_TEST( rw->ref->ref_type==FD_ACCDB_REF_RW ); - rec = (void *)rw->ref->user_data; - FD_TEST( rec->val_sz == sizeof(fd_account_meta_t) ); - FD_TEST( rec->val_max >= sizeof(fd_account_meta_t)+data_sz_1 ); - FD_TEST( rw->meta->dlen == 0UL ); - fd_accdb_close_rw( accdb, rw ); - - fd_accdb_close_rw( accdb, rw ); -} - -static void -test_truncate_copy( fd_accdb_admin_t * admin, - fd_accdb_user_t * accdb ) { - fd_funk_txn_xid_t root = fd_accdb_root_get( admin ); - fd_funk_txn_xid_t xid1 = { .ul={ 4UL, 0UL } }; - fd_accdb_attach_child( admin, &root, &xid1 ); - - fd_funk_rec_key_t key = { .ul={ 42UL } }; - fd_accdb_rw_t rw[1]; - FD_TEST( fd_accdb_open_rw( accdb, rw, &xid1, &key, 56UL, 
FD_ACCDB_FLAG_CREATE ) ); - FD_TEST( rw->ref->ref_type==FD_ACCDB_REF_RW ); - fd_accdb_ref_lamports_set( rw, 32UL ); - fd_accdb_ref_data_set( accdb, rw, "hello", 5UL ); - fd_funk_rec_t * rec = (void *)rw->ref->user_data; - FD_TEST( rec->val_sz == sizeof(fd_account_meta_t)+5UL ); - FD_TEST( rec->val_max >= sizeof(fd_account_meta_t)+56UL ); - FD_TEST( rw->meta->dlen == 5UL ); - fd_accdb_close_rw( accdb, rw ); - - fd_funk_txn_xid_t xid2 = { .ul={ 5UL, 0UL } }; - fd_accdb_attach_child( admin, &xid1, &xid2 ); - FD_TEST( fd_accdb_open_rw( accdb, rw, &xid2, &key, 256UL, FD_ACCDB_FLAG_TRUNCATE ) ); - FD_TEST( rw->ref->ref_type==FD_ACCDB_REF_RW ); - rec = (void *)rw->ref->user_data; - FD_TEST( rec->val_sz == sizeof(fd_account_meta_t) ); - FD_TEST( rec->val_max >= sizeof(fd_account_meta_t)+256UL ); - FD_TEST( rw->meta->dlen == 0UL ); - fd_accdb_close_rw( accdb, rw ); - - fd_accdb_cancel( admin, &xid2 ); - fd_accdb_cancel( admin, &xid1 ); -} - -static void -run_tests( fd_accdb_user_t * accdb ) { - fd_accdb_user_v2_t * v2 = (fd_accdb_user_v2_t *)accdb; - fd_vinyl_req_pool_t * req_pool = v2->vinyl_req_pool; - FD_TEST( accdb->base.ro_active==0UL ); - - add_account_vinyl( accdb, s_key_a, 10000UL ); - add_account_vinyl( accdb, s_key_b, 0UL ); - add_account_vinyl( accdb, s_key_d, 40000UL ); - add_account_funk ( accdb, s_key_c, 20000UL ); - add_account_funk ( accdb, s_key_d, 0UL ); - - fd_funk_txn_xid_t xid[1]; fd_funk_txn_xid_set_root( xid ); - fd_accdb_ro_t ro[1]; - - FD_TEST( fd_accdb_open_ro( accdb, ro, xid, s_key_a ) ); - FD_TEST( ro->ref->accdb_type==FD_ACCDB_TYPE_V2 ); - FD_TEST( ro->ref->ref_type==FD_ACCDB_REF_RO ); - FD_TEST( accdb->base.ro_active==1UL ); - FD_TEST( fd_accdb_ref_lamports( ro )==10000UL ); - fd_accdb_close_ro( accdb, ro ); - FD_TEST( accdb->base.ro_active==0UL ); - FD_TEST( req_pool->free_cnt==2UL ); - - FD_TEST( !fd_accdb_open_ro( accdb, ro, xid, s_key_b ) ); - - FD_TEST( fd_accdb_open_ro( accdb, ro, xid, s_key_c ) ); - fd_funk_rec_t * rec = ref_funk_rec( 
ro->ref ); - FD_TEST( ref_ver_lock( v2->funk, rec )==fd_funk_rec_ver_lock( 1UL, 1UL ) ); - FD_TEST( accdb->base.ro_active==1UL ); - FD_TEST( ro->ref->accdb_type==FD_ACCDB_TYPE_V1 ); - FD_TEST( ro->ref->ref_type==FD_ACCDB_REF_RO ); - FD_TEST( fd_accdb_ref_lamports( ro )==20000UL ); - fd_accdb_close_ro( accdb, ro ); - FD_TEST( ref_ver_lock( v2->funk, rec )==fd_funk_rec_ver_lock( 1UL, 0UL ) ); - FD_TEST( accdb->base.ro_active==0UL ); - FD_TEST( req_pool->free_cnt==2UL ); - - FD_TEST( !fd_accdb_open_ro( accdb, ro, xid, s_key_d ) ); - - FD_TEST( !fd_accdb_open_ro( accdb, ro, xid, s_key_e ) ); - - /* Test ro_pipe API */ - - fd_accdb_ro_t * ro_tmp; - fd_accdb_ro_pipe_t pipe[1]; - FD_TEST( fd_accdb_ro_pipe_init( pipe, accdb, xid ) ); - FD_TEST( pipe->req_cnt==0UL ); - FD_TEST( pipe->req_max==4UL ); - FD_TEST( req_pool->free_cnt==2UL ); - - /* first batch: d, b, c, e */ - fd_accdb_ro_pipe_enqueue( pipe, s_key_d ); - FD_TEST( req_pool->free_cnt==2UL ); - FD_TEST( pipe->req_cnt==1UL ); - FD_TEST( !fd_accdb_ro_pipe_poll( pipe ) ); - fd_accdb_ro_pipe_enqueue( pipe, s_key_b ); - FD_TEST( !fd_accdb_ro_pipe_poll( pipe ) ); - fd_accdb_ro_pipe_enqueue( pipe, s_key_c ); - FD_TEST( !fd_accdb_ro_pipe_poll( pipe ) ); - fd_accdb_ro_pipe_enqueue( pipe, s_key_e ); - FD_TEST( req_pool->free_cnt==2UL ); - - /* result for d */ - FD_TEST( (ro_tmp = fd_accdb_ro_pipe_poll( pipe )) ); - FD_TEST( ro_tmp->ref->ref_type==FD_ACCDB_REF_RO ); - FD_TEST( ro_tmp->ref->accdb_type==FD_ACCDB_TYPE_NONE ); - FD_TEST( 0==memcmp( fd_accdb_ref_address( ro_tmp ), s_key_d, 32UL ) ); - FD_TEST( ro_tmp->meta->lamports==0UL ); - FD_TEST( accdb->base.ro_active==3UL ); - - /* result for b (tombstone) */ - FD_TEST( (ro_tmp = fd_accdb_ro_pipe_poll( pipe )) ); - FD_TEST( ro_tmp->ref->ref_type==FD_ACCDB_REF_RO ); - FD_TEST( ro_tmp->ref->accdb_type==FD_ACCDB_TYPE_NONE ); - FD_TEST( 0==memcmp( fd_accdb_ref_address( ro_tmp ), s_key_b, 32UL ) ); - FD_TEST( ro_tmp->meta->lamports==0UL ); - - /* result for c */ - FD_TEST( 
(ro_tmp = fd_accdb_ro_pipe_poll( pipe )) ); - FD_TEST( ro_tmp->ref->ref_type==FD_ACCDB_REF_RO ); - FD_TEST( ro_tmp->ref->accdb_type==FD_ACCDB_TYPE_V1 ); - FD_TEST( 0==memcmp( fd_accdb_ref_address( ro_tmp ), s_key_c, 32UL ) ); - FD_TEST( ro_tmp->meta->lamports==20000UL ); - - /* result for e (tombstone) */ - FD_TEST( (ro_tmp = fd_accdb_ro_pipe_poll( pipe )) ); - FD_TEST( ro_tmp->ref->accdb_type==FD_ACCDB_TYPE_NONE ); - FD_TEST( 0==memcmp( fd_accdb_ref_address( ro_tmp ), s_key_e, 32UL ) ); - FD_TEST( ro_tmp->meta->lamports==0UL ); - FD_TEST( accdb->base.ro_active==3UL ); - FD_TEST( !fd_accdb_ro_pipe_poll( pipe ) ); - FD_TEST( accdb->base.ro_active==0UL ); - - /* result for a */ - fd_accdb_ro_pipe_enqueue( pipe, s_key_a ); - FD_TEST( !fd_accdb_ro_pipe_poll( pipe ) ); - fd_accdb_ro_pipe_flush( pipe ); - FD_TEST( (ro_tmp = fd_accdb_ro_pipe_poll( pipe )) ); - FD_TEST( ro_tmp->ref->accdb_type==FD_ACCDB_TYPE_V2 ); - FD_TEST( 0==memcmp( fd_accdb_ref_address( ro_tmp ), s_key_a, 32UL ) ); - FD_TEST( ro_tmp->meta->lamports==10000UL ); - FD_TEST( accdb->base.ro_active==1UL ); - FD_TEST( !fd_accdb_ro_pipe_poll( pipe ) ); - FD_TEST( accdb->base.ro_active==0UL ); - - fd_accdb_ro_pipe_fini( pipe ); - - fd_accdb_rw_t rw[1]; - fd_funk_txn_xid_t xid2[1] = {{ .ul={ 1UL, 2UL } }}; - fd_accdb_admin_t admin[1]; - fd_accdb_admin_v1_init( admin, v2->funk->shmem, (void *)v2->funk->txn_lock ); - fd_accdb_attach_child( admin, xid, xid2 ); - FD_TEST( accdb->base.ro_active==0 && accdb->base.rw_active==0 ); - - /* vinyl tombstone */ - FD_TEST( !fd_accdb_open_rw( accdb, rw, xid2, s_key_b, 16UL, 0 ) ); - FD_TEST( accdb->base.ro_active==0 && accdb->base.rw_active==0 ); - test_account_creation( accdb, xid2, s_key_b, 1UL ); - - /* funk tombstone, vinyl exist */ - FD_TEST( !fd_accdb_open_rw( accdb, rw, xid2, s_key_d, 16UL, 0 ) ); - FD_TEST( accdb->base.ro_active==0 && accdb->base.rw_active==0 ); - test_account_creation( accdb, xid2, s_key_d, 2UL ); - - /* missing account */ - FD_TEST( 
!fd_accdb_open_rw( accdb, rw, xid2, s_key_e, 16UL, 0 ) ); - FD_TEST( accdb->base.ro_active==0 && accdb->base.rw_active==0 ); - test_account_creation( accdb, xid2, s_key_e, 4UL ); - - /* repeatedly delete and recreate the same account */ - for( ulong i=0UL; i<1024UL; i++ ) { - test_account_creation( accdb, xid2, s_key_e, 4UL ); - } - - fd_accdb_cancel( admin, xid2 ); - - /* Test truncate */ - - test_truncate_create ( admin, accdb ); - test_truncate_nonexist( admin, accdb ); - test_truncate_inplace ( admin, accdb ); - test_truncate_copy ( admin, accdb ); - - /* Open vinyl record as writable */ - - xid2->ul[1]++; - fd_accdb_attach_child( admin, xid, xid2 ); - FD_TEST( fd_accdb_open_rw( accdb, rw, xid2, s_key_a, 0UL, 0 ) ); - FD_TEST( accdb->base.ro_active==0 && accdb->base.rw_active==1 ); - rec = ref_funk_rec( rw->ref ); - FD_TEST( fd_funk_rec_ver_alive( fd_funk_rec_ver_bits( ref_ver_lock( v2->funk, rec ) ) )==1 ); - FD_TEST( fd_accdb_ref_data_sz( rw->ro )==32UL ); - FD_TEST( 0==memcmp( fd_accdb_ref_data_const( rw->ro ), s_key_a, 32UL ) ); - fd_accdb_close_rw( accdb, rw ); - FD_TEST( accdb->base.ro_active==0 && accdb->base.rw_active==0 ); - FD_TEST( fd_funk_rec_ver_alive( fd_funk_rec_ver_bits( ref_ver_lock( v2->funk, rec ) ) )==1 ); - fd_accdb_cancel( admin, xid2 ); - FD_TEST( fd_funk_rec_ver_alive( fd_funk_rec_ver_bits( ref_ver_lock( v2->funk, rec ) ) )==0 ); - - /* Open vinyl record as writable (truncate) */ - - xid2->ul[1]++; - fd_accdb_attach_child( admin, xid, xid2 ); - FD_TEST( fd_accdb_open_rw( accdb, rw, xid2, s_key_a, 0UL, FD_ACCDB_FLAG_TRUNCATE ) ); - FD_TEST( accdb->base.ro_active==0 && accdb->base.rw_active==1 ); - rec = ref_funk_rec( rw->ref ); - FD_TEST( fd_funk_rec_ver_alive( fd_funk_rec_ver_bits( ref_ver_lock( v2->funk, rec ) ) )==1 ); - FD_TEST( fd_accdb_ref_data_sz( rw->ro )==0UL ); - fd_accdb_close_rw( accdb, rw ); - FD_TEST( accdb->base.ro_active==0 && accdb->base.rw_active==0 ); - FD_TEST( fd_funk_rec_ver_alive( fd_funk_rec_ver_bits( ref_ver_lock( 
v2->funk, rec ) ) )==1 ); - fd_accdb_cancel( admin, xid2 ); - FD_TEST( fd_funk_rec_ver_alive( fd_funk_rec_ver_bits( ref_ver_lock( v2->funk, rec ) ) )==0 ); - - fd_accdb_admin_fini( admin ); -} - -int -main( int argc, - char ** argv ) { - fd_boot( &argc, &argv ); - if( FD_UNLIKELY( fd_tile_cnt() < 2UL ) ) { - FD_LOG_ERR(( "This test requires at least 2 tiles (use --tile-cpus to configure)" )); - } - - char const * _wksp = fd_env_strip_cmdline_cstr ( &argc, &argv, "--wksp", NULL, NULL ); - char const * _page_sz = fd_env_strip_cmdline_cstr ( &argc, &argv, "--page-sz", NULL, "gigantic" ); - ulong page_cnt = fd_env_strip_cmdline_ulong( &argc, &argv, "--page-cnt", NULL, 8UL ); - ulong near_cpu = fd_env_strip_cmdline_ulong( &argc, &argv, "--near-cpu", NULL, fd_log_cpu_id() ); - ulong tag = fd_env_strip_cmdline_ulong( &argc, &argv, "--tag", NULL, WKSP_TAG ); - - /* Vinyl I/O parameters */ - ulong spad_max = fd_env_strip_cmdline_ulong( &argc, &argv, "--spad-max", NULL, fd_vinyl_io_spad_est() ); - ulong dev_sz = fd_env_strip_cmdline_ulong( &argc, &argv, "--dev-sz", NULL, 1UL << 30 ); - ulong io_seed = fd_env_strip_cmdline_ulong( &argc, &argv, "--io-seed", NULL, 1234UL ); - - /* Vinyl cache parameters */ - ulong line_cnt = fd_env_strip_cmdline_ulong( &argc, &argv, "--line-cnt", NULL, 7UL ); - ulong ele_max = fd_env_strip_cmdline_ulong( &argc, &argv, "--ele-max", NULL, 8UL ); - ulong lock_cnt = fd_env_strip_cmdline_ulong( &argc, &argv, "--lock-cnt", NULL, 8UL ); - ulong probe_max = ele_max; - ulong seed = fd_env_strip_cmdline_ulong( &argc, &argv, "--seed", NULL, 5678UL ); - ulong obj_sz = fd_env_strip_cmdline_ulong( &argc, &argv, "--obj-sz", NULL, 6UL << 30 ); - - /* Vinyl runtime parameters */ - ulong async_min = fd_env_strip_cmdline_ulong( &argc, &argv, "--async-min", NULL, 5UL ); - ulong async_max = fd_env_strip_cmdline_ulong( &argc, &argv, "--async-max", NULL, 2UL*async_min ); - ulong part_thresh = fd_env_strip_cmdline_ulong( &argc, &argv, "--part-thresh", NULL, 64UL << 20 
); - ulong gc_thresh = fd_env_strip_cmdline_ulong( &argc, &argv, "--gc-thresh", NULL, 128UL << 20 ); - int gc_eager = fd_env_strip_cmdline_int ( &argc, &argv, "--gc-eager", NULL, 2 ); - char const * _style = fd_env_strip_cmdline_cstr ( &argc, &argv, "--style", NULL, "lz4" ); - int level = fd_env_strip_cmdline_int ( &argc, &argv, "--level", NULL, 0 ); - - /* Vinyl client parameters */ - ulong rq_max = fd_env_strip_cmdline_ulong( &argc, &argv, "--rq-max", NULL, 32UL ); - ulong cq_max = fd_env_strip_cmdline_ulong( &argc, &argv, "--cq-max", NULL, 32UL ); - ulong link_id = fd_env_strip_cmdline_ulong( &argc, &argv, "--link-id", NULL, 2345UL ); - ulong burst_max = fd_env_strip_cmdline_ulong( &argc, &argv, "--burst-max", NULL, 1UL ); - ulong quota_max = fd_env_strip_cmdline_ulong( &argc, &argv, "--quota-max", NULL, 4UL ); - - /* Funk (in-memory DB) parameters */ - ulong txn_max = fd_env_strip_cmdline_ulong( &argc, &argv, "--txn-max", NULL, 32UL ); - ulong rec_max = fd_env_strip_cmdline_ulong( &argc, &argv, "--rec-max", NULL, 512UL ); - - int style = fd_cstr_to_vinyl_bstream_ctl_style( _style ); - - FD_LOG_NOTICE(( "Setting up workspace" )); - - fd_wksp_t * wksp; - if( _wksp ) { - FD_LOG_NOTICE(( "Attaching to --wksp %s", _wksp )); - wksp = fd_wksp_attach( _wksp ); - } else { - FD_LOG_NOTICE(( "--wksp not specified, using an anonymous local workspace (--page-sz %s --page-cnt %lu --near-cpu %lu)", - _page_sz, page_cnt, near_cpu )); - wksp = fd_wksp_new_anonymous( fd_cstr_to_shmem_page_sz( _page_sz ), page_cnt, near_cpu, "wksp", 0UL ); - } - FD_TEST( wksp ); - - ulong io_footprint = fd_vinyl_io_mm_footprint( spad_max ); FD_TEST( io_footprint ); - ulong dev_footprint = fd_ulong_align_dn( dev_sz, FD_VINYL_BSTREAM_BLOCK_SZ ); FD_TEST( dev_footprint ); - ulong vinyl_footprint = fd_vinyl_footprint(); FD_TEST( vinyl_footprint ); - ulong cnc_footprint = fd_cnc_footprint( FD_VINYL_CNC_APP_SZ ); FD_TEST( cnc_footprint ); - ulong meta_footprint = fd_vinyl_meta_footprint( ele_max, 
lock_cnt, probe_max ); FD_TEST( meta_footprint ); - ulong line_footprint = sizeof(fd_vinyl_line_t) * line_cnt; FD_TEST( line_footprint ); - ulong ele_footprint = sizeof(fd_vinyl_meta_ele_t) * ele_max; FD_TEST( ele_footprint ); - ulong obj_footprint = fd_ulong_align_dn( obj_sz, alignof(fd_vinyl_data_obj_t) ); FD_TEST( obj_footprint ); - ulong rq_footprint = fd_vinyl_rq_footprint( rq_max ); FD_TEST( rq_footprint ); - ulong cq_footprint = fd_vinyl_cq_footprint( cq_max ); FD_TEST( cq_footprint ); - - void * _io = fd_wksp_alloc_laddr( wksp, fd_vinyl_io_mm_align(), io_footprint, tag ); FD_TEST( _io ); - void * _dev = fd_wksp_alloc_laddr( wksp, FD_VINYL_BSTREAM_BLOCK_SZ, dev_footprint, tag ); FD_TEST( _dev ); - void * _vinyl = fd_wksp_alloc_laddr( wksp, fd_vinyl_align(), vinyl_footprint, tag ); FD_TEST( _vinyl ); - void * _cnc = fd_wksp_alloc_laddr( wksp, fd_cnc_align(), cnc_footprint, tag ); FD_TEST( _cnc ); - void * _meta = fd_wksp_alloc_laddr( wksp, fd_vinyl_meta_align(), meta_footprint, tag ); FD_TEST( _meta ); - void * _line = fd_wksp_alloc_laddr( wksp, alignof(fd_vinyl_line_t), line_footprint, tag ); FD_TEST( _line ); - void * _ele = fd_wksp_alloc_laddr( wksp, alignof(fd_vinyl_meta_ele_t), ele_footprint, tag ); FD_TEST( _ele ); - void * _obj = fd_wksp_alloc_laddr( wksp, alignof(fd_vinyl_data_obj_t), obj_footprint, tag ); FD_TEST( _obj ); - void * _rq = fd_wksp_alloc_laddr( wksp, fd_vinyl_rq_align(), rq_footprint, tag ); FD_TEST( _rq ); - void * _cq = fd_wksp_alloc_laddr( wksp, fd_vinyl_cq_align(), cq_footprint, tag ); FD_TEST( _cq ); - - fd_vinyl_io_t * io = fd_vinyl_io_mm_init( _io, spad_max, _dev, dev_footprint, 1, "test", 5UL, io_seed ); - FD_TEST( io ); - - fd_vinyl_t * vinyl = fd_vinyl_init( NULL, 0UL, 0UL, level, _vinyl, - _cnc, cnc_footprint, - _meta, meta_footprint, - _line, line_footprint, - _ele, ele_footprint, - _obj, obj_footprint, - io, seed, wksp, async_min, async_max, - part_thresh, gc_thresh, gc_eager, style ); - - FD_TEST( vinyl ); - - 
FD_LOG_NOTICE(( "Vinyl booting" )); - - fd_tile_exec_t * exec = fd_tile_exec_new( 1UL, fd_vinyl_tile, 0, (char **)vinyl ); - FD_TEST( exec ); - - fd_vinyl_rq_t * rq = fd_vinyl_rq_join( fd_vinyl_rq_new( _rq, rq_max ) ); FD_TEST( rq ); - fd_vinyl_cq_t * cq = fd_vinyl_cq_join( fd_vinyl_cq_new( _cq, cq_max ) ); FD_TEST( cq ); - - fd_cnc_t * cnc = fd_cnc_join( _cnc ); FD_TEST( cnc ); - FD_TEST( fd_cnc_wait( cnc, FD_VINYL_CNC_SIGNAL_BOOT, (long)5e9, NULL )==FD_VINYL_CNC_SIGNAL_RUN ); - - FD_LOG_NOTICE(( "Vinyl running" )); - - ulong funk_seed = 9876UL; - ulong funk_footprint = fd_funk_shmem_footprint( txn_max, rec_max ); - ulong lock_footprint = fd_funk_locks_footprint( txn_max, rec_max ); - void * shfunk = fd_wksp_alloc_laddr( wksp, fd_funk_align(), funk_footprint, tag ); - void * shlocks = fd_wksp_alloc_laddr( wksp, fd_funk_align(), lock_footprint, tag ); - FD_TEST( shfunk ); - FD_TEST( shlocks ); - FD_TEST( fd_funk_shmem_new( shfunk, tag, funk_seed, txn_max, rec_max ) ); - FD_TEST( fd_funk_locks_new( shlocks, txn_max, rec_max ) ); - - ulong req_pool_footprint = fd_vinyl_req_pool_footprint( 2UL, 4UL ); - FD_TEST( req_pool_footprint ); - void * _req_pool = fd_wksp_alloc_laddr( wksp, fd_vinyl_req_pool_align(), req_pool_footprint, tag ); - FD_TEST( _req_pool ); - void * req_pool = fd_vinyl_req_pool_new( _req_pool, 2UL, 4UL ); - FD_TEST( req_pool ); - - FD_LOG_NOTICE(( "Connecting client to vinyl" )); - - FD_TEST( !fd_vinyl_client_join( cnc, rq, cq, wksp, link_id, burst_max, quota_max ) ); - - fd_accdb_user_t accdb[1]; - FD_TEST( fd_accdb_user_v2_init( accdb, shfunk, shlocks, _rq, wksp, req_pool, link_id, txn_max ) ); - FD_TEST( accdb->base.accdb_type == FD_ACCDB_TYPE_V2 ); - - FD_LOG_NOTICE(( "Running tests" )); - - run_tests( accdb ); - - FD_LOG_NOTICE(( "Cleaning up" )); - - fd_accdb_admin_t admin[1]; - FD_TEST( fd_accdb_admin_v1_init( admin, shfunk, shlocks ) ); - fd_accdb_v1_clear( admin ); - fd_accdb_admin_fini( admin ); - - fd_accdb_user_fini( accdb ); - - FD_TEST( 
!fd_vinyl_client_leave( cnc, link_id ) ); - - FD_LOG_NOTICE(( "Vinyl stopping" )); - FD_TEST( !fd_vinyl_halt( cnc ) ); - FD_TEST( fd_cnc_leave( cnc )==_cnc ); - - fd_tile_exec_delete( exec, NULL ); - - FD_TEST( fd_vinyl_cq_delete( fd_vinyl_cq_leave( cq ) )==_cq ); - FD_TEST( fd_vinyl_rq_delete( fd_vinyl_rq_leave( rq ) )==_rq ); - - FD_TEST( fd_vinyl_fini( vinyl )==_vinyl ); - FD_TEST( fd_vinyl_io_fini( io )==_io ); - - fd_wksp_free_laddr( fd_vinyl_req_pool_delete( req_pool ) ); - fd_wksp_free_laddr( shlocks ); - fd_wksp_free_laddr( fd_funk_delete( shfunk ) ); - fd_wksp_free_laddr( _cq ); - fd_wksp_free_laddr( _rq ); - fd_wksp_free_laddr( _obj ); - fd_wksp_free_laddr( _ele ); - fd_wksp_free_laddr( _line ); - fd_wksp_free_laddr( _meta ); - fd_wksp_free_laddr( _cnc ); - fd_wksp_free_laddr( _vinyl ); - fd_wksp_free_laddr( _dev ); - fd_wksp_free_laddr( _io ); - - fd_wksp_usage_t wksp_usage; - FD_TEST( fd_wksp_usage( wksp, NULL, 0UL, &wksp_usage ) ); - FD_TEST( wksp_usage.free_cnt==wksp_usage.total_cnt ); - - if( _wksp ) fd_wksp_detach( wksp ); - else fd_wksp_delete_anonymous( wksp ); - - FD_LOG_NOTICE(( "pass" )); - fd_halt(); - return 0; -} diff --git a/src/flamenco/runtime/fd_executor.c b/src/flamenco/runtime/fd_executor.c index 96341be68f8..f489339bdf0 100644 --- a/src/flamenco/runtime/fd_executor.c +++ b/src/flamenco/runtime/fd_executor.c @@ -1524,12 +1524,87 @@ fd_executor_setup_accounts_for_txn( fd_runtime_t * runtime, txn_out->accounts.rollback_nonce_mem = writable_accs_mem[ writable_account_cnt+1UL ]; ushort executable_idx = 0U; - for( ushort i=0; iaccounts.cnt; i++ ) { - fd_executor_setup_txn_account( runtime, bank, txn_in, txn_out, i, writable_accs_mem, &writable_accs_idx ); - fd_account_meta_t * meta = txn_out->accounts.account[ i ].meta; - if( FD_UNLIKELY( meta && memcmp( meta->owner, fd_solana_bpf_loader_upgradeable_program_id.key, sizeof(fd_pubkey_t) ) == 0 ) ) { - fd_executor_setup_executable_account( runtime, bank, meta, &executable_idx ); + if( 
FD_LIKELY( !txn_in->bundle.is_bundle ) ) { + /* Fast path: batch fetch all accounts from DB in one call. + This amortizes I/O wait time across all accounts rather than + issuing individual lookups per account. */ + + fd_funk_txn_xid_t xid = { .ul = { fd_bank_slot_get( bank ), bank->data->idx } }; + ushort acct_cnt = (ushort)txn_out->accounts.cnt; + + fd_accdb_open_ro_multi( runtime->accdb, + txn_out->accounts.account->ro, + &xid, + txn_out->accounts.keys, + acct_cnt ); + + for( ushort i=0; iaccounts.keys[ i ]; + fd_accdb_rw_t * ref_slot = &txn_out->accounts.account[ i ]; + fd_accdb_rw_t * account = ref_slot; + + /* For non-existent accounts (zero lamports), close the DB + reference and treat as not found. */ + if( fd_accdb_ref_lamports( account->ro )==0UL ) { + fd_accdb_close_ref( runtime->accdb, ref_slot->ref ); + account = NULL; + } + + if( txn_out->accounts.is_writable[ i ] ) { + uchar * new_raw_data = writable_accs_mem[ writable_accs_idx ]; + ulong dlen = !!account ? fd_accdb_ref_data_sz( (fd_accdb_ro_t *)account ) : 0UL; + writable_accs_idx++; + + if( FD_LIKELY( account ) ) { + fd_memcpy( new_raw_data, account->meta, sizeof(fd_account_meta_t)+dlen ); + fd_accdb_close_ro( runtime->accdb, (fd_accdb_ro_t *)account ); + } else { + fd_account_meta_init( (fd_account_meta_t *)new_raw_data ); + } + + account = fd_accdb_rw_init_nodb( + (fd_accdb_rw_t *)ref_slot, + address, + (fd_account_meta_t *)new_raw_data, + FD_RUNTIME_ACC_SZ_MAX + ); + + } else { + if( FD_UNLIKELY( fd_pubkey_eq( address, &fd_sysvar_instructions_id ) ) ) { + if( FD_LIKELY( account ) ) { + fd_accdb_close_ro( runtime->accdb, (fd_accdb_ro_t *)account ); + } + fd_account_meta_t * meta = fd_account_meta_init( (void *)runtime->accounts.sysvar_instructions_mem ); + account = (fd_accdb_rw_t *)fd_accdb_ro_init_nodb( (fd_accdb_ro_t *)ref_slot, address, meta ); + } else if( FD_LIKELY( account ) ) { + /* transfer ownership of DB reference to runtime struct; + reference is freed in cancel/commit */ + } else { + 
account = (fd_accdb_rw_t *)fd_accdb_ro_init_nodb( (fd_accdb_ro_t *)ref_slot, address, &FD_ACCOUNT_META_DEFAULT ); + } + } + + runtime->accounts.starting_lamports[i] = fd_accdb_ref_lamports( account->ro ); + runtime->accounts.starting_dlen[i] = fd_accdb_ref_data_sz ( account->ro ); + runtime->accounts.refcnt[i] = 0UL; + + fd_account_meta_t * meta = txn_out->accounts.account[ i ].meta; + if( FD_UNLIKELY( meta && memcmp( meta->owner, fd_solana_bpf_loader_upgradeable_program_id.key, sizeof(fd_pubkey_t) ) == 0 ) ) { + fd_executor_setup_executable_account( runtime, bank, meta, &executable_idx ); + } + } + + } else { + /* Bundle path: per-account setup since accounts may reference + previous transactions in the bundle. */ + for( ushort i=0; iaccounts.cnt; i++ ) { + fd_executor_setup_txn_account( runtime, bank, txn_in, txn_out, i, writable_accs_mem, &writable_accs_idx ); + fd_account_meta_t * meta = txn_out->accounts.account[ i ].meta; + + if( FD_UNLIKELY( meta && memcmp( meta->owner, fd_solana_bpf_loader_upgradeable_program_id.key, sizeof(fd_pubkey_t) ) == 0 ) ) { + fd_executor_setup_executable_account( runtime, bank, meta, &executable_idx ); + } } } diff --git a/src/vinyl/Local.mk b/src/vinyl/Local.mk index 24856df403e..4e10e28ef81 100644 --- a/src/vinyl/Local.mk +++ b/src/vinyl/Local.mk @@ -1,13 +1,7 @@ ifdef FD_HAS_LZ4 $(call make-lib,fd_vinyl) $(call add-hdrs,fd_vinyl_base.h fd_vinyl.h) -$(call add-objs,fd_vinyl_base fd_vinyl_recover fd_vinyl_compact fd_vinyl_cmd fd_vinyl fd_vinyl_exec,fd_vinyl) -ifdef FD_HAS_HOSTED -$(call make-bin,fd_vinyl_ctl,fd_vinyl_ctl,fd_vinyl fd_tango fd_util) -endif +$(call add-objs,fd_vinyl_base fd_vinyl,fd_vinyl) $(call make-unit-test,test_vinyl_base,test_vinyl_base,fd_vinyl fd_tango fd_util) -ifdef FD_HAS_HOSTED -$(call make-unit-test,test_vinyl_req,test_vinyl_req,fd_vinyl fd_tango fd_util) -endif $(call run-unit-test,test_vinyl_base) endif diff --git a/src/vinyl/data/fd_vinyl_data.c b/src/vinyl/data/fd_vinyl_data.c index 
4a8d30fc30a..63eab51f035 100644 --- a/src/vinyl/data/fd_vinyl_data.c +++ b/src/vinyl/data/fd_vinyl_data.c @@ -6,6 +6,23 @@ fd_vinyl_data_szc_all_blocks( ulong szc ) { return ((1UL << ((int)fd_vinyl_data_szc_cfg[ szc ].obj_cnt - 1)) << 1) - 1UL; } +static inline void +fd_vinyl_data_lock( int * lock ) { + for(;;) { + if( FD_LIKELY( !FD_VOLATILE_CONST( *lock ) ) ) { + if( FD_LIKELY( !FD_ATOMIC_CAS( lock, 0, 1 ) ) ) break; + } + FD_SPIN_PAUSE(); + } + FD_COMPILER_MFENCE(); +} + +static inline void +fd_vinyl_data_unlock( int * lock ) { + FD_COMPILER_MFENCE(); + FD_VOLATILE( *lock ) = 0; +} + FD_FN_CONST static inline ulong fd_vinyl_data_obj_off( void const * laddr0, fd_vinyl_data_obj_t const * obj ) { @@ -152,10 +169,11 @@ fd_vinyl_data_fini( fd_vinyl_data_t * data ) { return data; } -/* Note: the algorithms below is identical to fd_alloc. But since it - is running single threaded and non-persistent, there's less atomic - operation and/or address translation shenanigans going on. See - fd_alloc for more in depth discussions. */ +/* Note: the algorithms below are similar to fd_alloc. Since it is + non-persistent, there's less address translation shenanigans going + on. Concurrency is handled via per-sizeclass spinlocks with a + strict ascending lock order (szc < parent_szc < vol_lock) to prevent + deadlock. See fd_alloc for more in depth discussions. 
*/ fd_vinyl_data_obj_t * fd_vinyl_data_alloc( fd_vinyl_data_t * data, @@ -164,8 +182,11 @@ fd_vinyl_data_alloc( fd_vinyl_data_t * data, FD_CRIT( data, "NULL data" ); FD_CRIT( szcladdr0; - fd_vinyl_data_vol_t * vol = data->vol; + void * laddr0 = data->laddr0; + fd_vinyl_data_vol_t * vol = data->vol; + + fd_vinyl_data_lock( &data->superblock[ szc ].lock ); + fd_vinyl_data_obj_t ** _active = &data->superblock[ szc ].active; fd_vinyl_data_obj_t ** _inactive_top = &data->superblock[ szc ].inactive_top; @@ -200,8 +221,11 @@ fd_vinyl_data_alloc( fd_vinyl_data_t * data, ulong parent_szc = (ulong)fd_vinyl_data_szc_cfg[ szc ].parent_szc; if( FD_LIKELY( parent_szc szc. */ + superblock = fd_vinyl_data_alloc( data, parent_szc ); - if( FD_UNLIKELY( !superblock ) ) return NULL; + if( FD_UNLIKELY( !superblock ) ) { fd_vinyl_data_unlock( &data->superblock[ szc ].lock ); return NULL; } /* superblock->type init by obj_alloc to ALLOC, reset below */ /* superblock->szc init by obj_alloc */ @@ -212,10 +236,18 @@ fd_vinyl_data_alloc( fd_vinyl_data_t * data, } else { + fd_vinyl_data_lock( &data->vol_lock ); + ulong vol_idx = data->vol_idx_free; - if( FD_UNLIKELY( vol_idx >= data->vol_cnt ) ) return NULL; + if( FD_UNLIKELY( vol_idx >= data->vol_cnt ) ) { + fd_vinyl_data_unlock( &data->vol_lock ); + fd_vinyl_data_unlock( &data->superblock[ szc ].lock ); + return NULL; + } data->vol_idx_free = vol[ vol_idx ].obj->idx; + fd_vinyl_data_unlock( &data->vol_lock ); + superblock = vol[ vol_idx ].obj; /* superblock->type init below */ @@ -250,43 +282,20 @@ fd_vinyl_data_alloc( fd_vinyl_data_t * data, superblock->free_blocks = free_blocks; /* If this superblock still has free blocks in it, return it to - circulation for future allocation as szc's active superblock, - pushing any displaced superblock onto szc's inactive superblock - stack. Other strategies are possible, see fd_alloc for discussion - of tradeoffs. 
*/ - -# if 0 - - if( FD_LIKELY( free_blocks ) ) { - - fd_vinyl_data_obj_t * displaced_superblock = *_active; - *_active = superblock; - - if( FD_UNLIKELY( displaced_superblock ) ) { - - FD_ALERT( !fd_vinyl_data_superblock_test( data, displaced_superblock, szc ), "corruption detected" ); - - displaced_superblock->next_off = fd_vinyl_data_obj_off( laddr0, *_inactive_top ); - *_inactive_top = displaced_superblock; - - } - - } - -# else - - /* For a non-concurrent implementation, we know szc has no active - superblock active at this point (because their's no concurrent - alloc or free that could have set it behind our back). We don't - have to worry about displacing a superblock, simplifying the - above. */ + circulation for future allocation as szc's active superblock. + Since we hold lock[szc], we know szc has no active superblock at + this point (no concurrent alloc or free can set it behind our back + while we hold the lock). We don't have to worry about displacing + a superblock, simplifying this. Other strategies are possible, see + fd_alloc for discussion of tradeoffs. */ fd_vinyl_data_obj_t * tmp[1]; *(free_blocks ? _active : tmp) = superblock; /* branchless conditional store */ -# endif + fd_vinyl_data_unlock( &data->superblock[ szc ].lock ); - /* Initialize the allocated object metadata and return. */ + /* Initialize the allocated object metadata and return. The object + is not yet visible to other threads so no lock is needed here. 
*/ fd_vinyl_data_obj_t * obj = (fd_vinyl_data_obj_t *)( (ulong)superblock + sizeof(fd_vinyl_data_obj_t) + idx*fd_vinyl_data_szc_obj_footprint( szc ) ); @@ -325,10 +334,14 @@ fd_vinyl_data_free( fd_vinyl_data_t * data, if( FD_UNLIKELY( szc>=FD_VINYL_DATA_SZC_CNT ) ) { FD_CRIT( idx < data->vol_cnt, "corruption detected" ); /* valid idx for vol */ + fd_vinyl_data_lock( &data->vol_lock ); + obj->type = FD_VINYL_DATA_OBJ_TYPE_FREEVOL; /* Mark as on the free stack */ obj->idx = data->vol_idx_free; data->vol_idx_free = idx; + fd_vinyl_data_unlock( &data->vol_lock ); + return; } @@ -337,6 +350,8 @@ fd_vinyl_data_free( fd_vinyl_data_t * data, /* At this point, obj appears to be contained in a superblock at position idx. Mark the object as free in the superblock. */ + fd_vinyl_data_lock( &data->superblock[ szc ].lock ); + fd_vinyl_data_obj_t * superblock = (fd_vinyl_data_obj_t *) ((ulong)obj - sizeof(fd_vinyl_data_obj_t) - idx*fd_vinyl_data_szc_obj_footprint( szc )); @@ -359,7 +374,7 @@ fd_vinyl_data_free( fd_vinyl_data_t * data, superblock onto the szc's inactive superblock stack. Otherwise, if this free made the superblock totally empty, we check - if the szc'c inactive superblock top is also totally empty. If so, + if the szc's inactive superblock top is also totally empty. If so, we pop the inactive stack and free that. This keeps a small bounded supply empty superblocks around for fast @@ -399,7 +414,14 @@ fd_vinyl_data_free( fd_vinyl_data_t * data, data->superblock[ szc ].inactive_top = fd_vinyl_data_obj_ptr( data->laddr0, candidate_superblock->next_off ); + /* Recursive free of the empty superblock. Its szc is + parent_szc > szc, so lock ordering is preserved. We + release our lock first since szc state is consistent. 
*/ + + fd_vinyl_data_unlock( &data->superblock[ szc ].lock ); + fd_vinyl_data_free( data, candidate_superblock ); + return; } } @@ -408,6 +430,8 @@ fd_vinyl_data_free( fd_vinyl_data_t * data, } + fd_vinyl_data_unlock( &data->superblock[ szc ].lock ); + } static FD_FOR_ALL_BEGIN( fd_vinyl_data_reset_task, 1L ) { @@ -453,9 +477,11 @@ fd_vinyl_data_reset( fd_tpool_t * tpool, ulong t0, ulong t1, int level, FD_FOR_ALL( fd_vinyl_data_reset_task, tpool,t0,t1, 0L,(long)data->vol_cnt, data, level ); + data->vol_lock = 0; data->vol_idx_free = 0UL; for( ulong szc=0UL; szcsuperblock[ szc ].lock = 0; data->superblock[ szc ].active = NULL; data->superblock[ szc ].inactive_top = NULL; } diff --git a/src/vinyl/data/fd_vinyl_data.h b/src/vinyl/data/fd_vinyl_data.h index 7664454ad37..4e7437dd521 100644 --- a/src/vinyl/data/fd_vinyl_data.h +++ b/src/vinyl/data/fd_vinyl_data.h @@ -7,10 +7,10 @@ lockfree operated on by multiple threads in other address spaces and async direct I/O hardware concurrently. - Note that, though pairs are cached in a shared memory region, this is - not a persistent or concurrent datastructure. Specifically, only the - vinyl tile can allocate or free objects from it and then can do only - sequentially. + Pairs are cached in a shared memory region. The allocator is thread + safe: multiple threads may concurrently allocate and free objects + using per-sizeclass spinlocks with a consistent lock ordering + (lock[szc] < lock[parent_szc] < vol_lock) to prevent deadlock. Notes: @@ -26,10 +26,9 @@ The algorithms that manage the allocations are virtually identical to fd_groove and fd_alloc. But they have been simplified, customized and optimized for this use case (e.g. minimal need for address - translation, no need for atomic operations, no need for concurrency - group optimizations, no need to layout cache for concurrent access, - much more fine grained size classes for minimal data store overheads, - etc). 
This also does extensive (and compile time configurable) + translation, simple spinlock concurrency, much more fine grained + size classes for minimal data store overheads, etc). This also does + extensive (and compile time configurable) memory data integrity continuously to help catch memory corruption (either due to hardware failures, buggy usage or malicious usage). @@ -258,6 +257,23 @@ FD_PROTOTYPES_BEGIN fd_vinyl_data_* mirror the above but they take the value region as input. */ +/* fd_vinyl_data_laddr resolves a data object global address to a local + address given the data cache's laddr0 (i.e. the workspace base). + Returns NULL if gaddr is 0. + + fd_vinyl_data_gaddr converts a local address to a data object global + address given the data cache's laddr0. */ + +FD_FN_CONST static inline void * +fd_vinyl_data_laddr( ulong gaddr, void * laddr0 ) { + return (void *)fd_ulong_if( !!gaddr, (ulong)laddr0 + gaddr, 0UL ); +} + +FD_FN_CONST static inline ulong +fd_vinyl_data_gaddr( void const * laddr, void const * laddr0 ) { + return fd_ulong_if( !!laddr, (ulong)laddr - (ulong)laddr0, 0UL ); +} + FD_FN_CONST static inline fd_vinyl_bstream_phdr_t * fd_vinyl_data_obj_phdr( fd_vinyl_data_obj_t const * obj ) { return (fd_vinyl_bstream_phdr_t *)((ulong)obj + sizeof(fd_vinyl_data_obj_t)); @@ -343,8 +359,10 @@ struct __attribute((aligned(FD_VINYL_DATA_ALIGN))) fd_vinyl_data { (FD_VINYL_BSTREAM_BLOCK_SZ aligned) */ fd_vinyl_data_vol_t * vol; /* Vols, indexed [0,vol_cnt), in raw shared memory region */ ulong vol_cnt; /* Num vols, in [0,FD_VINYL_DATA_VOL_MAX) */ + int vol_lock; /* Spinlock protecting vol_idx_free */ ulong vol_idx_free; /* Idx of first free volume if in [0,vol_cnt), no free volumes o.w. 
*/ struct { + int lock; /* Spinlock protecting this size class */ fd_vinyl_data_obj_t * active; /* active superblock for this size class */ fd_vinyl_data_obj_t * inactive_top; /* top of the inactive superblock stack for this size class */ } superblock[ FD_VINYL_DATA_SZC_CNT ]; @@ -436,23 +454,25 @@ fd_vinyl_data_is_valid_obj( void const * laddr, /* fd_vinyl_data_alloc acquires an object of sizeclass szc from the data cache. Returns a pointer to the object on success and NULL if there - is no space available in the data. Will FD_LOG_CRIT if anything - wonky is detected (bad, memory corruption, etc). */ + is no space available in the data. Thread safe. Will FD_LOG_CRIT + if anything wonky is detected (bad, memory corruption, etc). */ fd_vinyl_data_obj_t * fd_vinyl_data_alloc( fd_vinyl_data_t * data, ulong szc ); /* fd_vinyl_data_free releases obj to the data cache. This cannot fail - from the caller's perspective. Will FD_LOG_CRIT if anything wonky is - detected (bad args, memory corruption, etc). */ + from the caller's perspective. Thread safe. Will FD_LOG_CRIT if + anything wonky is detected (bad args, memory corruption, etc). */ void fd_vinyl_data_free( fd_vinyl_data_t * data, fd_vinyl_data_obj_t * obj ); /* fd_vinyl_data_reset uses the caller and tpool threads (t0,t1) to free - all objects from the data cache. level zero/non-zero indicates to do + all objects from the data cache. Not thread safe with concurrent + alloc/free; caller must ensure exclusive access. level zero/non-zero + indicates to do soft/hard reset. In a hard reset, the shmem region is zero'd before formatting it into a set of free data volumes. This cannot fail from the caller's perspective. Assumes tpool threads (t0,t1) are @@ -465,7 +485,9 @@ fd_vinyl_data_reset( fd_tpool_t * tpool, ulong t0, ulong t1, int level, /* fd_vinyl_data_verify returns FD_VINYL_SUCCESS (0) if the given data appears to be a valid vinyl data and FD_VINYL_ERR_CORRUPT (negative) - otherwise (logs details). 
This only verifies the vinyl data's state + otherwise (logs details). Not thread safe with concurrent + alloc/free; caller must ensure exclusive access. This only verifies + the vinyl data's state and superblock heirarchy are intact. It does not test any of the allocations for correctness (but could given access to the bstream, line and/or meta). */ diff --git a/src/vinyl/data/test_vinyl_data.c b/src/vinyl/data/test_vinyl_data.c index 87aa726c5c0..d4df92e3630 100644 --- a/src/vinyl/data/test_vinyl_data.c +++ b/src/vinyl/data/test_vinyl_data.c @@ -14,7 +14,7 @@ FD_STATIC_ASSERT( sizeof (fd_vinyl_data_obj_t)==FD_VINYL_BSTREAM_BLOCK_SZ, unit_ FD_STATIC_ASSERT( FD_VINYL_DATA_VOL_FOOTPRINT==34078592UL, unit_test ); FD_STATIC_ASSERT( FD_VINYL_DATA_ALIGN == 128UL, unit_test ); -FD_STATIC_ASSERT( FD_VINYL_DATA_FOOTPRINT==5376UL, unit_test ); +FD_STATIC_ASSERT( FD_VINYL_DATA_FOOTPRINT==7936UL, unit_test ); FD_STATIC_ASSERT( alignof(fd_vinyl_data_vol_t)==FD_VINYL_BSTREAM_BLOCK_SZ, unit_test ); FD_STATIC_ASSERT( sizeof (fd_vinyl_data_vol_t)==FD_VINYL_DATA_VOL_FOOTPRINT, unit_test ); diff --git a/src/vinyl/fd_vinyl.c b/src/vinyl/fd_vinyl.c index bb5e0f10cd0..fc6f764f8c0 100644 --- a/src/vinyl/fd_vinyl.c +++ b/src/vinyl/fd_vinyl.c @@ -10,124 +10,6 @@ fd_vinyl_footprint( void ) { return sizeof(fd_vinyl_t); } -fd_vinyl_t * -fd_vinyl_init( fd_tpool_t * tpool, ulong t0, ulong t1, int level, - void * _vinyl, - void * _cnc, ulong cnc_footprint, - void * _meta, ulong meta_footprint, - void * _line, ulong line_footprint, - void * _ele, ulong ele_footprint, - void * _obj, ulong obj_footprint, - fd_vinyl_io_t * io, - ulong seed, - void * obj_laddr0, - ulong async_min, - ulong async_max, - ulong part_thresh, - ulong gc_thresh, - int gc_eager, - int style ) { - if( t1<=t0 ) t0 = 0UL, t1 = 1UL; - - FD_LOG_NOTICE(( "Testing vinyl configuration" )); - -# define TEST( c ) do { if( FD_UNLIKELY( !(c) ) ) { FD_LOG_WARNING(( "FAIL: %s", #c )); return NULL; } } while(0) - - TEST( _vinyl ); TEST( 
fd_ulong_is_aligned( (ulong)_vinyl, fd_vinyl_align() ) ); - TEST( _cnc ); TEST( fd_ulong_is_aligned( (ulong)_cnc, fd_cnc_align() ) ); - TEST( _meta ); TEST( fd_ulong_is_aligned( (ulong)_meta, fd_vinyl_meta_align() ) ); - TEST( _line ); TEST( fd_ulong_is_aligned( (ulong)_line, alignof(fd_vinyl_line_t) ) ); - TEST( _ele ); TEST( fd_ulong_is_aligned( (ulong)_ele, alignof(fd_vinyl_meta_ele_t) ) ); - TEST( _obj ); TEST( fd_ulong_is_aligned( (ulong)_obj, alignof(fd_vinyl_data_obj_t) ) ); - - TEST( cnc_footprint >= fd_cnc_footprint( FD_VINYL_CNC_APP_SZ ) ); - - ulong ele_max = fd_ulong_pow2_dn( ele_footprint / sizeof( fd_vinyl_meta_ele_t ) ); - ulong lock_cnt = fd_vinyl_meta_lock_cnt_est( ele_max ); - ulong probe_max = ele_max; - - TEST( ele_max>=4UL ); - TEST( meta_footprint >= fd_vinyl_meta_footprint( ele_max, lock_cnt, probe_max ) ); - - ulong pair_max = ele_max - 1UL; - ulong line_cnt = fd_ulong_min( line_footprint / sizeof( fd_vinyl_line_t ), pair_max ); - - TEST( (3UL<=line_cnt) & (line_cnt<=FD_VINYL_LINE_MAX) ); - - TEST( io ); - - /* seed is arb */ - - TEST( (0ULcnc = fd_cnc_join( fd_cnc_new( _cnc, FD_VINYL_CNC_APP_SZ, FD_VINYL_CNC_TYPE, fd_log_wallclock() ) ); TEST( vinyl->cnc ); - vinyl->line = (fd_vinyl_line_t *)_line; - vinyl->io = io; - - vinyl->line_cnt = line_cnt; - vinyl->pair_max = pair_max; - vinyl->async_min = async_min; - vinyl->async_max = async_max; - - vinyl->part_thresh = part_thresh; - vinyl->gc_thresh = gc_thresh; - vinyl->gc_eager = gc_eager; - vinyl->style = style; - vinyl->line_idx_lru = 0U; - vinyl->pair_cnt = 0UL; - vinyl->garbage_sz = 0UL; - - TEST( fd_vinyl_meta_join( vinyl->meta, fd_vinyl_meta_new( _meta, ele_max, lock_cnt, probe_max, seed ), _ele )==vinyl->meta ); - - TEST( fd_vinyl_data_init( vinyl->data, _obj, obj_footprint, obj_laddr0 )==vinyl->data ); - - vinyl->cnc_footprint = cnc_footprint; - vinyl->meta_footprint = meta_footprint; - vinyl->line_footprint = line_footprint; - vinyl->ele_footprint = ele_footprint; - 
vinyl->obj_footprint = obj_footprint; - - FD_LOG_NOTICE(( "Recovering bstream past (level %i)", level )); - - TEST( fd_vinyl_seq_eq( fd_vinyl_recover( tpool,t0,t1, level, vinyl ), fd_vinyl_io_seq_present( io ) ) ); - -# undef TEST - - FD_LOG_NOTICE(( "Initializing complete" )); - - return vinyl; -} - void * fd_vinyl_fini( fd_vinyl_t * vinyl ) { diff --git a/src/vinyl/fd_vinyl.h b/src/vinyl/fd_vinyl.h index 89a1d72cf25..7b638e697ab 100644 --- a/src/vinyl/fd_vinyl.h +++ b/src/vinyl/fd_vinyl.h @@ -109,36 +109,6 @@ FD_PROTOTYPES_BEGIN FD_FN_CONST ulong fd_vinyl_align ( void ); FD_FN_CONST ulong fd_vinyl_footprint( void ); -/* fd_vinyl_init uses the the caller (typically tpool thread t0) and - tpool threads (t0,t1) to init the vinyl structure (this structure can - be extremely large ... hundreds of gigabytes to terabytes in memory - for petabytes or more in persistent storage ... so it is worthwhile - to parallelize the initialization). The bstream's past will be used - to recover the vinyl instance to the bstream's seq_present. The - recovery level is given by level (see fd_vinyl_recover below). - Assumes tpool threads (t0,t1) are available for dispatch. These - threads will be avaialble for dispatch on return. Retain no interest - in tpool. If tpool is NULL and/or the set [t0,t1) is empty/invalid, - uses a serial algorithm for initialization. 
*/ - -fd_vinyl_t * -fd_vinyl_init( fd_tpool_t * tpool, ulong t0, ulong t1, int level, - void * lmem, /* memory region to hold the vinyl's state */ - void * shcnc, ulong cnc_footprint, /* memory region to use for the tile cnc */ - void * shmeta, ulong meta_footprint, /* memory region to use for the cached pair metadata state */ - void * shline, ulong line_footprint, /* memory region to use for the cached pair state */ - void * shele, ulong ele_footprint, /* memory region to use for the cached pair metadata */ - void * shobj, ulong obj_footprint, /* memory region to use for the cached pairs */ - fd_vinyl_io_t * io, /* interface to the underlying bstream */ - ulong seed, - void * obj_laddr0, - ulong async_min, - ulong async_max, - ulong part_thresh, - ulong gc_thresh, - int gc_eager, - int style ); - void * fd_vinyl_fini( fd_vinyl_t * vinyl ); @@ -204,74 +174,6 @@ FD_FN_PURE static inline ulong fd_vinyl_gc_thresh ( fd_vinyl_t const * vinyl ) FD_FN_PURE static inline int fd_vinyl_gc_eager ( fd_vinyl_t const * vinyl ) { return vinyl->gc_eager; } FD_FN_PURE static inline int fd_vinyl_style ( fd_vinyl_t const * vinyl ) { return vinyl->style; } -/* fd_vinyl_compact does up to compact_max rounds of compaction to the - bstream's past. This cannot fail from the caller's perspective (will - FD_LOG_CRIT if any corruption is detected). */ -/* FIXME: PRIVATE */ - -void -fd_vinyl_compact( fd_vinyl_t * vinyl, - ulong compact_max ); - -/* fd_vinyl_recover uses the caller (typically tpool thread t0) and - tpool threads (t0,t1) to reset the vinyl meta cache, reset the vinyl - data cache, reset vinyl cache line eviction priorities and repopulate - the vinyl meta data cache from the current state of the bstream's - past to the bstream's seq_present. level zero/non-zero indicates to - do a soft/hard reset. In a soft reset, the data cache region is - minimally cleared. In a hard reset, it is fully cleared. 
A hard - reset is recommended for most usage but a soft reset can allow faster - startup for rapid iteration during development. - - Returns the bstream sequence number of how far recovery got (if this - is not seq_present, the recovery was partial and it is theoretically - moves in the recovery were not processed atomically). Logs details - of any issues encoutered. - - Assumes the tpool threads (t0,t1) are available for dispatch. - Retains no interest in tpool and threads (t0,t1) will be available - for dispatch on return. */ -/* FIXME: PRIVATE */ - -ulong -fd_vinyl_recover( fd_tpool_t * tpool, ulong t0, ulong t1, int level, - fd_vinyl_t * vinyl ); - -/* fd_vinyl_exec runs a vinyl tile on the caller. */ - -void -fd_vinyl_exec( fd_vinyl_t * vinyl ); - -int -fd_vinyl_halt( fd_cnc_t * cnc ); - -int -fd_vinyl_sync( fd_cnc_t * cnc ); - -int -fd_vinyl_get( fd_cnc_t * cnc, - int opt, - ulong * opt_val ); - -int -fd_vinyl_set( fd_cnc_t * cnc, - int opt, - ulong val, - ulong * opt_val ); - -int -fd_vinyl_client_join( fd_cnc_t * cnc, - fd_vinyl_rq_t * rq, - fd_vinyl_cq_t * cq, - fd_wksp_t * wksp, - ulong link_id, - ulong burst_max, - ulong quota_max ); - -int -fd_vinyl_client_leave( fd_cnc_t * cnc, - ulong link_id ); - #define FD_VINYL_CNC_SIGNAL_CSTR_BUF_MAX (21UL) char * diff --git a/src/vinyl/fd_vinyl_case_acquire.c b/src/vinyl/fd_vinyl_case_acquire.c deleted file mode 100644 index d88255e6e98..00000000000 --- a/src/vinyl/fd_vinyl_case_acquire.c +++ /dev/null @@ -1,399 +0,0 @@ - case FD_VINYL_REQ_TYPE_ACQUIRE: { - ulong req_flags = (ulong)req->flags; - fd_vinyl_key_t const * req_key = MAP_REQ_GADDR( req->key_gaddr, fd_vinyl_key_t, batch_cnt ); - ulong * req_val_gaddr = MAP_REQ_GADDR( req->val_gaddr_gaddr, ulong, batch_cnt ); - schar * req_err = MAP_REQ_GADDR( req->err_gaddr, schar, batch_cnt ); - - int req_flag_modify = fd_vinyl_req_flag_modify( req_flags ); - int req_flag_ignore = fd_vinyl_req_flag_ignore( req_flags ); - int req_flag_create = fd_vinyl_req_flag_create( 
req_flags ); - int req_flag_excl = fd_vinyl_req_flag_excl ( req_flags ); - int req_evict_prio = fd_vinyl_req_evict_prio ( req_flags ); - - int bad_gaddr = (!!batch_cnt) & ((!req_key) | (!req_val_gaddr) | (!req_err)); - int bad_quota = quota_remFD_VINYL_VAL_MAX ) ) DONE( FD_VINYL_ERR_INVAL ); - } - - /* Query vinyl meta for key */ - - fd_vinyl_key_t const * key = req_key + batch_idx; - - ulong memo = fd_vinyl_key_memo( meta_seed, key ); - - ulong _ele_idx; /* avoid pointer escape */ - int err = fd_vinyl_meta_query_fast( ele0, ele_max, key, memo, &_ele_idx ); - ulong ele_idx = _ele_idx; /* In [0,ele_max) */ - - if( FD_LIKELY( !err ) ) { /* pair key meta cached */ - - /* At this point, pair key either exists at bstream seq_present - or is in the process of being created. If pair key is being - created, fail with AGAIN (it must be acquired for modify). */ - - ulong pair_ctl = ele0[ ele_idx ].phdr.ctl; - - FD_CRIT( (fd_vinyl_bstream_ctl_type( pair_ctl )==FD_VINYL_BSTREAM_CTL_TYPE_PAIR) | (pair_ctl==ULONG_MAX), - "corruption detected" ); - - if( FD_UNLIKELY( pair_ctl==ULONG_MAX ) ) DONE( FD_VINYL_ERR_AGAIN ); - - /* At this point, pair key exists at bstream seq_present. */ - - ulong val_sz = (ulong)ele0[ ele_idx ].phdr.info.val_sz; - ulong line_idx = ele0[ ele_idx ].line_idx; - - FD_CRIT( val_sz<=FD_VINYL_VAL_MAX, "corruption detected" ); - FD_CRIT( (line_idxline_idx==line_idx, "corruption detected" ); - - ulong line_ctl = line[ line_idx ].ctl; - - ulong ver = fd_vinyl_line_ctl_ver( line_ctl ); - long ref = fd_vinyl_line_ctl_ref( line_ctl ); - - if( FD_LIKELY( !req_flag_modify ) ) { - - /* At this point, we are acquiring a cached pair for read. - If the line is acquired for modify, fail with AGAIN. If - there are too many acquires for read on this pair, CRIT - (could consider AGAIN here). Otherwise, we update the - ref count (don't change the ver), point the client at the - line caching pair key to finish the acquire. 
Note that - we don't validate the pair header if we detect that an - earlier acquire in this batch started fetching the pair - because the read might still be in progress (see note - below for more details). */ - - if( FD_UNLIKELY( ref<0L ) ) DONE( FD_VINYL_ERR_AGAIN ); - if( FD_UNLIKELY( ref>=FD_VINYL_LINE_REF_MAX ) ) FD_LOG_CRIT(( "too many acquires for read on this pair" )); - - if( FD_LIKELY( !obj->rd_active ) ) { - fd_vinyl_bstream_phdr_t * phdr = fd_vinyl_data_obj_phdr( obj ); - - FD_CRIT( fd_vinyl_data_obj_val_max( obj ) >= val_sz, "corruption detected" ); - FD_CRIT( phdr->ctl==fd_vinyl_bstream_ctl( FD_VINYL_BSTREAM_CTL_TYPE_PAIR, - FD_VINYL_BSTREAM_CTL_STYLE_RAW, val_sz ), "corruption detected" ); - FD_CRIT( fd_vinyl_key_eq( &phdr->key, key ), "corruption detected" ); - FD_CRIT( !memcmp( &phdr->info, &ele0[ ele_idx ].phdr.info, sizeof(fd_vinyl_info_t) ), "corruption detected" ); - } - - line[ line_idx ].ctl = fd_vinyl_line_ctl( ver, ref+1L ); /* don't bump ver */ - - req_val_gaddr[ batch_idx ] = (ulong)fd_vinyl_data_obj_val( obj ) - data_laddr0; - - DONE( FD_VINYL_SUCCESS ); - - } - - /* At this point, we are acquiring a cached pair for modify. - If we are not allowed to acquire an existing pair for - modify (INVAL) or if the line line_idx is already acquired - for anything (AGAIN), fail. 
*/ - - if( FD_UNLIKELY( ref ) ) DONE( FD_VINYL_ERR_AGAIN ); - if( FD_UNLIKELY( req_flag_excl ) ) DONE( FD_VINYL_ERR_INVAL ); - - fd_vinyl_bstream_phdr_t * phdr = fd_vinyl_data_obj_phdr( obj ); - - FD_CRIT( !obj->rd_active, "corruption detected" ); - FD_CRIT( fd_vinyl_data_obj_val_max( obj ) >= val_sz, "corruption detected" ); - FD_CRIT( phdr->ctl==fd_vinyl_bstream_ctl( FD_VINYL_BSTREAM_CTL_TYPE_PAIR, - FD_VINYL_BSTREAM_CTL_STYLE_RAW, val_sz ), "corruption detected" ); - FD_CRIT( fd_vinyl_key_eq( &phdr->key, key ), "corruption detected" ); - FD_CRIT( !memcmp( &phdr->info, &ele0[ ele_idx ].phdr.info, sizeof(fd_vinyl_info_t) ), "corruption detected" ); - - /* If the ignore flag is set, set the cached value size to 0. */ - - if( req_flag_ignore ) { - phdr->info.val_sz = 0U; - val_sz = 0UL; - } - - /* If the current location for the pair key's data isn't - sufficient to hold the worst case val_sz that the client - might modify the pair's value into, adjust the space - available for the pair to the user's val_max. Because we - might be ignoring the existing value, this could be smaller - than the current object. (We could chose to not trim in - this case because it will get trimmed again on release. - But doing so makes a more consistent guarantee to the - client and makes testing easier.) */ - - ulong csz = sizeof(fd_vinyl_bstream_phdr_t) + val_sz; - - ulong szc_new = fd_vinyl_data_szc( fd_ulong_max( val_sz, req_val_max ) ); - ulong szc_old = (ulong)obj->szc; - - if( FD_UNLIKELY( szc_new != szc_old ) ) { - - fd_vinyl_data_obj_t * obj_new = fd_vinyl_data_alloc( data, szc_new ); - if( FD_UNLIKELY( !obj_new ) ) FD_LOG_CRIT(( "increase data cache size" )); - - fd_vinyl_bstream_phdr_t * phdr_new = fd_vinyl_data_obj_phdr( obj_new ); - - memcpy( phdr_new, phdr, csz ); - - fd_vinyl_data_free( data, obj ); - - phdr = phdr_new; - obj = obj_new; - - line[ line_idx ].obj = obj; obj->line_idx = line_idx; obj->rd_active = (short)0; - } - - /* Zero out any remaining space in the pair. 
*/ - - ulong zsz = fd_vinyl_bstream_pair_sz( fd_vinyl_data_szc_val_max( szc_new ) ) - csz; - memset( ((uchar *)phdr) + csz, 0, zsz ); - - /* Finish up acquiring for modify */ - - //line[ line_idx ].obj = ... already init; - //line[ line_idx ].ele_idx = ... already init; - line[ line_idx ].ctl = fd_vinyl_line_ctl( ver+1UL, -1L ); /* bump ver */ - - fd_vinyl_line_evict_prio( &vinyl->line_idx_lru, line, line_cnt, line_idx, req_evict_prio ); - - //phdr->ctl = ... already init - //phdr->key = ... already init - //phdr->info = ... already init - - //ele0[ ele_idx ] = ... already init - - req_val_gaddr[ batch_idx ] = (ulong)fd_vinyl_data_obj_val( obj ) - data_laddr0; - - DONE( FD_VINYL_SUCCESS ); - - } /* pair key data cached */ - - /* At this point, pair key is not cached. If we are not allowed - to acquire this pair, fail. Otherwise, evict the least - recently used evictable line (this should always be possible - if quotas are confiured correctly) to make room to cache this - pair. Connect this line to meta element ele_idx, set the - line's reference count appropriately, bump the line's version - and move the line to the desired location in the eviction - sequence. We don't modify any shared fields in meta element - ele_idx so we can do the modification fast. - - We do this upfront to free data cache for the alloc if the - LRU line is in use and to handle the same pair appearing - multiple times in an acquire. - - That is, if req_key appears multiple times in an acquire to - modify, the trailing redundant acquires will see the object - as cached with ref==-1 and fail with AGAIN. If the key - appears multiple times in an acquire for read, the trailing - redundant acquires will see the object as cached with ref>0 - and rd_active==1, conclude that the first redundant acquire - is in the process of reading the pair into cache, skip any - racy metadata checks, increase the ref count and succeed. - - IMPORTANT SAFETY TIP! 
Note that this implies that client - doing an acquire-for-read with redundant keys and with - speculative processing will see req_err transition to success - for the trailing redundant items for a key before the leading - item of that key transitions to success (and thus before the - object is fully read / verified and/or decoded). It is up to - the client doing speculative cut through processing to avoid - redundant keys or react accordingly. */ - - if( FD_UNLIKELY( req_flag_modify & req_flag_excl ) ) DONE( FD_VINYL_ERR_INVAL ); - - line_idx = fd_vinyl_line_evict_lru( &vinyl->line_idx_lru, line, line_cnt, ele0, ele_max, data ); - - ulong line_ctl = line[ line_idx ].ctl; - - ulong ver = fd_vinyl_line_ctl_ver( line_ctl ); - - line[ line_idx ].ele_idx = ele_idx; ele0[ ele_idx ].line_idx = line_idx; - line[ line_idx ].ctl = fd_vinyl_line_ctl( ver+1UL, req_flag_modify ? -1L : 1L ); - - fd_vinyl_line_evict_prio( &vinyl->line_idx_lru, line, line_cnt, line_idx, req_evict_prio ); - - /* Allocate an appropriately sized object to hold this pair, - connect it to this line and report the location to the client. */ - - ulong val_max = fd_ulong_if( !req_flag_modify, val_sz, - fd_ulong_if( !req_flag_ignore, fd_ulong_max( val_sz, req_val_max ), - req_val_max ) ); - - ulong szc = fd_vinyl_data_szc( val_max ); - - fd_vinyl_data_obj_t * obj = fd_vinyl_data_alloc( data, szc ); - if( FD_UNLIKELY( !obj ) ) FD_LOG_CRIT(( "increase data cache size" )); - - line[ line_idx ].obj = obj; obj->line_idx = line_idx; - - void * val = fd_vinyl_data_obj_val( obj ); - - req_val_gaddr[ batch_idx ] = (ulong)val - data_laddr0; - - /* If we need to do I/O, start reading encoded pair data and - defer the data integrity and decoding to later (and then in - whatever order the I/O layer sees fit). 
*/ - - if( FD_LIKELY( !(req_flag_modify & req_flag_ignore) ) ) { - obj->rd_active = (short)1; - - int style = fd_vinyl_bstream_ctl_style( pair_ctl ); - ulong val_esz = fd_vinyl_bstream_ctl_sz ( pair_ctl ); - - FD_CRIT( val_esz<=FD_VINYL_VAL_MAX, "corruption detected" ); - FD_CRIT( (style!=FD_VINYL_BSTREAM_CTL_STYLE_RAW) | (val_sz==val_esz), "corruption detected" ); - - fd_vinyl_data_obj_t * cobj; - - if( FD_LIKELY( style==FD_VINYL_BSTREAM_CTL_STYLE_RAW ) ) cobj = obj; - else { - cobj = fd_vinyl_data_alloc( data, fd_vinyl_data_szc( val_esz ) ); - if( FD_UNLIKELY( !cobj ) ) FD_LOG_CRIT(( "increase data cache size" )); - } - - cobj->rd->ctx = (ulong)obj; - cobj->rd->seq = ele0[ ele_idx ].seq; - cobj->rd->dst = fd_vinyl_data_obj_phdr( cobj ); - cobj->rd->sz = fd_vinyl_bstream_pair_sz( val_esz ); - - cobj->rd_err = req_err + batch_idx; - - fd_vinyl_io_read( io, cobj->rd ); - read_cnt++; - - quota_rem--; - goto next_acquire; - } - - /* At this point, we are acquiring to modify but we don't need - the existing value. We populate the cached pair header - appropriately for the modify and zero the rest to complete - this request immediately. */ - - obj->rd_active = (short)0; - - fd_vinyl_bstream_phdr_t * phdr = fd_vinyl_data_obj_phdr( obj ); - - phdr->ctl = fd_vinyl_bstream_ctl( FD_VINYL_BSTREAM_CTL_TYPE_PAIR, FD_VINYL_BSTREAM_CTL_STYLE_RAW, val_sz ); - phdr->key = *key; - phdr->info = ele0[ ele_idx ].phdr.info; - - phdr->info.val_sz = 0U; - - memset( val, 0, fd_vinyl_data_szc_obj_footprint( szc ) - sizeof(fd_vinyl_data_obj_t) - sizeof(fd_vinyl_bstream_phdr_t) ); - - DONE( FD_VINYL_SUCCESS ); - - } /* pair key meta cached */ - - /* At this point, pair key does not exist at bstream seq_present - and is not in the process of being created. If we aren't - allowed to create pair key, fail. 
Otherwise, evict the least - recently used evictable line (this should always be possible if - quotas are configured correctly) to make room to cache this - pair, set the line's reference count appropriately, bump the - version and move the line to the desired location in the - eviction sequence. We do this upfront to free data cache for - the alloc if the LRU line is in use. */ - - if( FD_UNLIKELY( !(req_flag_modify & req_flag_create) ) ) DONE( FD_VINYL_ERR_KEY ); - - ulong line_idx = fd_vinyl_line_evict_lru( &vinyl->line_idx_lru, line, line_cnt, ele0, ele_max, data ); - - ulong line_ctl = line[ line_idx ].ctl; - - ulong ver = fd_vinyl_line_ctl_ver( line_ctl ); - - line[ line_idx ].ctl = fd_vinyl_line_ctl( ver+1UL, -1L ); - - fd_vinyl_line_evict_prio( &vinyl->line_idx_lru, line, line_cnt, line_idx, req_evict_prio ); - - /* Allocate an appropriately sized object to hold this pair and - connect it to this line. */ - - ulong szc = fd_vinyl_data_szc( req_val_max ); - - fd_vinyl_data_obj_t * obj = fd_vinyl_data_alloc( data, szc ); - if( FD_UNLIKELY( !obj ) ) FD_LOG_CRIT(( "increase data cache size" )); - - line[ line_idx ].obj = obj; obj->line_idx = line_idx; obj->rd_active = (short)0; - - /* Allocate a meta element to hold metadata for this pair and - connect it to this line. Since we are inserting at meta - element ele_idx, we don't need to lock anything so long as we - mark the element as in use very last. 
*/ - - ulong pair_cnt = vinyl->pair_cnt; - if( FD_UNLIKELY( pair_cnt>=pair_max ) ) FD_LOG_CRIT(( "increase meta cache size" )); - vinyl->pair_cnt = pair_cnt + 1UL; - - ele0[ ele_idx ].memo = memo; - //ele0[ ele_idx ].phdr.ctl init below - ele0[ ele_idx ].phdr.key = *key; - memset( &ele0[ ele_idx ].phdr.info, 0, sizeof(fd_vinyl_info_t) ); /* sets val_sz to 0 */ - ele0[ ele_idx ].line_idx = line_idx; - ele0[ ele_idx ].seq = 0UL; /* Will be init on release */ - - FD_COMPILER_MFENCE(); - ele0[ ele_idx ].phdr.ctl = ULONG_MAX; /* Mark as being created */ - FD_COMPILER_MFENCE(); - - line[ line_idx ].ele_idx = ele_idx; - - /* Initialize the data region for a new pair */ - - *fd_vinyl_data_obj_phdr( obj ) = ele0[ ele_idx ].phdr; - - uchar * val = (uchar *)fd_vinyl_data_obj_val( obj ); - - memset( val, 0, fd_vinyl_data_szc_obj_footprint( szc ) - sizeof(fd_vinyl_data_obj_t) - sizeof(fd_vinyl_bstream_phdr_t) ); - - req_val_gaddr[ batch_idx ] = (ulong)val - data_laddr0; - - DONE( FD_VINYL_SUCCESS ); - - next_acquire: /* silly language restriction */; - -# undef DONE - - } /* for batch_idx */ - - FD_CRIT( (!read_cnt) | (!(req_flag_modify & req_flag_ignore)), "corruption detected" ); - - comp_err = FD_VINYL_SUCCESS; - break; - } diff --git a/src/vinyl/fd_vinyl_case_erase.c b/src/vinyl/fd_vinyl_case_erase.c deleted file mode 100644 index 18adddc6fb1..00000000000 --- a/src/vinyl/fd_vinyl_case_erase.c +++ /dev/null @@ -1,108 +0,0 @@ - case FD_VINYL_REQ_TYPE_ERASE: { - - fd_vinyl_key_t const * req_key = MAP_REQ_GADDR( req->key_gaddr, fd_vinyl_key_t, batch_cnt ); - schar * req_err = MAP_REQ_GADDR( req->err_gaddr, schar, batch_cnt ); - - if( FD_UNLIKELY( (!!batch_cnt) & ((!req_key) | (!req_err)) ) ) { - comp_err = FD_VINYL_ERR_INVAL; - break; - } - - for( ulong batch_idx=0UL; batch_idxline_idx==line_idx, "corruption detected" ); - FD_CRIT ( !obj->rd_active, "corruption detected" ); - - ulong ctl = line[ line_idx ].ctl; - - ulong ver = fd_vinyl_line_ctl_ver( ctl ); - long ref = 
fd_vinyl_line_ctl_ref( ctl ); - - if( FD_UNLIKELY( ref ) ) { - FD_COMPILER_MFENCE(); - req_err[ batch_idx ] = (schar)FD_VINYL_ERR_AGAIN; - FD_COMPILER_MFENCE(); - fail_cnt++; - continue; - } - - line[ line_idx ].obj = NULL; - line[ line_idx ].ele_idx = ULONG_MAX; //ele0[ ele_idx ].line_idx = ULONG_MAX; /* Technically not necessary given below */ - line[ line_idx ].ctl = fd_vinyl_line_ctl( ver+1UL, 0L ); /* bump version */ - - fd_vinyl_line_evict_prio( &vinyl->line_idx_lru, line, line_cnt, line_idx, FD_VINYL_LINE_EVICT_PRIO_LRU ); - - fd_vinyl_data_free( data, obj ); - - } else { - - FD_CRIT( line_idx==ULONG_MAX, "corruption detected" ); - - } - - /* At this point, pair key exists and is not cached. Append a - dead block and remove it from the meta. This generates two - pieces of bstream garbage: the old pair and the dead block - itself (the dead block is only needed for recovery and then - only while the old pair is in the bstream's past). */ - - /* FIXME: COMPACT SEQUENTIAL DEADS IN THE BSTREAM TO BE MORE - SPACE EFFICIENT? 
*/ - - ulong val_esz = fd_vinyl_bstream_ctl_sz( ele0[ ele_idx ].phdr.ctl ); - - accum_garbage_cnt += 2UL; - accum_garbage_sz += fd_vinyl_bstream_pair_sz( val_esz ) + FD_VINYL_BSTREAM_BLOCK_SZ; - - fd_vinyl_io_append_dead( io, &ele0[ ele_idx ].phdr, NULL, 0UL ); - append_cnt++; - accum_dead_cnt++; - - fd_vinyl_meta_remove_fast( ele0, ele_max, lock, lock_shift, line, line_cnt, ele_idx ); - - ulong pair_cnt = vinyl->pair_cnt; - FD_CRIT( pair_cnt, "corruption detected" ); - vinyl->pair_cnt = pair_cnt - 1UL; - - FD_COMPILER_MFENCE(); - req_err[ batch_idx ] = (schar)FD_VINYL_SUCCESS; - FD_COMPILER_MFENCE(); - } - - comp_err = FD_VINYL_SUCCESS; - break; - } diff --git a/src/vinyl/fd_vinyl_case_fetch.c b/src/vinyl/fd_vinyl_case_fetch.c deleted file mode 100644 index 247c8a621a7..00000000000 --- a/src/vinyl/fd_vinyl_case_fetch.c +++ /dev/null @@ -1,116 +0,0 @@ - case FD_VINYL_REQ_TYPE_FETCH: { - fd_vinyl_key_t const * req_key = MAP_REQ_GADDR( req->key_gaddr, fd_vinyl_key_t, batch_cnt ); - - if( FD_UNLIKELY( (!!batch_cnt) & (!req_key) ) ) break; - - for( ulong batch_idx=0UL; batch_idxline_idx_lru, line, line_cnt, line_idx, FD_VINYL_LINE_EVICT_PRIO_MRU ); - - continue; - } - - /* At this point, pair key existsat seq_present but is not cached. - Evict the least recently used evictable line to make room to - cache this pair. Connect this line to meta element ele_idx, - set the line's reference count to zero, bump the line's version - and set the eviction priority to MRU. We don't modify any - shared fields in meta element ele_idx so we can do the - modification fast. - - We do this upfront to free data cache for the alloc if the LRU - line is in use and to handle the same pair appearing multiple - times in an acquire. - - The mechanics for fetch requests with redundant keys are - similar to acquire-for-read requests. In this case, trailing - redundant fetches will see the pair as cached (due to the first - redundant fetch ... 
this one), set the eviction priority to MRU - (again) and then continue. */ - - ulong pair_ctl = ele0[ ele_idx ].phdr.ctl; - ulong val_sz = (ulong)ele0[ ele_idx ].phdr.info.val_sz; - - FD_CRIT( fd_vinyl_bstream_ctl_type( pair_ctl )==FD_VINYL_BSTREAM_CTL_TYPE_PAIR, "corruption detected" ); - FD_CRIT( val_sz<=FD_VINYL_VAL_MAX, "corruption detected" ); - - line_idx = fd_vinyl_line_evict_lru( &vinyl->line_idx_lru, line, line_cnt, ele0, ele_max, data ); - - ulong line_ctl = line[ line_idx ].ctl; - - ulong ver = fd_vinyl_line_ctl_ver( line_ctl ); - - line[ line_idx ].ele_idx = ele_idx; ele0[ ele_idx ].line_idx = line_idx; - line[ line_idx ].ctl = fd_vinyl_line_ctl( ver+1UL, 0L ); - - fd_vinyl_line_evict_prio( &vinyl->line_idx_lru, line, line_cnt, line_idx, FD_VINYL_LINE_EVICT_PRIO_MRU ); - - /* Allocate an appropriately sized object to hold this pair, - connect it to this line and start reading the encoded pair data - into obj. */ - - ulong szc = fd_vinyl_data_szc( val_sz ); - - fd_vinyl_data_obj_t * obj = fd_vinyl_data_alloc( data, szc ); - if( FD_UNLIKELY( !obj ) ) FD_LOG_CRIT(( "increase data cache size" )); - - line[ line_idx ].obj = obj; obj->line_idx = line_idx; - - /* Start reading encoded pair data and defer the validation and - decoding to later (and then in whatever order the I/O layer - sees fit). 
*/ - - obj->rd_active = (short)1; - - int style = fd_vinyl_bstream_ctl_style( pair_ctl ); - ulong val_esz = fd_vinyl_bstream_ctl_sz ( pair_ctl ); - - FD_CRIT( val_esz<=FD_VINYL_VAL_MAX, "corruption detected" ); - FD_CRIT( (style!=FD_VINYL_BSTREAM_CTL_STYLE_RAW) | (val_sz==val_esz), "corruption detected" ); - - fd_vinyl_data_obj_t * cobj; - - if( FD_LIKELY( style==FD_VINYL_BSTREAM_CTL_STYLE_RAW ) ) cobj = obj; - else { - cobj = fd_vinyl_data_alloc( data, fd_vinyl_data_szc( val_esz ) ); - if( FD_UNLIKELY( !cobj ) ) FD_LOG_CRIT(( "increase data cache size" )); - } - - cobj->rd->ctx = (ulong)obj; - cobj->rd->seq = ele0[ ele_idx ].seq; - cobj->rd->dst = fd_vinyl_data_obj_phdr( cobj ); - cobj->rd->sz = fd_vinyl_bstream_pair_sz( val_esz ); - - cobj->rd_err = (schar *)cobj->unused; - - fd_vinyl_io_read( io, cobj->rd ); - read_cnt++; - } - - break; - } diff --git a/src/vinyl/fd_vinyl_case_flush.c b/src/vinyl/fd_vinyl_case_flush.c deleted file mode 100644 index 75512879ff6..00000000000 --- a/src/vinyl/fd_vinyl_case_flush.c +++ /dev/null @@ -1,68 +0,0 @@ - case FD_VINYL_REQ_TYPE_FLUSH: { - - fd_vinyl_key_t const * req_key = MAP_REQ_GADDR( req->key_gaddr, fd_vinyl_key_t, batch_cnt ); - - if( FD_UNLIKELY( (!!batch_cnt) & (!req_key) ) ) break; /* flushes don't generate completions */ - - for( ulong batch_idx=0UL; batch_idxline_idx==line_idx, "corruption detected" ); - FD_CRIT ( !obj->rd_active, "corruption detected" ); - - fd_vinyl_line_evict_prio( &vinyl->line_idx_lru, line, line_cnt, line_idx, FD_VINYL_LINE_EVICT_PRIO_LRU ); - - ulong ctl = line[ line_idx ].ctl; - - ulong ver = fd_vinyl_line_ctl_ver( ctl ); - long ref = fd_vinyl_line_ctl_ref( ctl ); - - if( FD_UNLIKELY( ref ) ) continue; - - /* At this point, pair key is cached, not acquired and the line - is at LRU position. Flush the cached data. We don't modify - any shared fields of meta element ele_idx so we can do this - fast. 
*/ - - line[ line_idx ].obj = NULL; - line[ line_idx ].ele_idx = ULONG_MAX; - line[ line_idx ].ctl = fd_vinyl_line_ctl( ver+1UL, 0UL ); - /* evict prio updated above */ - - ele0[ ele_idx ].line_idx = ULONG_MAX; - - fd_vinyl_data_free( data, obj ); - - } - - break; - } diff --git a/src/vinyl/fd_vinyl_case_move.c b/src/vinyl/fd_vinyl_case_move.c deleted file mode 100644 index 142a8be51ae..00000000000 --- a/src/vinyl/fd_vinyl_case_move.c +++ /dev/null @@ -1,326 +0,0 @@ - case FD_VINYL_REQ_TYPE_MOVE: { - - fd_vinyl_key_t const * req_key_src = MAP_REQ_GADDR( req->key_gaddr, fd_vinyl_key_t, batch_cnt ); - fd_vinyl_key_t const * req_key_dst = MAP_REQ_GADDR( req->val_gaddr_gaddr, fd_vinyl_key_t, batch_cnt ); - schar * req_err = MAP_REQ_GADDR( req->err_gaddr, schar, batch_cnt ); - - if( FD_UNLIKELY( (!!batch_cnt) & ((!req_key_src) | (!req_key_dst) | (!req_err)) ) ) { - comp_err = FD_VINYL_ERR_INVAL; - break; - } - - for( ulong batch_idx=0UL; batch_idxline_idx==line_idx_src, "corruption detected" ); - - phdr_src = fd_vinyl_data_obj_phdr( obj_src ); - - line[ line_idx_src ].ctl = fd_vinyl_line_ctl( ver_src+1UL, 0L ); - - } else { - - FD_CRIT( line_idx_src==ULONG_MAX, "corruption detected" ); - - /* Read the encoded pair from the bstream */ - - ulong ctl = ele0[ ele_idx_src ].phdr.ctl; - - int type = fd_vinyl_bstream_ctl_type ( ctl ); - int style = fd_vinyl_bstream_ctl_style( ctl ); - ulong val_esz = fd_vinyl_bstream_ctl_sz ( ctl ); - - FD_CRIT( type==FD_VINYL_BSTREAM_CTL_TYPE_PAIR, "corruption detected" ); - FD_CRIT( (style==FD_VINYL_BSTREAM_CTL_STYLE_RAW) | (style==FD_VINYL_BSTREAM_CTL_STYLE_LZ4), "corruption detected" ); - FD_CRIT( val_esz<=FD_VINYL_VAL_MAX, "corruption detected" ); - - fd_vinyl_data_obj_t * cobj = fd_vinyl_data_alloc( data, fd_vinyl_data_szc( val_esz ) ); - if( FD_UNLIKELY( !cobj ) ) FD_LOG_CRIT(( "increase data cache size" )); - - fd_vinyl_bstream_phdr_t * cphdr = fd_vinyl_data_obj_phdr( cobj ); - ulong cpair_sz = fd_vinyl_bstream_pair_sz( val_esz ); - - 
fd_vinyl_io_read_imm( io, seq_src, cphdr, cpair_sz ); - /* not an async read (so no read_cnt increment) */ - - /* Verify data integrity */ - - FD_ALERT( !fd_vinyl_bstream_pair_test( io_seed, seq_src, (fd_vinyl_bstream_block_t *)cphdr, cpair_sz ), - "corruption detected" ); - - /* Decode the pair */ - - if( FD_LIKELY( style==FD_VINYL_BSTREAM_CTL_STYLE_RAW ) ) { - - FD_CRIT( val_esz==val_sz, "corruption detected" ); - - obj_src = cobj; - phdr_src = cphdr; - - } else { - - obj_src = fd_vinyl_data_alloc( data, fd_vinyl_data_szc( val_sz ) ); - if( FD_UNLIKELY( !obj_src ) ) FD_LOG_CRIT(( "increase data cache size" )); - - char const * cval = (char const *)fd_vinyl_data_obj_val( cobj ); - char * val = (char *) fd_vinyl_data_obj_val( obj_src ); - if( FD_UNLIKELY( (ulong)LZ4_decompress_safe( cval, val, (int)val_esz, (int)val_sz )!=val_sz ) ) - FD_LOG_CRIT(( "LZ4_decompress_safe failed" )); - - phdr_src = fd_vinyl_data_obj_phdr( obj_src ); - - phdr_src->ctl = fd_vinyl_bstream_ctl( FD_VINYL_BSTREAM_CTL_TYPE_PAIR, FD_VINYL_BSTREAM_CTL_STYLE_RAW, val_sz ); - phdr_src->key = cphdr->key; - phdr_src->info = cphdr->info; - - fd_vinyl_data_free( data, cobj ); - - } - - line_idx_src = fd_vinyl_line_evict_lru( &vinyl->line_idx_lru, line, line_cnt, ele0, ele_max, data ); - - ulong line_ctl_src = line[ line_idx_src ].ctl; - - ulong ver_src = fd_vinyl_line_ctl_ver( line_ctl_src ); - - line[ line_idx_src ].obj = obj_src; obj_src->line_idx = line_idx_src; obj_src->rd_active = (short)0; - line[ line_idx_src ].ele_idx = ele_idx_src; ele0[ ele_idx_src ].line_idx = line_idx_src; - line[ line_idx_src ].ctl = fd_vinyl_line_ctl( ver_src+1UL, 0L ); - - fd_vinyl_line_evict_prio( &vinyl->line_idx_lru, line, line_cnt, line_idx_src, FD_VINYL_LINE_EVICT_PRIO_LRU ); - - if( line_idx_src==line_idx_dst ) line_idx_dst = ULONG_MAX; /* Handle evict_lru evicting the dst */ - - } - - /* At this point, pair key_src is cached but not acquired and pair - key_dst is not acquired. We are clear to move. 
If pair - key_dst exists, we are replacing pair key_dst with pair - key_src. In this case, we remove pair key_dst from cache and - remove pair key_dst from the meta. This remove might move the - location of pair key_src's meta element. So we reload if - necessary. */ - - FD_CRIT( fd_vinyl_bstream_ctl_type( phdr_src->ctl )==fd_vinyl_bstream_ctl_type( ele0[ ele_idx_src ].phdr.ctl ), - "corruption detected" ); - FD_CRIT( fd_vinyl_key_eq( &phdr_src->key, &ele0[ ele_idx_src ].phdr.key ), "corruption detected" ); - FD_CRIT( !memcmp( &phdr_src->info, &ele0[ ele_idx_src ].phdr.info, sizeof(fd_vinyl_info_t) ), "corruption detected" ); - - accum_garbage_cnt += 2UL; /* old src and new move block */ - accum_garbage_sz += fd_vinyl_bstream_pair_sz( fd_vinyl_bstream_ctl_sz( ele0[ ele_idx_src ].phdr.ctl ) ) + - FD_VINYL_BSTREAM_BLOCK_SZ; - - if( FD_UNLIKELY( !err_dst ) ) { - - accum_garbage_cnt++; /* old dst */ - accum_garbage_sz += fd_vinyl_bstream_pair_sz( fd_vinyl_bstream_ctl_sz( ele0[ ele_idx_dst ].phdr.ctl ) ); - - if( FD_UNLIKELY( line_idx_dst < line_cnt ) ) { - - FD_CRIT( line[ line_idx_dst ].ele_idx==ele_idx_dst, "corruption detected" ); - - fd_vinyl_data_obj_t * obj_dst = line[ line_idx_dst ].obj; - - FD_ALERT( fd_vinyl_data_is_valid_obj( obj_dst, vol, vol_cnt ), "corruption detected" ); - FD_CRIT ( obj_dst->line_idx==line_idx_dst, "corruption detected" ); - - ulong line_ctl_dst = line[ line_idx_dst ].ctl; - - ulong ver_dst = fd_vinyl_line_ctl_ver( line_ctl_dst ); - - fd_vinyl_data_free( data, obj_dst ); - - line[ line_idx_dst ].obj = NULL; - line[ line_idx_dst ].ele_idx = ULONG_MAX; // ele0[ ele_idx_dst ].line_idx = ULONG_MAX; /* Technically not necessary given below */ - line[ line_idx_dst ].ctl = fd_vinyl_line_ctl( ver_dst+1UL, 0L ); - - fd_vinyl_line_evict_prio( &vinyl->line_idx_lru, line, line_cnt, line_idx_dst, FD_VINYL_LINE_EVICT_PRIO_LRU ); - } - - fd_vinyl_meta_remove_fast( ele0, ele_max, lock, lock_shift, line, line_cnt, ele_idx_dst ); /* See note below about 
atomicity for concurrent meta readers */ - - ulong pair_cnt = vinyl->pair_cnt; - FD_CRIT( pair_cnt, "corruption detected" ); - vinyl->pair_cnt = pair_cnt - 1UL; - - err_src = fd_vinyl_meta_query_fast( ele0, ele_max, key_src, memo_src, &_ele_idx_src ); - ele_idx_src = _ele_idx_src; /* In [0,ele_max) */ - FD_CRIT( !err_src, "corruption detected" ); - /* Note: could test other fields post move too */ - - } - - /* At this point, pair key_src is cached but not acquired and pair - key_dst is not cached and not in the meta (the move block that - will official erase if it already exists will be written - below). Update the cached phdr to reflect the move. Remove - the meta entry for pair key_src and insert a meta entry for - pair key_dst. - - Note: this means from the point of view of concurrent meta - queries, there will be a brief time interval when pair key_src - and pair key_dst are both reported as not existing. - - As an alternative with more overhead we could instead insert - the meta element for key_dst, remove the meta element for - key_src and requery meta for key_dst (as the remove could move - it). In this case, there will be a gap where both key_src and - key_dst are both reported as available (and they will point to - the same cache entry during this interval). - - With even more complexity and overhead, we could eliminate the - gap and overhead and make this atomic from the point of view of - concurrent meta readers. (Would have compute a lock set that - cover the target key_dst insert location and the key_src probe - sequence assuming key_dst has been inserted, lock the locks, do - the insert, do the remove without any locking behavior, free - the lock set and then requery where key_dst ended up.) Also - note that, if we are replacing pair key_dst, at this point, - pair key_dst is already reported to concurrent meta readers as - not existing. Would need to extend this to the above. - - But it isn't clear that concurrent meta readers care at all. 
- So we go with the fast simple method below (it still is atomic - from the point of view of clients and the bstream). */ - - ulong pair_sz = fd_vinyl_bstream_pair_sz( val_sz ); - ulong seq_move = fd_vinyl_io_hint( io, FD_VINYL_BSTREAM_BLOCK_SZ + pair_sz ); - ulong seq_dst = seq_move + FD_VINYL_BSTREAM_BLOCK_SZ; - - //phdr_src->ctl = ... already init - phdr_src->key = *key_dst; - //phdr_src->info = ... already init - - fd_vinyl_meta_remove_fast( ele0, ele_max, lock, lock_shift, line, line_cnt, ele_idx_src ); - - err_dst = fd_vinyl_meta_query_fast( ele0, ele_max, key_dst, memo_dst, &_ele_idx_dst ); - ele_idx_dst = _ele_idx_dst; /* In [0,ele_max) */ - - FD_CRIT( err_dst==FD_VINYL_ERR_KEY, "corruption detected" ); - - ele0[ ele_idx_dst ].memo = memo_dst; - //ele0[ ele_idx_dst ].phdr.ctl = ... init below for concurrent safe insert - ele0[ ele_idx_dst ].phdr.key = phdr_src->key; - ele0[ ele_idx_dst ].phdr.info = phdr_src->info; - ele0[ ele_idx_dst ].line_idx = line_idx_src; - ele0[ ele_idx_dst ].seq = seq_dst; - - FD_COMPILER_MFENCE(); - ele0[ ele_idx_dst ].phdr.ctl = phdr_src->ctl; - FD_COMPILER_MFENCE(); - - line[ line_idx_src ].ele_idx = ele_idx_dst; - - fd_vinyl_io_append_move( io, phdr_src, key_dst, NULL, 0UL ); - append_cnt++; - accum_move_cnt++; - - fd_vinyl_bstream_pair_hash( io_seed, (fd_vinyl_bstream_block_t *)phdr_src ); - - ulong seq = fd_vinyl_io_append( io, phdr_src, pair_sz ); - append_cnt++; - FD_CRIT( fd_vinyl_seq_eq( seq, seq_dst ), "unexpected append location" ); - - DONE( FD_VINYL_SUCCESS ); - - next_move: /* silly language restriction */; - -# undef DONE - - } - - comp_err = FD_VINYL_SUCCESS; - break; - } diff --git a/src/vinyl/fd_vinyl_case_release.c b/src/vinyl/fd_vinyl_case_release.c deleted file mode 100644 index 12c5ae4482f..00000000000 --- a/src/vinyl/fd_vinyl_case_release.c +++ /dev/null @@ -1,352 +0,0 @@ - case FD_VINYL_REQ_TYPE_RELEASE: { - - ulong req_flags = (ulong)req->flags; - fd_vinyl_key_t const * req_key = MAP_REQ_GADDR( 
req->key_gaddr, fd_vinyl_key_t const, batch_cnt ); - ulong * req_val_gaddr = MAP_REQ_GADDR( req->val_gaddr_gaddr, ulong, batch_cnt ); - schar * req_err = MAP_REQ_GADDR( req->err_gaddr, schar, batch_cnt ); - - int req_flag_modify = fd_vinyl_req_flag_modify( req_flags ); - int req_flag_ignore = fd_vinyl_req_flag_ignore( req_flags ); - int req_flag_erase = fd_vinyl_req_flag_erase ( req_flags ); - int req_flag_by_key = fd_vinyl_req_flag_by_key( req_flags ); - int req_evict_prio = fd_vinyl_req_evict_prio ( req_flags ); - - if( FD_UNLIKELY( (!!batch_cnt) & ( ((!req_key ) & req_flag_by_key ) | - ((!req_val_gaddr) & (!req_flag_by_key)) | - ( !req_err ) ) ) ) { - comp_err = FD_VINYL_ERR_INVAL; - break; - } - - for( ulong batch_idx=0UL; batch_idxrd_active ) ) DONE( FD_VINYL_ERR_INVAL ); - - line_idx = obj->line_idx; - if( FD_UNLIKELY( line_idx>=line_cnt ) || FD_UNLIKELY( obj!=line[ line_idx ].obj ) ) DONE( FD_VINYL_ERR_INVAL ); - - ele_idx = line[ line_idx ].ele_idx; - if( FD_UNLIKELY( ele_idx>=ele_max ) || FD_UNLIKELY( ele0[ ele_idx ].line_idx!=line_idx ) ) DONE( FD_VINYL_ERR_INVAL ); - /* FIXME: MAKE SURE ELE0[ ELE_IDX ] IS IN USE FOR DATA INTEGRITY! */ - - ulong ctl = line[ line_idx ].ctl; - - ver = fd_vinyl_line_ctl_ver( ctl ); - ref = fd_vinyl_line_ctl_ref( ctl ); - - if( FD_UNLIKELY( !ref ) ) DONE( FD_VINYL_ERR_INVAL ); /* Pair key exists and is cached ... but not acquired */ - - } else { /* Release by key */ - - fd_vinyl_key_t const * key = req_key + batch_idx; - - ulong memo = fd_vinyl_key_memo( meta_seed, key ); /* This can be slow which is why releasing by val_gaddr is preferred */ - - ulong _ele_idx; /* avoid pointer escape */ - int err = fd_vinyl_meta_query_fast( ele0, ele_max, key, memo, &_ele_idx ); - ele_idx = _ele_idx; /* in [0,ele_max) */ - - if( FD_UNLIKELY( err ) ) DONE( FD_VINYL_ERR_INVAL ); /* Pair key does not exist ... 
can't have been acquired */ - - line_idx = ele0[ ele_idx ].line_idx; - - if( FD_UNLIKELY( line_idx>=line_cnt ) ) { /* Pair key exists but is not cached ... can't have been acquired */ - FD_CRIT( line_idx==ULONG_MAX, "corruption detected" ); - DONE( FD_VINYL_ERR_INVAL ); - } - - FD_CRIT( ele_idx==line[ line_idx ].ele_idx, "corruption detected" ); - - obj = line[ line_idx ].obj; - - FD_ALERT( fd_vinyl_data_is_valid_obj( obj, vol, vol_cnt ), "corruption detected" ); - FD_CRIT ( obj->line_idx==line_idx, "corruption detected" ); - FD_CRIT ( !obj->rd_active, "corruption detected" ); - - ulong ctl = line[ line_idx ].ctl; - - ver = fd_vinyl_line_ctl_ver( ctl ); - ref = fd_vinyl_line_ctl_ref( ctl ); - - if( FD_UNLIKELY( !ref ) ) DONE( FD_VINYL_ERR_INVAL ); /* Pair key exists and is cached ... but not acquired */ - - } - - /* At this point, we are releasing an acquire of the object obj, - cached at line line_idx with metadata at ele_idx. */ - - fd_vinyl_bstream_phdr_t * phdr = fd_vinyl_data_obj_phdr( obj ); - - if( FD_LIKELY( ref>0L ) ) { - - /* At this point, we are releasing an acquire for read. If - the client indicated they modified pair key, we don't have - data integrity anymore and we CRIT. Otherwise, we update - line eviction priority and ref count to do the release. 
*/ - - if( FD_UNLIKELY( req_flag_modify ) ) FD_LOG_CRIT(( "client modified read only acquire" )); - - FD_CRIT( phdr->ctl==fd_vinyl_bstream_ctl( FD_VINYL_BSTREAM_CTL_TYPE_PAIR, - FD_VINYL_BSTREAM_CTL_STYLE_RAW, - (ulong)ele0[ ele_idx ].phdr.info.val_sz ), "corruption detected" ); - FD_CRIT( fd_vinyl_key_eq( &phdr->key, &ele0[ ele_idx ].phdr.key ), "corruption detected" ); - FD_CRIT( !memcmp( &phdr->info, &ele0[ ele_idx ].phdr.info, sizeof(fd_vinyl_info_t) ), "corruption detected" ); - - fd_vinyl_line_evict_prio( &vinyl->line_idx_lru, line, line_cnt, line_idx, req_evict_prio ); - - line[ line_idx ].ctl = fd_vinyl_line_ctl( ver, ref-1L ); /* don't bump ver */ - - DONE( FD_VINYL_SUCCESS ); - } - - /* At this point, we are releasing an acquire for modify */ - - ulong phdr_ctl = phdr->ctl; - - int modifying_existing = (phdr_ctl!=ULONG_MAX); - - if( FD_LIKELY( req_flag_modify & (!req_flag_erase) ) ) { - - /* At this point, we are either finishing up modifying an - existing pair (modifying_existing 1) or finishing up creating - a new pair (modifying_existing 0). Cache the object in the - smallest size class that supports it. Note that the client - could have modified info so we only validate ctl and key - (FIXME: consider validating memo too?). 
*/ - - FD_CRIT( (!modifying_existing) | - (phdr_ctl==fd_vinyl_bstream_ctl( FD_VINYL_BSTREAM_CTL_TYPE_PAIR, - FD_VINYL_BSTREAM_CTL_STYLE_RAW, - (ulong)ele0[ ele_idx ].phdr.info.val_sz )), "corruption detected" ); - FD_CRIT( fd_vinyl_key_eq( &phdr->key, &ele0[ ele_idx ].phdr.key ), "corruption detected" ); - - ulong val_sz_after = (ulong)phdr->info.val_sz; - - if( FD_UNLIKELY( val_sz_after > fd_vinyl_data_obj_val_max( obj ) ) ) FD_LOG_CRIT(( "client overran memory" )); - - ulong szc_before = (ulong)obj->szc; - ulong szc_after = fd_vinyl_data_szc( val_sz_after ); - - if( FD_UNLIKELY( szc_before!=szc_after ) ) { - - FD_CRIT( szc_afterctl = fd_vinyl_bstream_ctl( FD_VINYL_BSTREAM_CTL_TYPE_PAIR, FD_VINYL_BSTREAM_CTL_STYLE_RAW, val_sz_after ); - /*phdr->key already init */ - /*phdr->info already init */ - - int style_after; - ulong val_esz_after; - ulong seq_after = fd_vinyl_io_append_pair_inplace( io, vinyl->style, phdr, &style_after, &val_esz_after ); - append_cnt++; - - /* Update the line and meta to match. Note that setting meta - element ele_idx phdr.ctl to something other than ULONG_MAX - marks a pair that was being created as no longer being - created. For a pair that already existed, we also need to - update phdr.ctl to reflect that we might be storing this in - the stream in a different format than it was stored in - bstream before. Since we are changing shared fields of meta - element ele_idx, we need to use prepare / publish semantics. */ - - line[ line_idx ].obj = obj; obj->line_idx = line_idx; obj->rd_active = (short)0; - //line[ line_idx ].ele_idx ... 
already init - line[ line_idx ].ctl = fd_vinyl_line_ctl( ver+1L, 0L ); /* bump ver */ - - fd_vinyl_line_evict_prio( &vinyl->line_idx_lru, line, line_cnt, line_idx, req_evict_prio ); - - fd_vinyl_meta_prepare_fast( lock, lock_shift, ele_idx ); - - //ele0[ ele_idx ].memo = already init - ele0[ ele_idx ].phdr.ctl = fd_vinyl_bstream_ctl( FD_VINYL_BSTREAM_CTL_TYPE_PAIR, style_after, val_esz_after ); - //ele0[ ele_idx ].phdr.key = already init - ele0[ ele_idx ].phdr.info = phdr->info; - ele0[ ele_idx ].seq = seq_after; - //ele0[ ele_idx ].line_idx = already init - - fd_vinyl_meta_publish_fast( lock, lock_shift, ele_idx ); - - DONE( FD_VINYL_SUCCESS ); - - } - - /* At this point, we are either canceling a modification (modify - 0, erase d/c) or the modification is to erase the pair (modify - 1, erase 1). If we are canceling the modification of an - existing pair and the client indicated the cached pair info and - cached pair val are still valid, (i.e. release-cancel of an - acquire-for-modify of an existing pair), we revert the line - state and adjust the line evict priority. (This code path can - be omitted if we don't trust the clients to report correctly. - We do test at least the client is correctly reporting the info - is not modified.) Note that we might have put this in a larged - sized obj when we acquired it for modify. So we also move the - object to the tightest location. */ - - if( FD_LIKELY( modifying_existing & (!req_flag_modify) & (!req_flag_ignore) ) ) { - - /* FIXME: consider allowing the client to always clobber the - pair info and just restore info from the meta cache? 
*/ - - if( FD_UNLIKELY( !( (phdr->ctl==fd_vinyl_bstream_ctl( FD_VINYL_BSTREAM_CTL_TYPE_PAIR, - FD_VINYL_BSTREAM_CTL_STYLE_RAW, - (ulong)ele0[ ele_idx ].phdr.info.val_sz ) ) & - (fd_vinyl_key_eq( &phdr->key, &ele0[ ele_idx ].phdr.key ) ) & - (!memcmp( &phdr->info, &ele0[ ele_idx ].phdr.info, sizeof(fd_vinyl_info_t) )) ) ) ) - FD_LOG_CRIT(( "client clobbered pair info" )); - - ulong val_sz_before = (ulong)phdr->info.val_sz; - - ulong szc_after = (ulong)obj->szc; - ulong szc_before = fd_vinyl_data_szc( val_sz_before ); - - if( FD_UNLIKELY( szc_before!=szc_after ) ) { - - FD_CRIT( szc_beforeline_idx = line_idx; obj_before->rd_active = (short)0; - - } - - line[ line_idx ].ctl = fd_vinyl_line_ctl( ver-1UL, 0L ); /* revert ver */ - - fd_vinyl_line_evict_prio( &vinyl->line_idx_lru, line, line_cnt, line_idx, req_evict_prio ); - - DONE( FD_VINYL_SUCCESS ); - - } - - /* At this point, we are canceling a modification of an existing - pair that no longer has valid cached pair info or cached pair - val, erasing an existing pair, canceling the creation of a new - pair or erasing a pair in the process of being created (which - we treat the same as cancelling the creation). - - Since there was nothing cached originally (canceling / erasing - a pair being created), the cached data is no longer valid - (cancel with ignore of an existing pair) or the the cached data - is no longer needed (erase of an existing pair), we free the - data obj, mark the line as empty, move the line to LRU - position. */ - - /* FIXME: INTEGRITY CHECKS ON PHDR HERE? 
(TRICKY AS WE'D HAVE TO - MAP OUT EXACTLY WHICH FIELDS CAN BE TRUSTED AT THIS POINT AND - IT ISN'T OBVIOUS IT MATTERS) */ - - fd_vinyl_data_free( data, obj ); - - line[ line_idx ].obj = NULL; - line[ line_idx ].ele_idx = ULONG_MAX; ele0[ ele_idx ].line_idx = ULONG_MAX; - line[ line_idx ].ctl = fd_vinyl_line_ctl( ver+1UL, 0L ); /* bump ver */ - - fd_vinyl_line_evict_prio( &vinyl->line_idx_lru, line, line_cnt, line_idx, FD_VINYL_LINE_EVICT_PRIO_LRU ); - - /* If we are erasing an existing pair, append a dead block to - the bstream. This generates two pieces of bstream garbage (the - old pair and the dead block itself). Likewise, if we are - erasing an existing pair or cancelling / erasing a pair - creation, remove the element from the meta. Note that - req_flag_modify==1 implies req_flag_erase==1 but not vice versa - at this point. */ - - if( FD_LIKELY( req_flag_modify & modifying_existing ) ) { - - ulong val_esz_before = fd_vinyl_bstream_ctl_sz( ele0[ ele_idx ].phdr.ctl ); - - accum_garbage_cnt += 2UL; - accum_garbage_sz += fd_vinyl_bstream_pair_sz( val_esz_before ) + FD_VINYL_BSTREAM_BLOCK_SZ; - - fd_vinyl_io_append_dead( io, &ele0[ ele_idx ].phdr, NULL, 0UL ); - append_cnt++; - accum_dead_cnt++; - - } - - if( FD_LIKELY( req_flag_modify | (!modifying_existing) ) ) { - fd_vinyl_meta_remove_fast( ele0, ele_max, lock, lock_shift, line, line_cnt, ele_idx ); - - ulong pair_cnt = vinyl->pair_cnt; - FD_CRIT( (0ULpair_cnt = pair_cnt - 1UL; - } - - DONE( FD_VINYL_SUCCESS ); - - next_release: /* silly language restriction */; - -# undef DONE - - } /* for batch_idx */ - - comp_err = FD_VINYL_SUCCESS; - break; - } diff --git a/src/vinyl/fd_vinyl_case_test.c b/src/vinyl/fd_vinyl_case_test.c deleted file mode 100644 index 3e03319aeea..00000000000 --- a/src/vinyl/fd_vinyl_case_test.c +++ /dev/null @@ -1,39 +0,0 @@ - case FD_VINYL_REQ_TYPE_TEST: { - - ulong const * req_val_gaddr = MAP_REQ_GADDR( req->val_gaddr_gaddr, ulong, 2UL*batch_cnt ); - schar * req_err = MAP_REQ_GADDR( 
req->err_gaddr, schar, batch_cnt ); - - if( FD_UNLIKELY( (!!batch_cnt) & ((!req_val_gaddr) | (!req_err)) ) ) { - comp_err = FD_VINYL_ERR_INVAL; - break; - } - - ulong const * req_try = req_val_gaddr + batch_cnt; - - for( ulong batch_idx=0UL; batch_idx> 32; - ulong line_idx = try & FD_VINYL_LINE_MAX; - - int err = FD_UNLIKELY( line_idx>=line_cnt ) ? FD_VINYL_ERR_INVAL - : FD_UNLIKELY( fd_vinyl_line_ctl_ver( line[ line_idx ].ctl )!=ver ) ? FD_VINYL_ERR_CORRUPT - : FD_VINYL_SUCCESS; - - FD_COMPILER_MFENCE(); - req_err[ batch_idx ] = (schar)err; - FD_COMPILER_MFENCE(); - - fail_cnt += (ulong)!!err; - - } - - comp_err = FD_VINYL_SUCCESS; - break; - } diff --git a/src/vinyl/fd_vinyl_case_try.c b/src/vinyl/fd_vinyl_case_try.c deleted file mode 100644 index 63ef050b6c7..00000000000 --- a/src/vinyl/fd_vinyl_case_try.c +++ /dev/null @@ -1,180 +0,0 @@ - case FD_VINYL_REQ_TYPE_TRY: { - - FD_STATIC_ASSERT( FD_VINYL_LINE_VER_MAX==((1UL<<32)-1UL), update_impl_for_ver_max ); - - ulong req_flags = (ulong)req->flags; - fd_vinyl_key_t const * req_key = MAP_REQ_GADDR( req->key_gaddr, fd_vinyl_key_t, batch_cnt ); - ulong * req_val_gaddr = MAP_REQ_GADDR( req->val_gaddr_gaddr, ulong, 2UL*batch_cnt ); - schar * req_err = MAP_REQ_GADDR( req->err_gaddr, schar, batch_cnt ); - - int req_evict_prio = fd_vinyl_req_evict_prio( req_flags ); - - if( FD_UNLIKELY( (!!batch_cnt) & ((!req_key) | (!req_val_gaddr) | (!req_err)) ) ) { - comp_err = FD_VINYL_ERR_INVAL; - break; - } - - ulong * req_try = req_val_gaddr + batch_cnt; - - for( ulong batch_idx=0UL; batch_idxline_idx==line_idx, "corruption detected" ); - - ulong line_ctl = line[ line_idx ].ctl; - - ulong ver = fd_vinyl_line_ctl_ver( line_ctl ); - long ref = fd_vinyl_line_ctl_ref( line_ctl ); - - if( FD_UNLIKELY( ref<0L ) ) DONE( FD_VINYL_ERR_AGAIN, ULONG_MAX ); - - if( FD_LIKELY( !obj->rd_active ) ) { - fd_vinyl_bstream_phdr_t * phdr = fd_vinyl_data_obj_phdr( obj ); - - FD_CRIT( fd_vinyl_data_obj_val_max( obj ) >= val_sz, "corruption detected" ); 
- FD_CRIT( phdr->ctl==fd_vinyl_bstream_ctl( FD_VINYL_BSTREAM_CTL_TYPE_PAIR, - FD_VINYL_BSTREAM_CTL_STYLE_RAW, val_sz ), "corruption detected" ); - FD_CRIT( fd_vinyl_key_eq( &phdr->key, key ), "corruption detected" ); - FD_CRIT( !memcmp( &phdr->info, &ele0[ ele_idx ].phdr.info, sizeof(fd_vinyl_info_t) ), "corruption detected" ); - } - - fd_vinyl_line_evict_prio( &vinyl->line_idx_lru, line, line_cnt, line_idx, req_evict_prio ); - - req_val_gaddr[ batch_idx ] = (ulong)fd_vinyl_data_obj_val( obj ) - data_laddr0; - - DONE( FD_VINYL_SUCCESS, (ver<<32) | line_idx ); - - } - - /* At this point, pair key exists but is not cached. Evict the - least recently used evictable line to make room to cache this - pair. Connect this line to meta element ele_idx, set the - line's reference count to zero, bump the line's version and set - the eviction priority as desired. We don't modify any shared - fields in meta element ele_idx so we can do the modification - fast. - - We do this upfront to free data cache for the alloc if the LRU - line is in use and to handle the same pair appearing multiple - times in an acquire. - - The mechanics for try requests with redundant keys are the same - as acquire-for-read requests. */ - - line_idx = fd_vinyl_line_evict_lru( &vinyl->line_idx_lru, line, line_cnt, ele0, ele_max, data ); - - ulong line_ctl = line[ line_idx ].ctl; - - ulong ver = fd_vinyl_line_ctl_ver( line_ctl ); - - line[ line_idx ].ele_idx = ele_idx; ele0[ ele_idx ].line_idx = line_idx; - line[ line_idx ].ctl = fd_vinyl_line_ctl( ver+1UL, 0L ); - - fd_vinyl_line_evict_prio( &vinyl->line_idx_lru, line, line_cnt, line_idx, req_evict_prio ); - - /* Allocate an appropriately sized object to hold this pair, - connect it to this line and report the location to the client. 
*/ - - ulong szc = fd_vinyl_data_szc( val_sz ); - - fd_vinyl_data_obj_t * obj = fd_vinyl_data_alloc( data, szc ); - if( FD_UNLIKELY( !obj ) ) FD_LOG_CRIT(( "increase data cache size" )); - - line[ line_idx ].obj = obj; obj->line_idx = line_idx; - - void * val = fd_vinyl_data_obj_val( obj ); - - req_val_gaddr[ batch_idx ] = (ulong)val - data_laddr0; - req_try [ batch_idx ] = ((ver+1UL)<<32) | line_idx; - - /* Start reading encoded pair data and defer validation and - decoding to later (and then in whatever order the I/O layer - sees fit). */ - - obj->rd_active = (short)1; - - int style = fd_vinyl_bstream_ctl_style( pair_ctl ); - ulong val_esz = fd_vinyl_bstream_ctl_sz ( pair_ctl ); - - FD_CRIT( val_esz<=FD_VINYL_VAL_MAX, "corruption detected" ); - FD_CRIT( (style!=FD_VINYL_BSTREAM_CTL_STYLE_RAW) | (val_sz==val_esz), "corruption detected" ); - - fd_vinyl_data_obj_t * cobj; - - if( FD_LIKELY( style==FD_VINYL_BSTREAM_CTL_STYLE_RAW ) ) cobj = obj; - else { - cobj = fd_vinyl_data_alloc( data, fd_vinyl_data_szc( val_esz ) ); - if( FD_UNLIKELY( !cobj ) ) FD_LOG_CRIT(( "increase data cache size" )); - } - - cobj->rd->ctx = (ulong)obj; - cobj->rd->seq = ele0[ ele_idx ].seq; - cobj->rd->dst = fd_vinyl_data_obj_phdr( cobj ); - cobj->rd->sz = fd_vinyl_bstream_pair_sz( val_esz ); - - cobj->rd_err = req_err + batch_idx; - - fd_vinyl_io_read( io, cobj->rd ); - read_cnt++; - - next_try: /* silly language restriction */; - -# undef DONE - - } /* for batch_idx */ - - comp_err = FD_VINYL_SUCCESS; - break; - } diff --git a/src/vinyl/fd_vinyl_compact.c b/src/vinyl/fd_vinyl_compact.c deleted file mode 100644 index 8f79695a672..00000000000 --- a/src/vinyl/fd_vinyl_compact.c +++ /dev/null @@ -1,377 +0,0 @@ -#include -#include "fd_vinyl.h" - -void -fd_vinyl_compact( fd_vinyl_t * vinyl, - ulong compact_max ) { - - fd_vinyl_io_t * io = vinyl->io; - ulong gc_thresh = vinyl->gc_thresh; - int gc_eager = vinyl->gc_eager; - int style = vinyl->style; - - ulong io_seed = fd_vinyl_io_seed ( io ); 
(void)io_seed; - ulong seq_past = fd_vinyl_io_seq_past ( io ); - ulong seq_present = fd_vinyl_io_seq_present( io ); - - if( FD_UNLIKELY( (!compact_max) | ((seq_present-seq_past)<=gc_thresh) | (gc_eager<0) ) ) return; - - fd_vinyl_meta_t * meta = vinyl->meta; - fd_vinyl_line_t * line = vinyl->line; - ulong line_cnt = vinyl->line_cnt; - ulong garbage_sz = vinyl->garbage_sz; - - fd_vinyl_meta_ele_t * ele0 = meta->ele; - ulong ele_max = meta->ele_max; - ulong meta_seed = meta->seed; - - fd_vinyl_data_t * data = vinyl->data; - - fd_vinyl_data_vol_t * vol = data->vol; (void)vol; - ulong vol_cnt = data->vol_cnt; (void)vol_cnt; - - ulong seq = seq_past; - - for( ulong rem=compact_max; rem; rem-- ) { - - /* At this point, we've compacted [seq_past,seq) (cyclic), with - items still needed in this range at [seq_present,seq_future) - (cyclic). We still have [seq,seq_present) (cyclic), containing - garbage_sz bytes to compact. - - If the new past region is small enough or there is a relatively - small amount of garbage in this region, we consider the bstream's - past fully compacted. */ - - ulong past_sz_new = fd_vinyl_io_seq_future( io ) - seq; - if( FD_UNLIKELY( (past_sz_new <= gc_thresh ) | - (garbage_sz <= (past_sz_new >> gc_eager)) | - (fd_vinyl_seq_ge( seq, seq_present ) ) ) ) { - FD_CRIT( fd_vinyl_seq_le( seq, seq_present ), "corruption detected" ); - if( FD_UNLIKELY( fd_vinyl_seq_eq( seq, seq_present ) ) ) FD_CRIT( !garbage_sz, "corruption detected" ); - break; - } - - /* At this point, there is enough garbage to do some more - compaction. Load the leading block of the object at seq and - determine if this object is needed to recover the bstream's state - at seq_present. - - That is, we determine if the object at bstream_past_new is the - version of a pair that exists at bstream seq_present. If so, we - append a copy to the bstream's present. - - When compacting is complete, we forget the region containing the - copy at seq. 
This then effectively moves the copy from seq to - seq_future without any risk of losing data while allowing - compaction to be done with large amounts of async I/O overlapped - with compaction processing (metadata lookups, hash validation, - etc). - - This move will not move the pair past any conflicting operations - later in the bstream's past (almost definitionally so as the pair - is the most recent version). Thus set of pairs recovered at - seq_future will be identical to the set of pairs recovered at - seq_present. */ - - fd_vinyl_bstream_block_t block[1]; - - fd_vinyl_io_read_imm( io, seq, block, FD_VINYL_BSTREAM_BLOCK_SZ ); - - ulong ctl = block->ctl; - - int type = fd_vinyl_bstream_ctl_type( ctl ); - - switch( type ) { - - case FD_VINYL_BSTREAM_CTL_TYPE_PAIR: { - - /* At this point, we've read a pair's leading block into block. - Validate the pair was completely written. It's okay if we are - in a move (move block processing the previous iteration already - confirmed this pair is the proper). */ - - int pair_style = fd_vinyl_bstream_ctl_style( ctl ); - ulong pair_val_esz = fd_vinyl_bstream_ctl_sz ( ctl ); - fd_vinyl_key_t const * pair_key = &block->phdr.key; - ulong pair_val_sz = (ulong)block->phdr.info.val_sz; - - ulong pair_sz = fd_vinyl_bstream_pair_sz( pair_val_esz ); - - int truncated = (pair_sz > (seq_present - seq)); /* Wrapping safe */ - int bad_esz = (pair_val_esz > FD_VINYL_VAL_MAX); - int bad_sz = (pair_val_sz > FD_VINYL_VAL_MAX); - - FD_CRIT( !(truncated | bad_esz | bad_sz), truncated ? "truncated pair" : - bad_esz ? 
"unexpected pair value encoded size" : - "pair value size too large" ); - -# if FD_PARANOID - fd_vinyl_bstream_block_t _ftr[1]; - fd_vinyl_bstream_block_t * ftr = _ftr; - - if( FD_UNLIKELY( pair_sz <= FD_VINYL_BSTREAM_BLOCK_SZ ) ) ftr = block; - else fd_vinyl_io_read_imm( io, seq + pair_sz - FD_VINYL_BSTREAM_BLOCK_SZ, ftr, FD_VINYL_BSTREAM_BLOCK_SZ ); - - FD_ALERT( !fd_vinyl_bstream_pair_test_fast( io_seed, seq, block, ftr ), "corruption detected" ); -# endif - - /* At this point, we appear to have a valid pair. Query the - vinyl's meta to determine if this is the version of the pair at - bstream seq_present. Since this implementation is doing single - threaded recovery, we can use the single threaded optimized - meta APIs. */ - - ulong pair_memo = fd_vinyl_key_memo( meta_seed, pair_key ); - - ulong _ele_idx; /* avoid pointer escape */ - int err = fd_vinyl_meta_query_fast( ele0, ele_max, pair_key, pair_memo, &_ele_idx ); - ulong ele_idx = _ele_idx; - - if( FD_LIKELY( !err ) ) { - - /* At this point, a version of pair key is mapped */ - - if( FD_LIKELY( fd_vinyl_meta_ele_in_bstream( &ele0[ ele_idx ] ) ) ) { - - /* At this point, a version of pair key exists at bstream - seq_present (i.e. is not in the process of being created by - a client). */ - - ulong pair_seq = ele0[ ele_idx ].seq; - - if( FD_LIKELY( fd_vinyl_seq_eq( pair_seq, seq ) ) ) { - - /* At this point, the version of pair key at seq is the - version of pair key that exists at bstream seq_present. - Validate the metadata. */ - - FD_CRIT( !memcmp( &ele0[ ele_idx ].phdr, &block->phdr, sizeof(fd_vinyl_bstream_phdr_t) ), "corruption detected" ); - - /* If the pair is cached and not acquired for modify, append - the cached copy in the target style. Otherwise, append a - (possibly recoded) copy from the bstream. 
*/ - - int pair_style_new; - ulong pair_val_esz_new; - ulong pair_seq_new; - - int do_copy = 1; - - ulong line_idx = ele0[ ele_idx ].line_idx; - - if( FD_LIKELY( line_idx!=ULONG_MAX ) ) { /* Pair is in cache */ - - FD_CRIT( line_idxline_idx==line_idx, "corruption detected" ); - FD_CRIT ( !obj->rd_active, "corruption detected" ); - - ulong line_ctl = line[ line_idx ].ctl; - - if( FD_LIKELY( fd_vinyl_line_ctl_ref( line_ctl )>=0L ) ) { /* Pair cached and not acquired for modify */ - - fd_vinyl_bstream_phdr_t * phdr = fd_vinyl_data_obj_phdr( obj ); - - FD_ALERT( !memcmp( phdr, &block->phdr, sizeof(fd_vinyl_bstream_phdr_t) ), "corruption detected" ); - - pair_seq_new = fd_vinyl_io_append_pair_inplace( io, style, phdr, &pair_style_new, &pair_val_esz_new ); - - do_copy = 0; - - } - - } - - if( do_copy ) { /* Pair is either in cache or acquired for modify, append from the bstream */ - - if( FD_LIKELY( (pair_style!=FD_VINYL_BSTREAM_CTL_STYLE_RAW) | - (style ==FD_VINYL_BSTREAM_CTL_STYLE_RAW) | - (pair_sz ==FD_VINYL_BSTREAM_BLOCK_SZ ) ) ) { - - /* At this point, the pair is already stored in an - encoded format, the preferred format for storing - encoded pairs is raw and/or encoding the pair will - not make it any smaller in the bstream. Copy the - pair as is from seq to seq_future. The reason we - don't reencode the pair in the second case is that - this pair has likely not been touched since it last - got to the bstream's seq_past. It would be waste to - compute and bstream storage to uncompress it as we - copy it. */ - - pair_style_new = pair_style; - pair_val_esz_new = fd_vinyl_bstream_ctl_sz( ele0[ ele_idx ].phdr.ctl ); - pair_seq_new = fd_vinyl_io_copy( io, pair_seq, pair_sz ); - - } else { - - /* At this point, the pair is stored in a raw encoded - format, the preferred format is an encoded format and - there is a possibility that encoding it will make it - smaller. Encode the pair as we copy it from seq to - seq_future. 
- - To do this, we allocate enough scratch from the io - append spad to cover the worst case encoded pair and - the raw pair (this sets the lower bound on how large - the io append spad must be). Then we read the raw - pair into the trailing part of the scratch and encode - from that into the leading part of the scratch. - - We play some games with the spad_used so that the - append_pair_inplace will not invalidate the read and - so that we use scratch as efficiently as possible - when there is lots of stuff to compress. */ - - ulong cpair_max = fd_vinyl_bstream_pair_sz( (ulong)LZ4_COMPRESSBOUND( (int)pair_val_sz ) ); - ulong scratch_max = cpair_max + pair_sz; - - fd_vinyl_bstream_phdr_t * cphdr = (fd_vinyl_bstream_phdr_t *) - fd_vinyl_io_alloc( io, scratch_max, FD_VINYL_IO_FLAG_BLOCKING ); - - fd_vinyl_bstream_phdr_t * phdr = (fd_vinyl_bstream_phdr_t *)((ulong)cphdr + cpair_max); - - fd_vinyl_io_read_imm( io, seq, phdr, pair_sz ); - - fd_vinyl_io_trim( io, scratch_max ); - - pair_seq_new = fd_vinyl_io_append_pair_inplace( io, style, phdr, &pair_style_new, &pair_val_esz_new ); - - /* At this point, we either are appending the encoded - pair from the leading part of the scratch and - spad_used is correct or we are appending the pair - from the trailing part and spad_used does not include - it. Adjust the spad used for the later case. In - this second case, we end up with a temporary hole in - the scratch when we decided not to copy into an - encoded form. This just scratch is used less - efficiently in the unlikely case in order to use it - more efficiently in the likely case (the correct - tradeoff). */ - - if( FD_UNLIKELY( pair_style_new==FD_VINYL_BSTREAM_CTL_STYLE_RAW ) ) io->spad_used += scratch_max; - - } - } - - /* Note: we don't need to prepare here because we aren't - modifying shared fields. 
*/ - - ele0[ ele_idx ].phdr.ctl = fd_vinyl_bstream_ctl( FD_VINYL_BSTREAM_CTL_TYPE_PAIR, pair_style_new, pair_val_esz_new ); - ele0[ ele_idx ].seq = pair_seq_new; - - } else { - - /* The version of the pair at bstream seq was replaced. The - most recent version of this pair is at pair_seq. */ - - FD_CRIT( fd_vinyl_seq_gt( pair_seq, seq ), "corruption detected" ); - - garbage_sz -= pair_sz; - - } - - } else { - - /* The pair at bstream seq does not exist in the bstream at - bstream seq_present. It is in the vinyl meta because it is - being created. We wouldn't be in the process of creating - it unless this pair (or a subsequent version of it) was - erased or moved before seq_present. So this pair is - garbage. */ - - garbage_sz -= pair_sz; - - } - - } else { - - /* The pair at bstream seq does not exist in the bstream at - bstream seq_present. This pair (or a subsequent version of - it) was erased or moved before seq_present. So this pair - is garbage. */ - - garbage_sz -= pair_sz; - - } - - seq += pair_sz; - break; - - } - - case FD_VINYL_BSTREAM_CTL_TYPE_DEAD: - case FD_VINYL_BSTREAM_CTL_TYPE_MOVE: - case FD_VINYL_BSTREAM_CTL_TYPE_PART: { - - /* DEAD blocks can always be compacted out because the version of - the pair they reference is not in the current view of the - bstream (because that version was unmapped when the DEAD was - written), that version was located at an earlier location than - the DEAD (because blocks are appended sequentially) and thus - that version has already been compacted out (because a previous - iteration of this would have encountered it before getting this - DEAD block, would have detecting that version was no longer - needed and compacted it at that time instead of moving it to a - higher sequence number). - - MOVE blocks can always be compacted out for the same reasons as - the above with the twist that, compacting the move block makes - the pair following look like a create from the point of view of - a recovery starting at the pair. 
This is immaterial though - because doesn't change the recovered view if recovery starts - on the block after the move. - - PART blocks can always be compacted because they are just - informational (to help partition the bstream past in parallel - recovery) and this partition ends bstream blocks that have - already been compacted out. - - We validate the block because we already have the data anyway. */ - - FD_ALERT( !fd_vinyl_bstream_block_test( io_seed, block ), "corruption detected" ); - - garbage_sz -= FD_VINYL_BSTREAM_BLOCK_SZ; - seq += FD_VINYL_BSTREAM_BLOCK_SZ; - break; - - } - - case FD_VINYL_BSTREAM_CTL_TYPE_ZPAD: { - - /* ZPAD blocks can always be compacted out because they are no-ops - from the point of view of bstream processing (the underlying - I/O layer can insert these so that, for example, a multi-block - pair is never split across two different physical volumes). - Note that zpad blocks aren't included in garbage_sz because we - don't control when they get created (and thus can't easily - update garbage_sz to account for them when they are created). */ - - FD_ALERT( !fd_vinyl_bstream_zpad_test( io_seed, seq, block ), "corruption detected" ); - - seq += FD_VINYL_BSTREAM_BLOCK_SZ; - break; - - } - - default: FD_LOG_CRIT(( "%016lx: unknown type (%x)", seq, (uint)type )); - - } - - } - - /* At this point, we've made copies of all info in [seq_past,seq) - (cyclic) to [seq_present,seq_future) (cyclic) needed to recover the - bstream's state at seq_present. We commit the new, forget the old - and update the garbage size to finish this compaction. 
*/ - - fd_vinyl_io_commit( io, FD_VINYL_IO_FLAG_BLOCKING ); - fd_vinyl_io_forget( io, seq ); - - vinyl->garbage_sz = garbage_sz; -} diff --git a/src/vinyl/fd_vinyl_ctl.c b/src/vinyl/fd_vinyl_ctl.c deleted file mode 100644 index 3a5ade25bf8..00000000000 --- a/src/vinyl/fd_vinyl_ctl.c +++ /dev/null @@ -1,712 +0,0 @@ -/* For O_DIRECT and O_NOATIME */ -#define _GNU_SOURCE - -#include "fd_vinyl.h" -#include "../util/pod/fd_pod.h" - -#include -#include -#include -#include -#include - -FD_IMPORT_CSTR( fd_vinyl_ctl_help, "src/vinyl/fd_vinyl_ctl_help" ); - -static int -fd_vinyl_main( int argc, - char ** argv ) { - - ulong seed_default = fd_cstr_hash_append( (ulong)fd_log_wallclock(), fd_log_host() ); - - char const * _pod = fd_env_strip_cmdline_cstr ( &argc, &argv, "--pod", NULL, NULL ); - char const * _cfg = fd_env_strip_cmdline_cstr ( &argc, &argv, "--cfg", NULL, NULL ); - ulong seed = fd_env_strip_cmdline_ulong( &argc, &argv, "--seed", NULL, seed_default ); - char const * type = fd_env_strip_cmdline_cstr ( &argc, &argv, "--type", NULL, "mm" ); - char const * path = fd_env_strip_cmdline_cstr ( &argc, &argv, "--path", NULL, NULL ); - int dsync = fd_env_strip_cmdline_int ( &argc, &argv, "--dsync", NULL, 0 ); - int direct = fd_env_strip_cmdline_int ( &argc, &argv, "--direct", NULL, 0 ); - int noatime = fd_env_strip_cmdline_int ( &argc, &argv, "--noatime", NULL, 0 ); - char const * _page_sz = fd_env_strip_cmdline_cstr ( &argc, &argv, "--page-sz", NULL, "gigantic" ); - ulong page_cnt = fd_env_strip_cmdline_ulong( &argc, &argv, "--page-cnt", NULL, 1UL ); - ulong near_cpu = fd_env_strip_cmdline_ulong( &argc, &argv, "--near-cpu", NULL, fd_log_cpu_id() ); - int reset = fd_env_strip_cmdline_int ( &argc, &argv, "--reset", NULL, 0 ); - char const * info = fd_env_strip_cmdline_cstr ( &argc, &argv, "--info", NULL, NULL ); - ulong io_seed = fd_env_strip_cmdline_ulong( &argc, &argv, "--io-seed", NULL, 0UL ); - - int open_flags = O_RDWR | (dsync ? O_DSYNC : 0 ) | (direct ? 
O_DIRECT : 0) | (noatime ? O_NOATIME : 0); - ulong page_sz = fd_cstr_to_shmem_page_sz( _page_sz ); - ulong info_sz = info ? (strlen( info )+1UL) : 0UL; - - if( FD_UNLIKELY( !_pod ) ) FD_LOG_ERR(( "--pod not specified" )); - if( FD_UNLIKELY( !page_sz ) ) FD_LOG_ERR(( "bad --page-sz" )); - - FD_LOG_NOTICE(( "Attaching to --pod %s", _pod )); - - uchar const * pod = fd_wksp_pod_attach( _pod ); /* logs details, guaranteed to succeed */ - uchar const * cfg; - if( FD_UNLIKELY( !_cfg ) ) { - FD_LOG_NOTICE(( "--cfg not specified (using pod root for config)" )); - cfg = pod; - } else { - FD_LOG_NOTICE(( "Finding config --cfg %s", _cfg )); - cfg = fd_pod_query_subpod( pod, _cfg ); - if( FD_UNLIKELY( !cfg ) ) FD_LOG_ERR(( "config not found" )); - } - - FD_LOG_NOTICE(( "Extracting pod configuration" )); - - /* See below for explanation of defaults */ - ulong spad_max = fd_pod_query_ulong( cfg, "spad_max", fd_vinyl_io_spad_est() ); - ulong async_min = fd_pod_query_ulong( cfg, "async_min", 2UL ); - ulong async_max = fd_pod_query_ulong( cfg, "async_max", 2UL*async_min ); - ulong part_thresh = fd_pod_query_ulong( cfg, "part_thresh", 1UL<<30 ); - ulong gc_thresh = fd_pod_query_ulong( cfg, "gc_thresh", 8UL<<30 ); - int gc_eager = fd_pod_query_int ( cfg, "gc_eager", 2 ); - int style = fd_pod_query_int ( cfg, "style", FD_VINYL_BSTREAM_CTL_STYLE_LZ4 ); - int level = fd_pod_query_int ( cfg, "level", 1 ); - - FD_LOG_NOTICE(( "Processing command line configuration overrides" )); - - char const * _style = fd_env_strip_cmdline_cstr( &argc, &argv, "--style", NULL, NULL ); - if( _style ) style = fd_cstr_to_vinyl_bstream_ctl_style( _style ); - - spad_max = fd_env_strip_cmdline_ulong( &argc, &argv, "--spad-max", NULL, spad_max ); - async_min = fd_env_strip_cmdline_ulong( &argc, &argv, "--async-min", NULL, async_min ); - async_max = fd_env_strip_cmdline_ulong( &argc, &argv, "--async-max", NULL, async_max ); - part_thresh = fd_env_strip_cmdline_ulong( &argc, &argv, "--part-thresh", NULL, 
part_thresh ); - gc_thresh = fd_env_strip_cmdline_ulong( &argc, &argv, "--gc-thresh", NULL, gc_thresh ); - gc_eager = fd_env_strip_cmdline_int ( &argc, &argv, "--gc-eager", NULL, gc_eager ); - level = fd_env_strip_cmdline_int ( &argc, &argv, "--level", NULL, level ); - - FD_LOG_NOTICE(( "Mapping vinyl memory regions" )); - - void * _vinyl = fd_wksp_pod_map( cfg, "vinyl" ); ulong vinyl_footprint = fd_pod_query_ulong( cfg, "vinyl_footprint", 0UL ); - void * _cnc = fd_wksp_pod_map( cfg, "cnc" ); ulong cnc_footprint = fd_pod_query_ulong( cfg, "cnc_footprint", 0UL ); - void * _meta = fd_wksp_pod_map( cfg, "meta" ); ulong meta_footprint = fd_pod_query_ulong( cfg, "meta_footprint", 0UL ); - void * _line = fd_wksp_pod_map( cfg, "line" ); ulong line_footprint = fd_pod_query_ulong( cfg, "line_footprint", 0UL ); - void * _io = fd_wksp_pod_map( cfg, "io" ); ulong io_footprint = fd_pod_query_ulong( cfg, "io_footprint", 0UL ); - void * _ele = fd_wksp_pod_map( cfg, "ele" ); ulong ele_footprint = fd_pod_query_ulong( cfg, "ele_footprint", 0UL ); - void * _obj = fd_wksp_pod_map( cfg, "obj" ); ulong obj_footprint = fd_pod_query_ulong( cfg, "obj_footprint", 0UL ); - -# define TEST( c, msg ) do { \ - if( FD_UNLIKELY( !(c) ) ) FD_LOG_ERR(( "FAIL: %s (%s)", #c, (msg) )); \ - } while(0) - - fd_wksp_t * wksp = fd_wksp_containing( _obj ); - TEST( wksp, "fd_wksp_containing failed" ); - - TEST( fd_ulong_is_aligned( (ulong)_vinyl, fd_vinyl_io_mm_align() ), "bad alloc" ); - TEST( vinyl_footprint >= fd_vinyl_footprint(), "bad alloc" ); - - int is_mmio = !strcmp( type, "mm" ); - - FD_LOG_NOTICE(( "io config" - "\n\t--type \"%s\"" - "\n\t--spad-max %lu bytes" - "\n\t--path \"%s\"" - "\n\t--dsync %i" - "\n\t--direct %i" - "\n\t--noatime %i" - "\n\t--page-sz \"%s\"%s" - "\n\t--page-cnt %lu pages%s" - "\n\t--near-cpu %lu%s" - "\n\t--reset %i" - "\n\t--info \"%s\" (info_sz %lu bytes%s)" - "\n\t--io-seed 0x%016lx%s", - type, spad_max, path ? 
path : "(null)", dsync, direct, noatime, - _page_sz, is_mmio && !path ? "" : " (ignored)", - page_cnt, is_mmio && !path ? "" : " (ignored)", - near_cpu, is_mmio && !path ? "" : " (ignored)", - reset, info ? info : "(null)", info_sz, reset ? "" : ", ignored", io_seed, reset ? "" : " (ignored)" )); - - FD_LOG_NOTICE(( "Joining bstream" )); - - int bstream_type; - int fd = -1; - void * mmio; - ulong mmio_sz; - - fd_vinyl_io_t * io; - - if( FD_LIKELY( is_mmio ) ) { - - if( FD_LIKELY( path ) ) { - - fd = open( path, open_flags, (mode_t)0 ); - - if( FD_LIKELY( fd!=-1 ) ) { /* --path seems to be file (e.g. testing or basic I/O with weak persistence) */ - - TEST( !direct, "--direct 1 not supported with --type mm and file --path" ); - /* FIXME: is dsync valid for mmio? (unclear) noatime? (probably) */ - - FD_LOG_NOTICE(( "Using file at --path as a memory mapped bstream" )); - - bstream_type = 0; - - int err = fd_io_mmio_init( fd, FD_IO_MMIO_MODE_READ_WRITE, &mmio, &mmio_sz ); - if( FD_UNLIKELY( err ) ) FD_LOG_ERR(( "fd_io_mmio_init failed (%i-%s)", err, fd_io_strerror( err ) )); - - } else { /* --path doesn't seem to be a file, use shmem (e.g. testing or ultra HPC with weak persistence) */ - - FD_LOG_NOTICE(( "Using shmem region at --path as a memory mapped bstream (ignoring --dsync, --direct and --noatime)" )); - - bstream_type = 1; - - fd_shmem_join_info_t info[1]; - mmio = fd_shmem_join( path, FD_SHMEM_JOIN_MODE_READ_WRITE, 0, NULL, NULL, info ); - TEST( mmio, "fd_shmem_join failed" ); - mmio_sz = info->page_sz * info->page_cnt; - - } - - } else { /* No --path, use an anonymous region (e.g. 
testing or ultra HPC with no persistence) */ - - FD_LOG_NOTICE(( "Using an anonymous shmem region as a memory mapped bstream " - "(ignoring --dsync, --direct and --noatime, setting --reset to 1)" )); - - bstream_type = 2; - reset = 1; - - mmio = fd_shmem_acquire( page_sz, page_cnt, near_cpu ); - TEST( mmio, "fd_shmem_acquire failed" ); - mmio_sz = page_sz*page_cnt; - - } - - TEST( fd_ulong_is_aligned( (ulong)_io, fd_vinyl_io_mm_align() ), "bad alloc" ); - TEST( io_footprint >= fd_vinyl_io_mm_footprint( spad_max ), "bad alloc" ); - - io = fd_vinyl_io_mm_init( _io, spad_max, mmio, mmio_sz, reset, info, info_sz, io_seed ); - TEST( io, "fd_vinyl_io mm_init failed" ); - - } else if( !strcmp( type, "bd" ) ) { - - if( FD_VINYL_BSTREAM_BLOCK_SZ<512UL ) TEST( !direct, "--direct 1 not supported with --type bd and BLOCK_SZ<512" ); - - TEST( path, "--path not specified for --type bd" ); - - FD_LOG_NOTICE(( "Using --path as a block device bstream" )); - - bstream_type = 3; - - fd = open( path, open_flags, 0 ); - if( FD_UNLIKELY( fd==-1 ) ) FD_LOG_ERR(( "open failed (%i-%s)", errno, fd_io_strerror( errno ) )); - - TEST( fd_ulong_is_aligned( (ulong)_io, fd_vinyl_io_bd_align() ), "bad wksp alloc" ); - TEST( io_footprint >= fd_vinyl_io_bd_footprint( spad_max ), "bad wksp alloc" ); - - io = fd_vinyl_io_bd_init( _io, spad_max, fd, reset, info, info_sz, io_seed ); - TEST( io, "fd_vinyl_io bd_init failed" ); - - } else { - - FD_LOG_ERR(( "Unsupported io type" )); - - } - - FD_LOG_NOTICE(( "Creating vinyl" )); - - fd_tpool_t * tpool = NULL; - - ulong thread_cnt = fd_tile_cnt(); - - if( thread_cnt>1UL ) { - FD_LOG_NOTICE(( "Creating temporary tpool from all %lu tiles for thread parallel init", thread_cnt )); - - static uchar _tpool[ FD_TPOOL_FOOTPRINT( FD_TILE_MAX ) ] __attribute__((aligned(FD_TPOOL_ALIGN))); - - tpool = fd_tpool_init( _tpool, thread_cnt, 0UL ); /* logs details */ - if( FD_UNLIKELY( !tpool ) ) FD_LOG_ERR(( "fd_tpool_init failed" )); - - for( ulong thread_idx=1UL; 
thread_idx~ 8 GiB used */ - int gc_eager = 2; /* target <~25% garbage items */ - int style = FD_VINYL_BSTREAM_CTL_STYLE_LZ4; /* enable data compression */ - int level = 1; /* do a hard reset by default */ - ulong obj_footprint_avg = 2UL*FD_VINYL_BSTREAM_BLOCK_SZ + 8UL + 1UL; /* see note above */ - - int err = 0; - int cnt = 0; - - while( argc ) { - char const * cmd = argv[0]; - SHIFT(1); - - if( !strcmp( cmd, "help" ) ) { - - fflush( stdout ); fflush( stderr ); - fputs( fd_vinyl_ctl_help, stdout ); - fflush( stdout ); fflush( stderr ); - - FD_LOG_NOTICE(( "%i: %s: success", cnt, cmd )); - - } else if( !strcmp( cmd, "set" ) ) { - - if( FD_UNLIKELY( argc<2 ) ) FD_LOG_ERR(( "%i: %s: too few arguments\n\tDo %s help for help", cnt, cmd, bin )); - - char const * key = argv[0]; - char const * val = argv[1]; - - /**/ if( !strcmp( key, "wksp_tag" ) ) wksp_tag = fd_cstr_to_ulong ( val ); - else if( !strcmp( key, "pod_max" ) ) pod_max = fd_cstr_to_ulong ( val ); - else if( !strcmp( key, "cfg_path" ) ) cfg_path = val; - else if( !strcmp( key, "cnc_app_sz" ) ) cnc_app_sz = fd_cstr_to_ulong ( val ); - else if( !strcmp( key, "spad_max" ) ) spad_max = fd_cstr_to_ulong ( val ); - else if( !strcmp( key, "async_min" ) ) async_min = fd_cstr_to_ulong ( val ); - else if( !strcmp( key, "async_max" ) ) async_max = fd_cstr_to_ulong ( val ); - else if( !strcmp( key, "part_thresh" ) ) part_thresh = fd_cstr_to_ulong ( val ); - else if( !strcmp( key, "gc_thresh" ) ) gc_thresh = fd_cstr_to_ulong ( val ); - else if( !strcmp( key, "gc_eager" ) ) gc_eager = fd_cstr_to_int ( val ); - else if( !strcmp( key, "style" ) ) style = fd_cstr_to_vinyl_bstream_ctl_style( val ); - else if( !strcmp( key, "level" ) ) level = fd_cstr_to_int ( val ); - else if( !strcmp( key, "obj_footprint_avg" ) ) obj_footprint_avg = fd_cstr_to_ulong ( val ); - else FD_LOG_ERR(( "%i: %s %s %s: unknown key", cnt, cmd, key, val)); - - FD_LOG_NOTICE(( "%i: %s %s %s: success", cnt, cmd, key, val )); - SHIFT(2); - - } else if( 
!strcmp( cmd, "alloc-memory" ) ) { - - if( FD_UNLIKELY( argc<5 ) ) FD_LOG_ERR(( "%i: %s: too few arguments\n\tDo %s help for help", cnt, cmd, bin )); - - char const * mem = argv[0]; - ulong page_cnt = fd_cstr_to_ulong ( argv[1] ); - ulong page_sz = fd_cstr_to_shmem_page_sz( argv[2] ); - char const * seq = argv[3]; - ulong mode = fd_cstr_to_ulong_octal ( argv[4] ); - - if( FD_UNLIKELY( !page_cnt ) ) - FD_LOG_ERR(( "%i: %s %s %lu %s %s 0%03lo: bad page count\n\t" - "Do %s help for help", cnt, cmd, mem, page_cnt, argv[2], seq, mode, bin )); - - if( FD_UNLIKELY( !page_sz ) ) - FD_LOG_ERR(( "%i: %s %s %lu %s %s 0%03lo: bad page size\n\t" - "Do %s help for help", cnt, cmd, mem, page_cnt, argv[2], seq, mode, bin )); - - /* Partition the pages over the seq */ - - ulong sub_page_cnt[ 512UL ]; - ulong sub_cpu_idx [ 512UL ]; - ulong sub_cnt = fd_cstr_to_ulong_seq( seq, sub_cpu_idx, 512UL ); - - if( FD_UNLIKELY( !sub_cnt ) ) - FD_LOG_ERR(( "%i: %s %s %lu %s %s 0%03lo: empty or invalid cpu sequence\n\t" - "Do %s help for help", cnt, cmd, mem, page_cnt, argv[2], seq, mode, bin )); - - if( FD_UNLIKELY( sub_cnt>512UL ) ) - FD_LOG_ERR(( "%i: %s %s %lu %s %s 0%03lo: sequence too long, increase limit in fd_vinyl_ctl.c\n\t" - "Do %s help for help", cnt, cmd, mem, page_cnt, argv[2], seq, mode, bin )); - - /* TODO: consider striping instead of blocking */ - - ulong sub_page_min = page_cnt / sub_cnt; - ulong sub_page_rem = page_cnt % sub_cnt; - for( ulong sub_idx=0UL; sub_idx(1UL<<32)) ) ) - FD_LOG_ERR(( "%i: %s %s %lu 0%03lo: bad number of gigabytes\n\t" - "Do %s help for help", cnt, cmd, path, GiB_cnt, mode, bin )); - - ulong sz = GiB_cnt << 30; - - int fd = open( path, O_RDWR | O_CREAT | O_EXCL, (mode_t)mode ); - if( FD_UNLIKELY( fd==-1 ) ) - FD_LOG_ERR(( "%i: %s %s %lu 0%03lo: open failed (%i-%s)\n\tDo %s help for help", - cnt, cmd, path, GiB_cnt, mode, errno, fd_io_strerror( errno ), bin )); - - int err = fd_io_truncate( fd, sz ); - if( FD_UNLIKELY( err ) ) - FD_LOG_ERR(( "%i: %s %s 
%lu 0%03lo: fd_io_truncate failed (%i-%s)\n\tDo %s help for help", - cnt, cmd, path, GiB_cnt, mode, err, fd_io_strerror( err ), bin )); - - if( FD_UNLIKELY( close( fd ) ) ) - FD_LOG_WARNING(( "%i: %s %s %lu 0%03lo: close failed (%i-%s); attempting to continue", - cnt, cmd, path, GiB_cnt, mode, errno, fd_io_strerror( errno ) )); - - FD_LOG_NOTICE(( "%i: %s %s %lu 0%03lo: success", cnt, cmd, path, GiB_cnt, mode )); - SHIFT(3); - - } else if( !strcmp( cmd, "free-storage" ) ) { - - if( FD_UNLIKELY( argc<1 ) ) - FD_LOG_ERR(( "%i: %s: too few arguments\n\tDo %s help for help", cnt, cmd, bin )); - - char const * store = argv[0]; - - if( FD_UNLIKELY( unlink( store ) ) ) - FD_LOG_ERR(( "%i: %s %s: unlink failed (%i-%s)\n\tDo %s help for help", - cnt, cmd, store, errno, fd_io_strerror( errno ), bin )); - - FD_LOG_NOTICE(( "%i: %s %s: success", cnt, cmd, store )); - SHIFT(1); - - } else if( !strcmp( cmd, "new" ) ) { - - if( FD_UNLIKELY( argc<3 ) ) - FD_LOG_ERR(( "%i: %s: too few arguments\n\tDo %s help for help", cnt, cmd, bin )); - - char const * mem = argv[0]; - ulong pair_max = fd_cstr_to_ulong( argv[1] ); - ulong GiB_max = fd_cstr_to_ulong( argv[2] ); - -# define TEST( c, msg ) do { \ - if( FD_UNLIKELY( !(c) ) ) \ - FD_LOG_ERR(( "%i: %s %s %lu %lu: FAIL %s (%s)\n\tDo %s help for help", \ - cnt, cmd, mem, pair_max, GiB_max, #c, (msg), bin )); \ - } while(0) - - ulong ele_max = fd_ulong_pow2_up( pair_max + 1UL ); - ulong lock_cnt = fd_vinyl_meta_lock_cnt_est( ele_max ); - ulong probe_max = ele_max; - - TEST( (0UL=3UL, "increase maximum GiB allowed and/or decrease pair_max / spad_max / pod_max / cnc_app_sz" ); - - ulong line_footprint = sizeof(fd_vinyl_line_t)*line_max; - - mem_req += line_footprint; - - ulong obj_footprint = fd_ulong_align_dn( mem_max - mem_req, alignof(fd_vinyl_data_obj_t) ); - - mem_req += obj_footprint; - - TEST( mem_req<=mem_max, "internal error" ); - - /* Attach to the memory that will contain this vinyl instance */ - - fd_wksp_t * wksp = 
fd_wksp_attach( mem ); - TEST( wksp, "fd_wksp_attach failed" ); - - /* Allocate all the needed regions. Note that, even though the - vinyl io tile state is neither shared nor persistent, we - allocate it here so the vinyl tile itself doesn't have to - allocate it (it is dynamically sized and rather large). Since - we want the vinyl tile to be able to pick the type of io - interface and bstream store at startup without creating a new - vinyl instance, we allocated an upper bound for all supported - io types above (they are all roughly the same size anyway). - - Alternatively, we could have the vinyl tile do this allocation - at tile startup. But this would create some additional - complexity: the vinyl tile would need an allocator (and then - one potentially has allocations left over from previous runs - that did not terminate cleanly). - - Similar considerations apply for the data cache state, vinyl - tile state, lines and data objects. - - Note also that, though meta is shared and persistent, - persistence should only be used for post mortem debugging (the - meta cache is recreated from scratch on vinyl tile startup). */ - - void * _pod = fd_wksp_alloc_laddr( wksp, pod_align, pod_footprint, wksp_tag ); - void * _vinyl = fd_wksp_alloc_laddr( wksp, vinyl_align, vinyl_footprint, wksp_tag ); - void * _cnc = fd_wksp_alloc_laddr( wksp, cnc_align, cnc_footprint, wksp_tag ); - void * _meta = fd_wksp_alloc_laddr( wksp, meta_align, meta_footprint, wksp_tag ); - void * _io = fd_wksp_alloc_laddr( wksp, io_align, io_footprint, wksp_tag ); - void * _line = fd_wksp_alloc_laddr( wksp, line_align, line_footprint, wksp_tag ); /* This is kinda big */ - void * _ele = fd_wksp_alloc_laddr( wksp, ele_align, ele_footprint, wksp_tag ); /* This is really big */ - void * _obj = fd_wksp_alloc_laddr( wksp, obj_align, obj_footprint, wksp_tag ); - - /* Note: the bigger obj gets, the better the performance (until it - is large enough pairs always fit in cache but that would dwarf - ele). 
In typical use cases, this is probably smaller to - comparable to ele (resulting in much cheaper hardware at - comparable speeds for typical usage patterns but less robust - performance for extreme usage patterns). */ - - TEST( (!!_pod) & (!!_vinyl) & (!!_cnc) & (!!_io) & (!!_line) & (!!_ele) & (!!_obj), - "fd_wksp_alloc_laddr failed (free unneeded allocs or increase wksp size or partitions)" ); - - /* Format and the join the pod and create the cfg subpod as - necessary. */ - - uchar * pod = fd_pod_join( fd_pod_new( _pod, pod_max ) ); - TEST( pod, "internal error" ); - - uchar * cfg; - if( !cfg_path ) cfg = pod; - else { - ulong off = fd_pod_alloc_subpod( pod, cfg_path, 1024UL ); - TEST( off, "use shorter cfg_path or increase pod_max?" ); - cfg = pod + off; - } - - /* Populate the pod */ - - char tmp[ FD_WKSP_CSTR_MAX ]; - - TEST( fd_pod_insert_cstr( cfg, "vinyl", fd_wksp_cstr_laddr( _vinyl, tmp ) ), "increase pod_max?" ); - TEST( fd_pod_insert_cstr( cfg, "cnc", fd_wksp_cstr_laddr( _cnc, tmp ) ), "increase pod_max?" ); - TEST( fd_pod_insert_cstr( cfg, "meta", fd_wksp_cstr_laddr( _meta, tmp ) ), "increase pod_max?" ); - TEST( fd_pod_insert_cstr( cfg, "io", fd_wksp_cstr_laddr( _io, tmp ) ), "increase pod_max?" ); - TEST( fd_pod_insert_cstr( cfg, "line", fd_wksp_cstr_laddr( _line, tmp ) ), "increase pod_max?" ); - TEST( fd_pod_insert_cstr( cfg, "ele", fd_wksp_cstr_laddr( _ele, tmp ) ), "increase pod_max?" ); - TEST( fd_pod_insert_cstr( cfg, "obj", fd_wksp_cstr_laddr( _obj, tmp ) ), "increase pod_max?" ); - - TEST( fd_pod_insert_ulong( cfg, "vinyl_footprint", vinyl_footprint ), "increase pod_max?" ); - TEST( fd_pod_insert_ulong( cfg, "cnc_footprint", cnc_footprint ), "increase pod_max?" ); - TEST( fd_pod_insert_ulong( cfg, "meta_footprint", meta_footprint ), "increase pod_max?" ); - TEST( fd_pod_insert_ulong( cfg, "io_footprint", io_footprint ), "increase pod_max?" ); - TEST( fd_pod_insert_ulong( cfg, "line_footprint", line_footprint ), "increase pod_max?" 
); - TEST( fd_pod_insert_ulong( cfg, "ele_footprint", ele_footprint ), "increase pod_max?" ); - TEST( fd_pod_insert_ulong( cfg, "obj_footprint", obj_footprint ), "increase pod_max?" ); - - TEST( fd_pod_insert_ulong( cfg, "spad_max", spad_max ), "increase pod_max?" ); - TEST( fd_pod_insert_ulong( cfg, "pair_max", pair_max ), "increase pod_max?" ); - TEST( fd_pod_insert_ulong( cfg, "line_max", line_max ), "increase pod_max?" ); - TEST( fd_pod_insert_ulong( cfg, "async_min", async_min ), "increase pod_max?" ); - TEST( fd_pod_insert_ulong( cfg, "async_max", async_max ), "increase pod_max?" ); - TEST( fd_pod_insert_ulong( cfg, "part_thresh", part_thresh ), "increase pod_max?" ); - TEST( fd_pod_insert_ulong( cfg, "gc_thresh", gc_thresh ), "increase pod_max?" ); - TEST( fd_pod_insert_int ( cfg, "gc_eager", gc_eager ), "increase pod_max?" ); - TEST( fd_pod_insert_int ( cfg, "style", style ), "increase pod_max?" ); - TEST( fd_pod_insert_int ( cfg, "level", level ), "increase pod_max?" ); - - /* Tell the operator where the pod is */ - /* FIXME: consider putting the config pod in a normal page named - shmem region or a flat file instead? Probably easier to pass - between applications than a wksp gaddr. 
*/ - - printf( "%s\n", fd_wksp_cstr_laddr( _pod, tmp ) ); - - /* Clean up */ - - if( cfg!=pod ) TEST( fd_pod_compact( cfg, 1 ), "internal error" ); - - TEST( fd_pod_leave( pod )==_pod, "internal error" ); - - TEST( !fd_wksp_detach( wksp ), "internal error" ); - -# undef TEST - - FD_LOG_NOTICE(( "%i: %s %s %lu %lu: success", cnt, cmd, mem, pair_max, GiB_max )); - SHIFT(3); - - } else if( !strcmp( cmd, "delete" ) ) { - - if( FD_UNLIKELY( argc<1 ) ) - FD_LOG_ERR(( "%i: %s: too few arguments\n\tDo %s help for help", cnt, cmd, bin )); - - char const * cstr = argv[0]; - -# define TEST( c, msg ) do { \ - if( FD_UNLIKELY( !(c) ) ) \ - FD_LOG_ERR(( "%i: %s %s: FAIL %s (%s)\n\tDo %s help for help", \ - cnt, cmd, cstr, #c, (msg), bin )); \ - } while(0) - - uchar const * pod = fd_pod_join( fd_wksp_map( cstr ) ); /* logs details */ - TEST( pod, "unable to join pod" ); - - uchar const * cfg; - if( !cfg_path ) cfg = pod; - else { - cfg = fd_pod_query_subpod( pod, cfg_path ); - TEST( cfg, "cfg not found at cfg_path" ); - } - - fd_wksp_cstr_free( fd_pod_query_cstr( cfg, "obj", NULL ) ); - fd_wksp_cstr_free( fd_pod_query_cstr( cfg, "ele", NULL ) ); - fd_wksp_cstr_free( fd_pod_query_cstr( cfg, "line", NULL ) ); - fd_wksp_cstr_free( fd_pod_query_cstr( cfg, "io", NULL ) ); - fd_wksp_cstr_free( fd_pod_query_cstr( cfg, "meta", NULL ) ); - fd_wksp_cstr_free( fd_pod_query_cstr( cfg, "cnc", NULL ) ); - fd_wksp_cstr_free( fd_pod_query_cstr( cfg, "vinyl", NULL ) ); - - fd_wksp_unmap( fd_pod_leave( pod ) ); - - fd_wksp_cstr_free( cstr ); - - FD_LOG_NOTICE(( "%i: %s %s: success", cnt, cmd, cstr )); - SHIFT(1); - - } else if( !strcmp( cmd, "exec" ) ) { - - err = fd_vinyl_main( argc, argv ); - break; - - } else { - - FD_LOG_ERR(( "%i: %s: unknown command\n\t" - "Do %s help for help", cnt, cmd, bin )); - - } - cnt++; - } - - if( FD_UNLIKELY( cnt<1 ) ) FD_LOG_NOTICE(( "processed %i commands\n\tDo %s help for help", cnt, bin )); - else FD_LOG_NOTICE(( "processed %i commands", cnt )); - -# undef 
SHIFT - - fd_halt(); - return err; -} diff --git a/src/vinyl/fd_vinyl_ctl_help b/src/vinyl/fd_vinyl_ctl_help deleted file mode 100644 index e8a821cfd13..00000000000 --- a/src/vinyl/fd_vinyl_ctl_help +++ /dev/null @@ -1,91 +0,0 @@ - -Usage: fd_vinyl_ctl [cmd] [cmd args] [cmd] [cmd args] ... - -Commands are: - -help -- Prints this message. - -set [key] [val] -- Set the [key] to [val]. keys include: - - key | val type | default | notes - ------------------+----------+--------------------+----------------------------------------------------------- - wksp_tag | ulong | 0xfdc12113c597a600 | allocation tag used for vinyl wksp allocs - pod_max | ulong | 4KiB | byte size for the vinyl pod - cfg_path | cstr | NULL | path to the vinyl cfg in the vinyl pod (NULL use pod root) - cnc_app_sz | ulong | vinyl tile minimum | - - spad_max | ulong | 32MiB | io append scratch pad byte size - exec_max | ulong | 2UL | vinyl tile max request executed per run loop iteration - async_min | ulong | 2UL | min run loop iterations per async handling - async_max | ulong | 4UL | max run loop iterations per async handling - part_thresh | ulong | 1GiB | target partition size - gc_thresh | ulong | 8GiB | min bstream past size to consider compaction - gc_eager | int | 2 | target less than <~2^-gc_eager garbage items - style | cstr | "lz4" | preferred bstream pair encoding ("raw" and "lz4" are supported) - level | int | 1 | preferred reset level at startup (0 soft, 1 hard) - obj_footprint_avg | ulong | 1KiB + 9B | marginal byte cost of object in the data cache - -alloc-memory wksp page_cnt page_sz cpu_idx_seq mode - -- Create a workspace named wksp from page_cnt page_sz pages distributed - over numa nodes near cpu_idx_seq. The region will have the unix - permissions specified by mode (assumed octal). See fd_shmem_ctl help - for more details how the pages will be distributed over numa nodes. 
- -- The purpose of this is to create a workspace which can be used to hold - the memory regions needed by a vinyl tile. Since this is a normal - workspace though, it can be used for other application allocations. - (Or vice versa.) - -- This is identical to fd_wksp_ctl new and provided here as a - convenience. - -free-memory wksp - -- Delete a workspace named wksp. If multiple shmem regions exist with - same name, try to use the shmem region backed by the largest page size - -- This is identical to fd_wksp_ctl delete and provided here as a - convenience and for symmetry with alloc-memory. - -alloc-storage path sz_in_GiB mode - -- Create a file at path with the given sz filled with zeros. The file - will have the permissions specified by mode (assumed octal). Fails - if the file already exists. - -- The purpose of this is create a file which can be used to hold - the bstream needed by a vinyl tile. There is no requirement to use - this though. - -- This is provided as a convenience. The user is free to create bstream - storage via whatever means suits them. This includes using - pre-existing block devices directly. - -free-storage path - -- Destroys the file at path. Fails if the file could not be destroyed. - -- This is provided here as a convenience and for symmetry with - alloc-storage. - -new wksp pair_max sz_in_GiB - -- Allocate memory resources needed for a vinyl tile in the workspace - wksp. The vinyl tile will be able to track up to pair_max key-val - pairs total. The target amount of wksp memory to devote to the vinyl - tile should be sz_in_GiB. - -- One of these memory resources will be a pod. The locations of these - memory resources will be stashed in in this pod as wksp cstr gaddrs - (along with other vinyl tile configurations). - -- The wksp cstr of this pod will be printed to stdout. - -delete pod - -- Frees all memory resources allocated for a vinyl tile. The location - of the resources are given by pod. This includes freeing the pod - itself. 
The tile should not be running when this is done. - diff --git a/src/vinyl/fd_vinyl_exec.c b/src/vinyl/fd_vinyl_exec.c deleted file mode 100644 index 6ef4dda20fd..00000000000 --- a/src/vinyl/fd_vinyl_exec.c +++ /dev/null @@ -1,696 +0,0 @@ -#include "fd_vinyl.h" -#include "../util/pod/fd_pod.h" -#include -#include -#include -#include - -struct fd_vinyl_client { - fd_vinyl_rq_t * rq; /* Channel for requests from this client (could be shared by multiple vinyl instances) */ - fd_vinyl_cq_t * cq; /* Channel for completions from this client to this vinyl instance - (could be shared by multiple receivers of completions from this vinyl instance). */ - ulong burst_max; /* Max requests receive from this client at a time */ - ulong seq; /* Sequence number of the next request to receive in the rq */ - ulong link_id; /* Identifies requests from this client to this vinyl instance in the rq */ - ulong laddr0; /* A valid non-zero gaddr from this client maps to the vinyl instance's laddr laddr0 + gaddr ... */ - ulong laddr1; /* ... and thus is in (laddr0,laddr1). A zero gaddr maps to laddr NULL. */ - ulong quota_rem; /* Num of remaining acquisitions this client is allowed on this vinyl instance */ - ulong quota_max; /* Max quota */ -}; - -typedef struct fd_vinyl_client fd_vinyl_client_t; - -/* MAP_REQ_GADDR maps a request global address req_gaddr to an array of - cnt T's into the local address space as a T * pointer. If the result - is not properly aligned or the entire range does not completely fall - within the shared region with the client, returns NULL. Likewise, - gaadr 0 maps to NULL. Assumes sizeof(T)*(n) does not overflow (which - is true where as n is at most batch_cnt which is at most 2^32 and - sizeof(T) is at most 40. 
*/ - -#define MAP_REQ_GADDR( gaddr, T, n ) ((T *)fd_vinyl_laddr( (gaddr), alignof(T), sizeof(T)*(n), client_laddr0, client_laddr1 )) - -FD_FN_CONST static inline void * -fd_vinyl_laddr( ulong req_gaddr, - ulong align, - ulong footprint, - ulong client_laddr0, - ulong client_laddr1 ) { - ulong req_laddr0 = client_laddr0 + req_gaddr; - ulong req_laddr1 = req_laddr0 + footprint; - return (void *)fd_ulong_if( (!!req_gaddr) & fd_ulong_is_aligned( req_laddr0, align ) & - (client_laddr0<=req_laddr0) & (req_laddr0<=req_laddr1) & (req_laddr1<=client_laddr1), - req_laddr0, 0UL ); -} - -/* FIXME: STASH THESE IN THE VINYL TOO? */ -#define FD_VINYL_CLIENT_MAX (1024UL) -#define FD_VINYL_REQ_MAX (1024UL) - -void -fd_vinyl_exec( fd_vinyl_t * vinyl ) { - - /* Unpack shared objects */ - - fd_cnc_t * cnc = vinyl->cnc; - fd_vinyl_io_t * io = vinyl->io; - fd_vinyl_line_t * line = vinyl->line; - fd_vinyl_meta_t * meta = vinyl->meta; - fd_vinyl_data_t * data = vinyl->data; - - /* Unpack config */ - - ulong line_cnt = vinyl->line_cnt; - ulong pair_max = vinyl->pair_max; - ulong async_min = vinyl->async_min; - ulong async_max = vinyl->async_max; - - /* Unpack cnc */ - - if( FD_UNLIKELY( fd_cnc_signal_query( cnc )!=FD_VINYL_CNC_SIGNAL_BOOT ) ) { - FD_LOG_WARNING(( "cnc not booting (restarting after an unclean termination?); forcing to boot and attempting to continue" )); - fd_cnc_signal( cnc, FD_VINYL_CNC_SIGNAL_BOOT ); - } - - fd_vinyl_cmd_t * cmd = (fd_vinyl_cmd_t *)fd_cnc_app_laddr( cnc ); - ulong * diag = (ulong *)(cmd+1); - - /* Unpack io */ - - ulong io_seed = fd_vinyl_io_seed( io ); - - /* Unpack meta */ - - fd_vinyl_meta_ele_t * ele0 = meta->ele; - ulong ele_max = meta->ele_max; - ulong meta_seed = meta->seed; - ulong * lock = meta->lock; - int lock_shift = meta->lock_shift; - - /* Unpack data */ - - ulong data_laddr0 = (ulong)data->laddr0; - fd_vinyl_data_vol_t const * vol = data->vol; - ulong vol_cnt = data->vol_cnt; - - /* Connected clients */ - - fd_vinyl_client_t _client[ 
FD_VINYL_CLIENT_MAX ]; - ulong client_cnt = 0UL; /* In [0,client_max) */ - ulong client_idx = 0UL; /* If client_cnt>0, next client to poll for requests, d/c otherwise */ - - ulong quota_free = line_cnt - 1UL; - - /* Received requests */ - - fd_vinyl_req_t _req[ FD_VINYL_REQ_MAX ]; - ulong req_head = 0UL; /* Requests [0,req_head) have been processed */ - ulong req_tail = 0UL; /* Requests [req_head,req_tail) are pending */ - /* Requests [req_tail,ULONG_MAX) have not been received */ - ulong burst_free = FD_VINYL_REQ_MAX; - ulong exec_max = 0UL; - - /* accum_dead_cnt is the number of dead blocks that have been - written since the last partition block. - - accum_move_cnt is the number of move blocks that have been - written since this last partition block. - - accum_garbage_cnt / sz is the number of items / bytes garbage in - the bstream that have accumulated since the last time we compacted - the bstream. We use this to estimate the number of rounds of - compaction to do in async handling. - - accum_drop_link is the number of requests that were silently - dropped because the request link_id did not match the client's - link_id. - - accum_drop_comp is the number of requests that were silently - dropped because an out-of-band completion was requested to be sent - to an unmappable client address. - - accumt_req_full is the number of times we detected the pending - request queue being completely full. 
*/ - - ulong accum_dead_cnt = 0UL; - ulong accum_move_cnt = 0UL; - ulong accum_garbage_cnt = 0UL; - ulong accum_garbage_sz = 0UL; - ulong accum_drop_link = 0UL; - ulong accum_drop_comp = 0UL; - ulong accum_cache_hit = 0UL; - - ulong seq_part = fd_vinyl_io_seq_present( io ); - - /* Run */ - - fd_cnc_signal( cnc, FD_VINYL_CNC_SIGNAL_RUN ); - - ulong async_rem = 1UL; - - for(;;) { - - /* Process background tasks this iteration if necessary */ - - if( FD_UNLIKELY( !(--async_rem) ) ) { - long now = fd_log_wallclock(); - async_rem = async_min + (fd_ulong_hash( (ulong)now ) % (async_max-async_min+1UL)); /* FIXME: FASTER ALGO */ - - fd_cnc_heartbeat( cnc, now ); - - /* If we've written enough to justify appending a parallel - recovery partition, append one. */ - - ulong seq_future = fd_vinyl_io_seq_future( io ); - if( FD_UNLIKELY( (seq_future - seq_part) > vinyl->part_thresh ) ) { - - ulong seq = fd_vinyl_io_append_part( io, seq_part, accum_dead_cnt, accum_move_cnt, NULL, 0UL ); - FD_CRIT( fd_vinyl_seq_eq( seq, seq_future ), "corruption detected" ); - seq_part = seq + FD_VINYL_BSTREAM_BLOCK_SZ; - - accum_dead_cnt = 0UL; - accum_move_cnt = 0UL; - - accum_garbage_cnt++; - accum_garbage_sz += FD_VINYL_BSTREAM_BLOCK_SZ; - - fd_vinyl_io_commit( io, FD_VINYL_IO_FLAG_BLOCKING ); - - } - - diag[ FD_VINYL_DIAG_DROP_LINK ] += accum_drop_link; accum_drop_link = 0UL; - diag[ FD_VINYL_DIAG_DROP_COMP ] += accum_drop_comp; accum_drop_comp = 0UL; - diag[ FD_VINYL_DIAG_CACHE_HIT ] += accum_cache_hit; accum_cache_hit = 0UL; - - /* Let the number of items of garbage generated since the last - compaction be accum_garbage_cnt and let the steady steady - average number of live / garbage items in the bstream's past be - L / G (i.e. L is the average value of pair_cnt). The average - number pieces of garbage collected per garbage collection round - is thus G / (L + G). 
If we do compact_max rounds garbage - collection this async handling, we expect to collect - - compact_max G / (L + G) - - items of garbage on average. To make sure we collect garbage - faster than we generate it on average, we then require: - - accum_garbage_cnt <~ compact_max G / (L + G) - -> compact_max >~ (L + G) accum_garbage_cnt / G - - Let the be 2^-gc_eager be the maximum fraction of items in the - bstream's past we are willing tolerate as garbage on average. - We then have G = 2^-gc_eager (L + G). This implies: - - -> compact_max >~ accum_garbage_cnt 2^gc_eager - - When accum_garbage_cnt is 0, we use a compact_max of 1 to do - compaction rounds at a minimum rate all the time. This allows - transients (e.g. a sudden change to new steady state - equilibrium, temporary disabling of garbage collection at key - times for highest performance, etc) and unaccounted zero - padding garbage to be absorbed when nothing else is going on. */ - - int gc_eager = vinyl->gc_eager; - if( FD_LIKELY( gc_eager>=0 ) ) { - - /* Saturating wide left shift */ - ulong overflow = (accum_garbage_cnt >> (63-gc_eager) >> 1); /* sigh ... 
avoid wide shift UB */ - ulong compact_max = fd_ulong_max( fd_ulong_if( !overflow, accum_garbage_cnt << gc_eager, ULONG_MAX ), 1UL ); - - /**/ accum_garbage_cnt = 0UL; - vinyl->garbage_sz += accum_garbage_sz; accum_garbage_sz = 0UL; - - fd_vinyl_compact( vinyl, compact_max ); - - } - - ulong signal = fd_cnc_signal_query( cnc ); - if( FD_UNLIKELY( signal!=FD_VINYL_CNC_SIGNAL_RUN ) ) { - if( FD_UNLIKELY( signal==FD_VINYL_CNC_SIGNAL_HALT ) ) break; - - switch( signal ) { - - case FD_VINYL_CNC_SIGNAL_SYNC: { - fd_vinyl_io_sync( io, FD_VINYL_IO_FLAG_BLOCKING ); - break; - } - - case FD_VINYL_CNC_SIGNAL_GET: { - ulong old; - int err = FD_VINYL_SUCCESS; - switch( cmd->get.opt ) { - case FD_VINYL_OPT_PART_THRESH: old = vinyl->part_thresh; break; - case FD_VINYL_OPT_GC_THRESH: old = vinyl->gc_thresh; break; - case FD_VINYL_OPT_GC_EAGER: old = (ulong)(long)vinyl->gc_eager; break; - case FD_VINYL_OPT_STYLE: old = (ulong)(uint)vinyl->style; break; - default: old = 0UL; err = FD_VINYL_ERR_INVAL; break; - } - cmd->get.val = old; - cmd->get.err = err; - break; - } - - case FD_VINYL_CNC_SIGNAL_SET: { /* FIXME: ADD VALIDATION TO SET VALUES FOR OPT_GC_EAGER AND OPT_STYLE */ - ulong new = cmd->set.val; - ulong old; - int err = FD_VINYL_SUCCESS; - switch( cmd->set.opt ) { - case FD_VINYL_OPT_PART_THRESH: old = vinyl->part_thresh; vinyl->part_thresh = new; break; - case FD_VINYL_OPT_GC_THRESH: old = vinyl->gc_thresh; vinyl->gc_thresh = new; break; - case FD_VINYL_OPT_GC_EAGER: old = (ulong)(long)vinyl->gc_eager; vinyl->gc_eager = (int)new; break; - case FD_VINYL_OPT_STYLE: old = (ulong)(uint)vinyl->style; vinyl->style = (int)new; break; - default: old = 0UL; err = FD_VINYL_ERR_INVAL; break; - } - cmd->set.val = old; - cmd->set.err = err; - break; - } - - case FD_VINYL_CNC_SIGNAL_CLIENT_JOIN: { - int err; - - ulong link_id = cmd->join.link_id; - ulong burst_max = cmd->join.burst_max; - ulong quota_max = cmd->join.quota_max; - char const * _rq = cmd->join.rq; - char const * _cq = 
cmd->join.cq; - char const * _wksp = cmd->join.wksp; - - if( FD_UNLIKELY( client_cnt>=FD_VINYL_CLIENT_MAX ) ) { - FD_LOG_WARNING(( "Too many clients (increase FD_VINYL_CLIENT_MAX)" )); - err = FD_VINYL_ERR_FULL; - goto join_done; - } - - if( FD_UNLIKELY( burst_max > burst_free ) ) { - FD_LOG_WARNING(( "Too large burst_max (increase FD_VINYL_REQ_MAX or decrease burst_max)" )); - err = FD_VINYL_ERR_FULL; - goto join_done; - } - - if( FD_UNLIKELY( quota_max > fd_ulong_min( quota_free, FD_VINYL_COMP_QUOTA_MAX ) ) ) { - FD_LOG_WARNING(( "Too large quota_max (increase line_cnt or decrease quota_max)" )); - err = FD_VINYL_ERR_FULL; - goto join_done; - } - - for( ulong client_idx=0UL; client_idxgaddr_hi; /* FIXME: HOW TO GET THIS CLEANLY */ - _client[ client_cnt ].quota_rem = quota_max; - _client[ client_cnt ].quota_max = quota_max; - client_cnt++; - - quota_free -= quota_max; - burst_free -= burst_max; - - /* Every client_cnt run loop iterations we receive at most: - - sum_clients recv_max = FD_VINYL_REQ_MAX - burst_free - - requests. To guarantee we processe requests fast enough - that we never overrun our receive queue, under maximum - client load, we need to process: - - sum_clients recv_max / client_cnt - - requests per run loop iteration. We thus set exec_max - to the ceil sum_clients recv_max / client_cnt. 
*/ - - exec_max = (FD_VINYL_REQ_MAX - burst_free + client_cnt - 1UL) / client_cnt; - - err = FD_VINYL_SUCCESS; - - join_done: - cmd->join.err = err; - break; - } - - case FD_VINYL_CNC_SIGNAL_CLIENT_LEAVE: { - int err; - - ulong link_id = cmd->leave.link_id; - - for( ulong client_idx=0UL; client_idxleave.err = err; - break; - } - - default: { - FD_LOG_WARNING(( "unknown signal received (%lu); ignoring", signal )); - break; - } - - } - - fd_cnc_signal( cnc, FD_VINYL_CNC_SIGNAL_RUN ); - } - } - - /* Receive requests from clients */ - - if( FD_LIKELY( client_cnt ) ) { - - /* Select client to poll this run loop iteration */ - - client_idx = fd_ulong_if( client_idx+1ULrq; - ulong seq = client->seq; - ulong burst_max = client->burst_max; - ulong link_id = client->link_id; - - /* Enqueue up to burst_max requests from this client into the - local request queue. Using burst_max << FD_VINYL_REQ_MAX - allows applications to prevent a bursty client from starving - other clients of resources while preserving the spatial and - temporal locality of reasonably sized O(burst_max) bursts from - an individual client in processing below. Each run loop - iteration can enqueue up to burst_max requests per iterations. */ - - for( ulong recv_rem=fd_ulong_min( FD_VINYL_REQ_MAX-(req_tail-req_head), burst_max ); recv_rem; recv_rem-- ) { - fd_vinyl_req_t * req = _req + (req_tail & (FD_VINYL_REQ_MAX-1UL)); - - long diff = fd_vinyl_rq_recv( rq, seq, req ); - - if( FD_LIKELY( diff>0L ) ) break; /* No requests waiting in rq at this time */ - - if( FD_UNLIKELY( diff ) ) FD_LOG_CRIT(( "client overran request queue" )); - - seq++; - - /* We got the next request. Decide if we should accept it. - - Specifically, we ignore requests whose link_id don't match - link_id (e.g. an unknown link_id or matches a different - client's link_id ... don't know if it is where or even if it - is safe to the completion). 
Even if the request provided an - out-of-band location to send the completion (comp_gaddr!=0), - we have no reason to trust it given the mismatch. - - This also gives a mechanism for a client use a single rq to - send requests to multiple vinyl instances ... the client - should use a different link_id for each vinyl instance. Each - vinyl instance will quickly filter out the requests not - addressed to it. - - Since we know the client_idx at this point, given a matching - link_id, we stash the client_idx in the pending req link_id - to eliminate the need to maintain a link_id<>client_idx map - in the execution loop below. */ - - if( FD_UNLIKELY( req->link_id!=link_id ) ) { - accum_drop_link++; - continue; - } - - req->link_id = client_idx; - - req_tail++; - } - - client->seq = seq; - } - - /* Execute received requests */ - - for( ulong exec_rem=fd_ulong_min( req_tail-req_head, exec_max ); exec_rem; exec_rem-- ) { - fd_vinyl_req_t * req = _req + ((req_head++) & (FD_VINYL_REQ_MAX-1UL)); - - /* Determine the client that sent this request and unpack the - completion fields. We ignore requests with non-NULL but - unmappable out-of-band completion because we can't send the - completion in the expected manner and, in lieu of that, the - receivers aren't expecting any completion to come via the cq - (if any). Note that this implies requests that don't produce a - completion (e.g. FETCH and FLUSH) need to either provide NULL - or a valid non-NULL location for comp_gaddr to pass this - validation (this is not a burden practically). 
*/ - - ulong req_id = req->req_id; - ulong client_idx = req->link_id; /* See note above about link_id / client_idx conversion */ - ulong batch_cnt = (ulong)req->batch_cnt; - ulong comp_gaddr = req->comp_gaddr; - - fd_vinyl_client_t * client = _client + client_idx; - - fd_vinyl_cq_t * cq = client->cq; - ulong link_id = client->link_id; - ulong client_laddr0 = client->laddr0; - ulong client_laddr1 = client->laddr1; - ulong quota_rem = client->quota_rem; - - FD_CRIT( quota_rem<=client->quota_max, "corruption detected" ); - - fd_vinyl_comp_t * comp = MAP_REQ_GADDR( comp_gaddr, fd_vinyl_comp_t, 1UL ); - if( FD_UNLIKELY( (!comp) & (!!comp_gaddr) ) ) { - accum_drop_comp++; - continue; - } - - int comp_err = 1; - ulong fail_cnt = 0UL; - - ulong read_cnt = 0UL; - ulong append_cnt = 0UL; - - switch( req->type ) { - -# include "fd_vinyl_case_acquire.c" -# include "fd_vinyl_case_release.c" -# include "fd_vinyl_case_erase.c" -# include "fd_vinyl_case_move.c" -# include "fd_vinyl_case_fetch.c" -# include "fd_vinyl_case_flush.c" -# include "fd_vinyl_case_try.c" -# include "fd_vinyl_case_test.c" - - default: - comp_err = FD_VINYL_ERR_INVAL; - break; - } - - for( ; read_cnt; read_cnt-- ) { - fd_vinyl_io_rd_t * _rd; /* avoid pointer escape */ - fd_vinyl_io_poll( io, &_rd, FD_VINYL_IO_FLAG_BLOCKING ); - fd_vinyl_io_rd_t * rd = _rd; - - fd_vinyl_data_obj_t * obj = (fd_vinyl_data_obj_t *) rd->ctx; - ulong seq = rd->seq; (void)seq; - fd_vinyl_bstream_phdr_t * cphdr = (fd_vinyl_bstream_phdr_t *)rd->dst; - ulong cpair_sz = rd->sz; (void)cpair_sz; - - fd_vinyl_data_obj_t * cobj = (fd_vinyl_data_obj_t *)fd_ulong_align_dn( (ulong)rd, FD_VINYL_BSTREAM_BLOCK_SZ ); - - FD_CRIT( cphdr==fd_vinyl_data_obj_phdr( cobj ), "corruption detected" ); - - ulong cpair_ctl = cphdr->ctl; - - int cpair_type = fd_vinyl_bstream_ctl_type ( cpair_ctl ); - int cpair_style = fd_vinyl_bstream_ctl_style( cpair_ctl ); - ulong cpair_val_esz = fd_vinyl_bstream_ctl_sz ( cpair_ctl ); - - FD_CRIT( 
cpair_type==FD_VINYL_BSTREAM_CTL_TYPE_PAIR, "corruption detected" ); - FD_CRIT( cpair_sz ==fd_vinyl_bstream_pair_sz( cpair_val_esz ), "corruption detected" ); - - schar * rd_err = cobj->rd_err; - - FD_CRIT ( rd_err, "corruption detected" ); - FD_ALERT( fd_vinyl_data_is_valid_obj( obj, vol, vol_cnt ), "corruption detected" ); - - ulong line_idx = obj->line_idx; - - FD_CRIT( line_idxinfo.val_sz; - - FD_CRIT( val_sz <= FD_VINYL_VAL_MAX, "corruption detected" ); - FD_CRIT( fd_vinyl_data_obj_val_max( obj ) >= val_sz, "corruption detected" ); - - if( FD_LIKELY( cpair_style==FD_VINYL_BSTREAM_CTL_STYLE_RAW ) ) { - - FD_CRIT( obj==cobj, "corruption detected" ); - FD_CRIT( cpair_val_esz==val_sz, "corruption detected" ); - - } else { - - char const * cval = (char const *)fd_vinyl_data_obj_val( cobj ); - ulong cval_sz = fd_vinyl_bstream_ctl_sz( cpair_ctl ); - - ulong _val_sz = (ulong)LZ4_decompress_safe( cval, val, (int)cval_sz, (int)val_sz ); - if( FD_UNLIKELY( _val_sz!=val_sz ) ) FD_LOG_CRIT(( "LZ4_decompress_safe failed" )); - - fd_vinyl_data_free( data, cobj ); - - fd_vinyl_bstream_phdr_t * phdr = fd_vinyl_data_obj_phdr( obj ); - - phdr->ctl = fd_vinyl_bstream_ctl( FD_VINYL_BSTREAM_CTL_TYPE_PAIR, FD_VINYL_BSTREAM_CTL_STYLE_RAW, val_sz ); - phdr->key = cphdr->key; - phdr->info = cphdr->info; - - } - - obj->rd_active = (short)0; - - /* Fill any trailing region with zeros (there is at least - FD_VINYL_BSTREAM_FTR_SZ) and tell the client the item was - successfully processed. 
*/ - - memset( val + val_sz, 0, fd_vinyl_data_szc_obj_footprint( (ulong)obj->szc ) - - (sizeof(fd_vinyl_data_obj_t) + sizeof(fd_vinyl_bstream_phdr_t) + val_sz) ); - - FD_COMPILER_MFENCE(); - *rd_err = (schar)FD_VINYL_SUCCESS; - FD_COMPILER_MFENCE(); - - } - - if( FD_UNLIKELY( append_cnt ) ) fd_vinyl_io_commit( io, FD_VINYL_IO_FLAG_BLOCKING ); - - if( FD_LIKELY( comp_err<=0 ) ) fd_vinyl_cq_send( cq, comp, req_id, link_id, comp_err, batch_cnt, fail_cnt, quota_rem ); - - client->quota_rem = quota_rem; - - } - - } /* run loop */ - - ulong discard_cnt = req_tail - req_head; - - /* Append the final partition and sync so we can resume with a fast - parallel recovery */ - - fd_vinyl_io_append_part( io, seq_part, accum_dead_cnt, accum_move_cnt, NULL, 0UL ); - - accum_dead_cnt = 0UL; - accum_move_cnt = 0UL; - - accum_garbage_cnt++; - accum_garbage_sz += FD_VINYL_BSTREAM_BLOCK_SZ; - - fd_vinyl_io_commit( io, FD_VINYL_IO_FLAG_BLOCKING ); - - fd_vinyl_io_sync( io, FD_VINYL_IO_FLAG_BLOCKING ); - - /* Drain outstanding accumulators */ - - /**/ accum_garbage_cnt = 0UL; - vinyl->garbage_sz += accum_garbage_sz; accum_garbage_sz = 0UL; - - diag[ FD_VINYL_DIAG_DROP_LINK ] += accum_drop_link; accum_drop_link = 0UL; - diag[ FD_VINYL_DIAG_DROP_COMP ] += accum_drop_comp; accum_drop_comp = 0UL; - diag[ FD_VINYL_DIAG_CACHE_HIT ] += accum_cache_hit; accum_cache_hit = 0UL; - - /* Disconnect from the clients */ - - ulong released_cnt = 0UL; - for( ulong client_idx=0UL; client_idxpart.seq0; - seq0 = fd_vinyl_seq_gt( seq0, seq_past ) ? seq0 : seq_past; - -# if 0 - /* Compute the maximum number of deads the portion of this partition - in the bstream's past that could produce as the lesser the number - of deads reported in the partition and the number of blocks in - the partition. Similarly for move (note that each move makes two - tombstone but also requires at least two blocks ... so moves also - make, at most, 1 tombstone per block on average). 
*/ - - ulong part_sz = seq1 - seq0 - FD_VINYL_BSTREAM_BLOCK_SZ; /* exclude trailing part block for below */ - - ulong dead_max = fd_ulong_min( block->part.dead_cnt, part_sz ); - ulong move_max = fd_ulong_min( block->part.move_cnt, part_sz ); - - tstone_req += fd_ulong_min( dead_max + 2UL*move_max, part_sz ); -# endif - - /* Move to the previous partition */ - - seq1 = seq0; - } - - /* We seem to have a valid partitioning for parallel recovery */ - -# if 0 - if( FD_UNLIKELY( tstone_req > tstone_max ) ) { - FD_LOG_WARNING(( "insufficient scratch space for parallel recovery" - "\n\tincrease data cache size" - "\n\tfalling back to serial recovery" )); - return FD_VINYL_ERR_FULL; - } -# endif - - return FD_VINYL_SUCCESS; -} - -/* fd_vinyl_recover_line_task tests parallel flushes all vinyl - lines and resets the evicition priority sequence. */ - -static FD_FOR_ALL_BEGIN( fd_vinyl_recover_line_task, 1L ) { - fd_vinyl_t * vinyl = (fd_vinyl_t *)arg[0]; - - fd_vinyl_line_t * line = vinyl->line; - ulong line_cnt = vinyl->line_cnt; - - ulong line0 = (ulong)block_i0; - ulong line1 = (ulong)block_i1; - - for( ulong line_idx=line0; line_idxmeta->lock; - - ulong reclaim_cnt = 0UL; - - for( long lock_idx=block_i0; lock_idxmeta->ele; - - fd_vinyl_meta_ele_t init_ele[1]; - memset( init_ele, 0, sizeof(fd_vinyl_meta_ele_t) ); - init_ele->line_idx = ULONG_MAX; - - for( long ele_idx=block_i0; ele_idxmeta->lock; - - for( long lock_idx=block_i0; lock_idxphdr.ctl ) ) { - - /* There is no version or tstone for pair key in the meta currently. - Insert a tstone at seq for key so any versions or tstone for pair - key encountered later in parallel recovery can tell if they are - before or after this tstone. Because we don't know if there will - version of key after this, we need to append key to the tstone - array. 
*/ - - //pair_cnt unchanged - //garbage_sz unchanged - (*_tstone_cnt)++; - - ele->memo = fd_vinyl_meta_query_memo( query ); - ele->phdr.ctl = 1UL; - ele->phdr.key = *key; - //ele->phdr.info = d/c - ele->line_idx = ULONG_MAX - 1UL; // tstone - ele->seq = seq; - - fd_vinyl_meta_publish( query ); - - } else if( FD_LIKELY( fd_vinyl_seq_lt( ele->seq, seq ) ) ) { - - /* The version (or tstone) for pair key in the meta is older than - seq. We append a key to the tstone array if we haven't already. */ - - int old_ele_is_pair = (ele->line_idx==ULONG_MAX); - - (*_pair_cnt) -= (ulong)old_ele_is_pair; - (*_garbage_sz) += old_ele_is_pair ? fd_vinyl_bstream_pair_sz( fd_vinyl_bstream_ctl_sz( ele->phdr.ctl ) ) : 0UL; - (*_tstone_cnt) += (ulong)old_ele_is_pair; - - //ele->memo = already init - //ele->phdr.ctl = already init - //ele->phdr.key = already init - //ele->phdr.info = d/c - ele->line_idx = ULONG_MAX - 1UL; // tstone - ele->seq = seq; - - fd_vinyl_meta_publish( query ); - - } else { - - /* The meta entry (pair or tstone) for pair key in the meta is newer - than seq. We can skip this tstone. */ - - //pair_cnt unchanged - //garbage_sz unchanged - //tstone_cnt unchanged - - int corrupt = fd_vinyl_seq_eq( ele->seq, seq ); - - fd_vinyl_meta_cancel( query ); - - if( FD_UNLIKELY( corrupt ) ) { - FD_LOG_WARNING(( "%016lx: probable corruption detected", seq )); - return FD_VINYL_ERR_CORRUPT; - } - - } - - return FD_VINYL_SUCCESS; -} - -/* fd_vinyl_recover_part_task dynamically assigns the partitions of the - bstream's past to threads for recovery and then recovers them in - parallel. The bstream past partition iteration is near identical - to bstream past iteration in serial recovery. See - fd_vinyl_recover_serial.c for more details. 
*/ - -/* FIXME: ADD MORE EXTENSIVE DATA INTEGRITY CHECKING LIKE SERIAL IMPL */ - -static FD_FN_UNUSED FD_MAP_REDUCE_BEGIN( fd_vinyl_recover_part_task, 1UL, alignof(ulong), sizeof(ulong), 4UL ) { - ulong * _rlocal = (ulong *) arg[0]; - fd_vinyl_t * vinyl = (fd_vinyl_t *) arg[1]; - ulong * _lock = (ulong *) arg[2]; - - fd_vinyl_io_t * io = vinyl->io; - fd_vinyl_meta_t * meta = vinyl->meta; - - ulong io_seed = fd_vinyl_io_seed ( io ); - ulong seq_past = fd_vinyl_io_seq_past( io ); - uchar * mmio = (uchar *)fd_vinyl_mmio ( io ); - ulong mmio_sz = fd_vinyl_mmio_sz ( io ); - - ulong fail = 1UL; - ulong pair_cnt = 0UL; - ulong garbage_sz = 0UL; - ulong tstone_cnt = 0UL; - - for(;;) { - - /* Determine the range of the bstream past we should process next. */ - - ulong seq0; - ulong seq1; - - /* Lock and fetch the task assignment cursor */ - - FD_COMPILER_MFENCE(); -# if FD_HAS_ATOMIC - while( FD_ATOMIC_CAS( _lock, 0UL, 1UL ) ) FD_SPIN_PAUSE(); -# else - *_lock = 1UL; -# endif - FD_COMPILER_MFENCE(); - - seq1 = _lock[1]; - - /* At this point, the bstream range [seq_past,seq1) has not been - assigned. If seq1 is at seq_past, everything has been assigned - already. Otherwise, the block before cursor is a valid partition - block (as per the test above) and we claim the range: - - [ the older of part_seq0 and seq_past, seq1 ) - - to process. */ - - if( FD_UNLIKELY( fd_vinyl_seq_le( seq1, seq_past ) ) ) seq0 = seq_past; - else { - fd_vinyl_bstream_block_t const * block = PEEK( seq1 - FD_VINYL_BSTREAM_BLOCK_SZ ); - seq0 = block->part.seq0; - if( fd_vinyl_seq_lt( seq0, seq_past ) ) seq0 = seq_past; - } - - /* Update and unlock the task assignment cursor */ - - _lock[1] = seq0; - FD_COMPILER_MFENCE(); - _lock[0] = 0UL; - FD_COMPILER_MFENCE(); - - if( FD_UNLIKELY( fd_vinyl_seq_le( seq1, seq_past ) ) ) break; - - /* At this point, we need to recover the range [seq0,seq1). 
*/ - - ulong seq = seq0; - while( fd_vinyl_seq_lt( seq, seq1 ) ) { - - fd_vinyl_bstream_block_t block[1]; - - block[0] = *(fd_vinyl_bstream_block_t *)PEEK( seq ); /* testing is destructive */ - - ulong ctl = block->ctl; - - int type = fd_vinyl_bstream_ctl_type( ctl ); - - switch( type ) { - - case FD_VINYL_BSTREAM_CTL_TYPE_PAIR: { - - ulong pair_val_esz = fd_vinyl_bstream_ctl_sz( ctl ); - - ulong pair_sz = fd_vinyl_bstream_pair_sz( pair_val_esz ); - - if( FD_UNLIKELY( pair_sz > (seq1-seq) ) ) { /* Wrapping safe */ - FD_LOG_WARNING(( "%016lx: truncated", seq )); - goto done; - } - - fd_vinyl_bstream_block_t ftr[1]; - - ftr[0] = *PEEK( seq + pair_sz - FD_VINYL_BSTREAM_BLOCK_SZ ); - - char const * _err = fd_vinyl_bstream_pair_test_fast( io_seed, seq, block, ftr ); - if( FD_UNLIKELY( _err ) ) { - FD_LOG_WARNING(( "%016lx: %s", seq, _err )); - goto done; - } - - /* At this point, we appear to have valid completely written - pair. Prepare the meta to do an update for this key. */ - - fd_vinyl_meta_query_t query[1]; - - fd_vinyl_meta_prepare( meta, &block->phdr.key, NULL, query, FD_MAP_FLAG_BLOCKING ); - - fd_vinyl_meta_ele_t * ele = fd_vinyl_meta_query_ele( query ); - - if( FD_UNLIKELY( !ele ) ) { - FD_LOG_WARNING(( "%016lx: corruption detected or meta cache too small for parallel recovery", seq )); - goto done; - } - - if( FD_LIKELY( (!ele->phdr.ctl) | fd_vinyl_seq_gt( seq, ele->seq ) ) ) { - - pair_cnt++; - - /* At this point, this is the first time any thread has seen - pair key or this version of pair key is newer than the - version (or tstone) of pair key has been seed */ - - ele->memo = fd_vinyl_meta_query_memo( query ); - ele->phdr = block->phdr; - ele->line_idx = ULONG_MAX; // pair - ele->seq = seq; - - fd_vinyl_meta_publish( query ); - - } else { - - /* At this point, this version of pair key is older than the - version (or tstone) for pair key seen by all threads so - far. 
*/ - - fd_vinyl_meta_cancel( query ); - - garbage_sz += pair_sz; - - } - - seq += pair_sz; - break; - } - - case FD_VINYL_BSTREAM_CTL_TYPE_DEAD: { - - char const * _err = fd_vinyl_bstream_dead_test( io_seed, seq, block ); - if( FD_UNLIKELY( _err ) ) { - FD_LOG_WARNING(( "%016lx: %s", seq, _err )); - goto done; - } - - int err = fd_vinyl_recover_tstone( meta, &block->dead.phdr.key, seq, &pair_cnt, &garbage_sz, &tstone_cnt ); - if( FD_UNLIKELY( err ) ) goto done; /* logs details */ - - garbage_sz += FD_VINYL_BSTREAM_BLOCK_SZ; - seq += FD_VINYL_BSTREAM_BLOCK_SZ; - break; - } - - case FD_VINYL_BSTREAM_CTL_TYPE_MOVE: { - - if( FD_UNLIKELY( 2UL*FD_VINYL_BSTREAM_BLOCK_SZ > (seq1-seq) ) ) { /* Wrapping safe */ - FD_LOG_WARNING(( "%016lx: truncated", seq )); - goto done; - } - - fd_vinyl_bstream_block_t dst[1]; - - dst[0] = *PEEK( seq + FD_VINYL_BSTREAM_BLOCK_SZ ); - - char const * _err = fd_vinyl_bstream_move_test( io_seed, seq, block, dst ); - if( FD_UNLIKELY( _err ) ) { - FD_LOG_WARNING(( "%016lx: %s", seq, _err )); - goto done; - } - - int err = fd_vinyl_recover_tstone( meta, &block->move.src.key, seq, &pair_cnt, &garbage_sz, &tstone_cnt ); - if( FD_UNLIKELY( err ) ) goto done; /* logs details */ - - /**/ err = fd_vinyl_recover_tstone( meta, &block->move.dst, seq, &pair_cnt, &garbage_sz, &tstone_cnt ); - if( FD_UNLIKELY( err ) ) goto done; /* logs details */ - - garbage_sz += FD_VINYL_BSTREAM_BLOCK_SZ; - seq += FD_VINYL_BSTREAM_BLOCK_SZ; - break; - } - - case FD_VINYL_BSTREAM_CTL_TYPE_PART: { - - char const * _err = fd_vinyl_bstream_part_test( io_seed, seq, block ); - if( FD_UNLIKELY( _err ) ) { - FD_LOG_WARNING(( "%016lx: %s", seq, _err )); - goto done; - } - - garbage_sz += FD_VINYL_BSTREAM_BLOCK_SZ; - seq += FD_VINYL_BSTREAM_BLOCK_SZ; - break; - } - - case FD_VINYL_BSTREAM_CTL_TYPE_ZPAD: { - - char const * _err = fd_vinyl_bstream_zpad_test( io_seed, seq, block ); - if( FD_UNLIKELY( _err ) ) { - FD_LOG_WARNING(( "%016lx: %s", seq, _err )); - goto done; - } - - seq += 
FD_VINYL_BSTREAM_BLOCK_SZ; - break; - } - - default: - FD_LOG_WARNING(( "%016lx: unknown type (%x)", seq, (uint)type )); - goto done; - - } - } - - if( FD_UNLIKELY( fd_vinyl_seq_ne( seq, seq1 ) ) ) { - FD_LOG_WARNING(( "%016lx: bad partitioning", seq )); - goto done; - } - - } - - fail = 0UL; - -done: - - /* If we failed, tell all the other threads to not continue by - setting the task assignment cursor to seq_past. */ - - if( fail ) { - FD_COMPILER_MFENCE(); -# if FD_HAS_ATOMIC - while( FD_ATOMIC_CAS( _lock, 0UL, 1UL ) ) FD_SPIN_PAUSE(); -# else - *_lock = 1UL; -# endif - FD_COMPILER_MFENCE(); - _lock[1]= seq_past; - FD_COMPILER_MFENCE(); - _lock[0]= 0UL; - } - - _rlocal[0] = fail; - _rlocal[1] = pair_cnt; - _rlocal[2] = garbage_sz; - _rlocal[3] = tstone_cnt; - -} FD_MAP_END { - - ulong * _rlocal = (ulong *) arg[0]; - ulong const * _rremote = (ulong const *)_r1; - - _rlocal[0] |= _rremote[0]; - _rlocal[1] += _rremote[1]; - _rlocal[2] += _rremote[2]; - _rlocal[3] += _rremote[3]; - -} FD_REDUCE_END - -static FD_FN_UNUSED FD_MAP_REDUCE_BEGIN( fd_vinyl_recover_meta_cleanup_task, 1L, alignof(ulong), sizeof(ulong), 1UL ) { - ulong * _rlocal = (ulong *)arg[0]; - - fd_vinyl_t * vinyl = (fd_vinyl_t *)arg[1]; - - fd_vinyl_meta_t * meta = vinyl->meta; - - fd_vinyl_meta_ele_t * ele0 = meta->ele; - ulong const * lock = meta->lock; - int lock_shift = meta->lock_shift; - - ulong remove_cnt = 0UL; - - for( long ele_idx=block_i0; ele_idx> lock_shift; - - fd_vinyl_key_t key; - int try_remove; - - /* Do a non-blocking query by ele_idx (not be key). We have to do - this direct because this is no standard API for this. This is - highly unlikely to ever block (but theoretically could if the - remove in a different thread has locked a probe chain that - touches elements in this thread). 
*/ - - for(;;) { - FD_COMPILER_MFENCE(); - ulong ver0 = lock[ lock_idx ]; - FD_COMPILER_MFENCE(); - if( FD_LIKELY( !(ver0 & 1UL) ) ) { - - try_remove = (!!ele0[ ele_idx ].phdr.ctl) & (ele0[ ele_idx ].line_idx==(ULONG_MAX-1UL)); - key = ele0[ ele_idx ].phdr.key; - - FD_COMPILER_MFENCE(); - ulong ver1 = lock[ lock_idx ]; - FD_COMPILER_MFENCE(); - if( FD_LIKELY( ver0==ver1 ) ) break; - } - FD_SPIN_PAUSE(); - } - - /* If try_remove is not set, ele_idx either had no key it in or - had a pair entry. So we continue to the next slot. */ - - if( FD_LIKELY( !try_remove ) ) continue; - - /* At this point, we observed key had a tstone in the meta above. - So we try to remove it. It is possible (though extremely - unlikely for big sparse maps and the vanilla thread partitioning - here) that a remove on another thread got key first. So it is - okay if this fails. We have to use the parallel version of this - (even if it is highly unlikely to interfere with other threads) - for the same reason we had to use a non-blocking query above. */ - - fd_vinyl_meta_query_t query[1]; - remove_cnt += (ulong)!fd_vinyl_meta_remove( meta, &key, query, FD_MAP_FLAG_BLOCKING ); - } - - *_rlocal = remove_cnt; - -} FD_MAP_END { - - ulong * _rlocal = (ulong *) arg[0]; - ulong const * _rremote = (ulong const *)_r1; - - *_rlocal += *_rremote; - -} FD_REDUCE_END - -ulong -fd_vinyl_recover( fd_tpool_t * tpool, ulong t0, ulong t1, int level, - fd_vinyl_t * vinyl ) { - - fd_vinyl_meta_t * meta = vinyl->meta; - ulong line_cnt = vinyl->line_cnt; - - ulong ele_max = meta->ele_max; - ulong lock_cnt = meta->lock_cnt; - - /* Using all avaialble threads, flush the lines and meta cache. We do - the meta flush locked so we don't confuse any concurrent meta - readers. This will claim any existing locks (e.g. the previous - meta writer died while holding a lock and the user didn't clean it - up before calling this). 
*/ - - ulong reclaim_cnt; - - FD_FOR_ALL ( fd_vinyl_recover_line_task, tpool,t0,t1, 0L,(long)line_cnt, vinyl ); - FD_MAP_REDUCE( fd_vinyl_recover_reclaim_task, tpool,t0,t1, 0L,(long)lock_cnt, &reclaim_cnt, vinyl ); - FD_FOR_ALL ( fd_vinyl_recover_meta_flush_task, tpool,t0,t1, 0L,(long)ele_max, vinyl ); - FD_FOR_ALL ( fd_vinyl_recover_unlock_task, tpool,t0,t1, 0L,(long)lock_cnt, vinyl ); - - if( FD_UNLIKELY( reclaim_cnt ) ) FD_LOG_WARNING(( "reclaimed %lu locks (dead writer?); attempting to continue", reclaim_cnt )); - - /* FIXME: should this fail if it detects in progress io? */ - - /* If there is only 1 thread provided or the bstream past doesn't - have a valid partitioning, use the serial recovery algorithm */ - -t1 = t0 + 1UL; /* Turn off parallel recovery while it is untested */ - - if( FD_UNLIKELY( (t1-t0)<=1UL ) || - FD_UNLIKELY( fd_vinyl_recover_test( vinyl->io ) || - !FD_HAS_ATOMIC ) ) { - fd_vinyl_data_reset( tpool,t0,t1, level, vinyl->data ); - return fd_vinyl_recover_serial( vinyl ); - } - -# if FD_HAS_ATOMIC - - /* The parallel recovery of bstream partition may leave tstones in the - meta elements. To clean this up, we have two options. - - Option 1 (simplest and most robust): we parallel scan all the meta - elements in parallel for tstones and remove them. We might have to - do more than one pass because the removal of elements could mean - some elements are not placed well. This requires no scratch (and - thus is more robust against arbitrary erase / move patterns in the - recovery region). 
While it isn't any less algo inefficient - (because we paralllel scan all the elements already to clear them), - it is pracitcally less efficient for applications access patterns - that don't generate many tombstones and/or have pair_cnt<io ); - - ulong rtmp[4]; - ulong lock[2]; - - lock[0] = 0UL; - lock[1] = seq; - - FD_MAP_REDUCE( fd_vinyl_recover_part_task, tpool,t0,t1, 0L,(long)(t1-t0), rtmp, vinyl, lock ); - - ulong fail = rtmp[0]; - if( FD_UNLIKELY( fail ) ) { - FD_LOG_WARNING(( "parallel recovery failed; attempting serial recovery" )); - - /* Reset the meta from whatever messy state failed parallel recovery - left it */ - - FD_MAP_REDUCE( fd_vinyl_recover_reclaim_task, tpool,t0,t1, 0L,(long)lock_cnt, &reclaim_cnt, vinyl ); - FD_FOR_ALL ( fd_vinyl_recover_meta_flush_task, tpool,t0,t1, 0L,(long)ele_max, vinyl ); - FD_FOR_ALL ( fd_vinyl_recover_unlock_task, tpool,t0,t1, 0L,(long)lock_cnt, vinyl ); - - fd_vinyl_data_reset( tpool,t0,t1, level, vinyl->data ); - - return fd_vinyl_recover_serial( vinyl ); - } - - vinyl->pair_cnt = rtmp[1]; - vinyl->garbage_sz = rtmp[2]; - - ulong tstone_rem = rtmp[3]; - - while( tstone_rem ) { - FD_FOR_ALL( fd_vinyl_recover_meta_cleanup_task, tpool,t0,t1, 0L,(long)ele_max, rtmp, vinyl ); - tstone_rem -= rtmp[0]; - } - - /* Reset the data cache to clean up any scratch usage (currently none - but no reason to do earlier) */ - - fd_vinyl_data_reset( tpool,t0,t1, level, vinyl->data ); - - return seq; - -# endif /* FD_HAS_ATOMIC */ -} diff --git a/src/vinyl/fd_vinyl_recover_serial.c b/src/vinyl/fd_vinyl_recover_serial.c deleted file mode 100644 index 39fc6056893..00000000000 --- a/src/vinyl/fd_vinyl_recover_serial.c +++ /dev/null @@ -1,351 +0,0 @@ -/* This is included directly by fd_vinyl_recover.c */ - -ulong -fd_vinyl_recover_serial( fd_vinyl_t * vinyl ) { - - /* Iterate over the bstream's past to populate the meta. 
Note that - our caller flushed the meta cache, data cache and reset the cache - line eviction priorities to their default. */ - - fd_vinyl_meta_t * meta = vinyl->meta; - fd_vinyl_line_t * line = vinyl->line; - fd_vinyl_io_t * io = vinyl->io; - - ulong line_cnt = vinyl->line_cnt; - ulong pair_max = vinyl->pair_max; - - ulong io_seed = fd_vinyl_io_seed ( io ); - ulong seq_past = fd_vinyl_io_seq_past ( io ); - ulong seq_present = fd_vinyl_io_seq_present( io ); - - fd_vinyl_meta_ele_t * ele0 = meta->ele; - ulong ele_max = meta->ele_max; - ulong meta_seed = meta->seed; - ulong * lock = meta->lock; - int lock_shift = meta->lock_shift; - - ulong seq = seq_past; - ulong pair_cnt = 0UL; - ulong garbage_sz = 0UL; - - while( fd_vinyl_seq_lt( seq, seq_present ) ) { - - /* At this point, we've recovered [seq_past,seq) and still need - recover [seq,seq_present) (non-empty). Read the block at seq. */ - - fd_vinyl_bstream_block_t block[1]; - - fd_vinyl_io_read_imm( io, seq, block, FD_VINYL_BSTREAM_BLOCK_SZ ); - - ulong ctl = block->ctl; - - int type = fd_vinyl_bstream_ctl_type( ctl ); - - switch( type ) { - - case FD_VINYL_BSTREAM_CTL_TYPE_PAIR: { - - /* Notes: - - - It is okay if we are in a move (move block processing the - previous iteration already confirmed this is the proper pair. - - - We could rewind the bstream to seq on truncation - automatically but then we might have failed to recover the - most recent pair and thus have recovered to a state that does - not correspond to the bstream's past. We instead kick this - to the user to decide if they want to discard an incompletely - written pair or not. 
*/ - - ulong pair_val_esz = fd_vinyl_bstream_ctl_sz( ctl ); - - ulong pair_sz = fd_vinyl_bstream_pair_sz( pair_val_esz ); - - if( FD_UNLIKELY( pair_sz > (seq_present-seq) ) ) { /* Wrapping safe */ - FD_LOG_WARNING(( "%016lx: truncated", seq )); - goto done; - } - - fd_vinyl_bstream_block_t _ftr[1]; - fd_vinyl_bstream_block_t * ftr = _ftr; - - if( pair_sz <= FD_VINYL_BSTREAM_BLOCK_SZ ) ftr = block; - else fd_vinyl_io_read_imm( io, seq + pair_sz - FD_VINYL_BSTREAM_BLOCK_SZ, ftr, FD_VINYL_BSTREAM_BLOCK_SZ ); - - char const * _err = fd_vinyl_bstream_pair_test_fast( io_seed, seq, block, ftr ); - if( FD_UNLIKELY( _err ) ) { - FD_LOG_WARNING(( "%016lx: %s", seq, _err )); - goto done; - } - - /* At this point, we appear to have valid completely written pair. - Extract the pair metadata and determine if this replaces a - version we've already seen. Since this single threaded, we can - use the single threaded optimized meta APIs here. */ - - fd_vinyl_key_t const * pair_key = &block->phdr.key; - - ulong pair_memo = fd_vinyl_key_memo( meta_seed, pair_key ); - - ulong _ele_idx; /* avoid pointer escape */ - int err = fd_vinyl_meta_query_fast( ele0, ele_max, pair_key, pair_memo, &_ele_idx ); - ulong ele_idx = _ele_idx; - - if( FD_LIKELY( err==FD_VINYL_ERR_KEY ) ) { - - /* This is the first time we've seen pair key or pair key was - erased in a previous iteration (e.g. we most recently - processed an erase for pair key or we are in a move). If we - have room for pair key, insert it into the meta at ele_idx. 
*/ - - if( FD_UNLIKELY( pair_cnt>=pair_max ) ) { - FD_LOG_WARNING(( "%016lx: increase pair_max", seq )); - goto done; - } - - ele0[ ele_idx ].memo = pair_memo; - ele0[ ele_idx ].phdr = block->phdr; - ele0[ ele_idx ].seq = seq; - ele0[ ele_idx ].line_idx = ULONG_MAX; /* key-val not in cache */ - - pair_cnt++; - - } else if( FD_LIKELY( !err ) ) { - - /* This is a more recent version of a pair we saw previously and - meta element ele_idx currently maps pair key to this previous - version. Mark the old version as garbage to collect in the - future and update the mapping to this version. */ - - ulong old_pair_ctl = ele0[ ele_idx ].phdr.ctl; - - ulong old_pair_val_esz = fd_vinyl_bstream_ctl_sz( old_pair_ctl ); - - garbage_sz += fd_vinyl_bstream_pair_sz( old_pair_val_esz ); - - //ele0[ ele_idx ].memo = pair_memo; /* already current */ - ele0[ ele_idx ].phdr = block->phdr; - ele0[ ele_idx ].seq = seq; - //ele0[ ele_idx ].line_idx = ULONG_MAX; /* already current */ - - } else { - - FD_LOG_WARNING(( "%016lx: corrupt meta", seq )); - goto done; - - } - - seq += pair_sz; - break; - - } - - case FD_VINYL_BSTREAM_CTL_TYPE_DEAD: { - - char const * _err = fd_vinyl_bstream_dead_test( io_seed, seq, block ); - if( FD_UNLIKELY( _err ) ) { - FD_LOG_WARNING(( "%016lx: %s", seq, _err )); - goto done; - } - - /* At this point, we appear to have a valid DEAD block. Look up - the pair it erases. 
*/ - - ulong pair_val_esz = fd_vinyl_bstream_ctl_sz( block->dead.phdr.ctl ); - - fd_vinyl_key_t const * pair_key = &block->dead.phdr.key; - - ulong pair_memo = fd_vinyl_key_memo( meta_seed, pair_key ); - - ulong _ele_idx; /* avoid pointer escape */ - int err = fd_vinyl_meta_query_fast( ele0, ele_max, pair_key, pair_memo, &_ele_idx ); - ulong ele_idx = _ele_idx;; - - if( FD_LIKELY( err==FD_VINYL_ERR_KEY ) ) { - - /* This erases the most recent version of pair key in the - bstream's antiquity or is a redundant erase block (which is - arguably an error but, as we can't tell the difference at - this point, we assume the more likely antiquity case). In - short, there's nothing to do but mark this block as garbage - to collect in the future. */ - - garbage_sz += FD_VINYL_BSTREAM_BLOCK_SZ; - - } else { - - /* This erases the most recent version of pair key we've - processed. Validate the erasure target is correct. If so, - mark this block and that version of pair key as garbage for - future collection and remove pair key from the meta. */ - - int bad_order = fd_vinyl_seq_ge( ele0[ ele_idx ].seq, seq ); - int bad_phdr = !!memcmp( &ele0[ ele_idx ].phdr, &block->dead.phdr, sizeof(fd_vinyl_bstream_phdr_t) ); - - if( FD_UNLIKELY( bad_order | bad_phdr ) ) { - FD_LOG_WARNING(( "%016lx: %s", seq, bad_order ? 
"unordered sequence" : "mismatched dead pair metadata" )); - goto done; - } - - garbage_sz += FD_VINYL_BSTREAM_BLOCK_SZ + fd_vinyl_bstream_pair_sz( pair_val_esz ); - - fd_vinyl_meta_remove_fast( ele0, ele_max, lock, lock_shift, line, line_cnt, ele_idx ); - - FD_CRIT( pair_cnt, "corruption detected" ); - pair_cnt--; - - } - - seq += FD_VINYL_BSTREAM_BLOCK_SZ; - break; - - } - - case FD_VINYL_BSTREAM_CTL_TYPE_MOVE: { - - if( FD_UNLIKELY( 2UL*FD_VINYL_BSTREAM_BLOCK_SZ > (seq_present-seq) ) ) { /* Wrapping safe */ - FD_LOG_WARNING(( "%016lx: truncated", seq )); - goto done; - } - - fd_vinyl_bstream_block_t dst[1]; - - fd_vinyl_io_read_imm( io, seq + FD_VINYL_BSTREAM_BLOCK_SZ, dst, FD_VINYL_BSTREAM_BLOCK_SZ ); - - char const * _err = fd_vinyl_bstream_move_test( io_seed, seq, block, dst ); - if( FD_UNLIKELY( _err ) ) { - FD_LOG_WARNING(( "%016lx: %s", seq, _err )); - goto done; - } - - /* At this point, we appear to have a valid move. Technically, a - move is an atomic "erase pair src_key if any, erase pair - dst_key if any, insert pair dst_key with the info src_info_old - and val src_val_new" where src_val_new is typically the same as - src_val_old, but, strictly speaking, doesn't have to be. - - We do the "erase pair src_key if any" part of the move here. - The next iteration will handle rest naturally (including doing - more extensive validation on the new pair_dst). Note that if - the next iteration detects the new pair dst is invalid, it will - fail recovery in the middle of the move. So applications - should be very wary of using a partial recovery as such can - break move atomicity. 
*/ - - ulong src_val_esz = fd_vinyl_bstream_ctl_sz( block->move.src.ctl ); - fd_vinyl_key_t const * src_key = &block->move.src.key; - - ulong src_memo = fd_vinyl_key_memo( meta_seed, src_key ); - - ulong _ele_idx; /* avoid pointer escape */ - int err = fd_vinyl_meta_query_fast( ele0, ele_max, src_key, src_memo, &_ele_idx ); - ulong ele_idx = _ele_idx; - - if( FD_LIKELY( err==FD_VINYL_ERR_KEY ) ) { - - /* This move erases the most recent version of pair src_key in - the bstream's antiquity or is a redundant move block (which - is arguably an error but, as we can't tell the difference at - this point, we assume the more likely antiquity case). In - short, there's nothing to do but mark this block as garbage - to collect in the future. */ - - garbage_sz += FD_VINYL_BSTREAM_BLOCK_SZ; - - } else { - - /* This move erases the most recent version of pair src_key - we've processed. Validate the erasure target is correct. If - so, mark this block and this version of pair src_key as - garbage for future collection and remove pair src_key from - the meta. */ - - int bad_order = fd_vinyl_seq_ge( ele0[ ele_idx ].seq, seq ); - int bad_cnt = !pair_cnt; - int bad_phdr = !!memcmp( &ele0[ ele_idx ].phdr, &block->move.src, sizeof(fd_vinyl_bstream_phdr_t) ); - - if( FD_UNLIKELY( bad_order | bad_cnt | bad_phdr ) ) { - FD_LOG_WARNING(( "%016lx: %s", seq, bad_order ? "unordered sequence" : - bad_cnt ? "corrupt meta" : - "mismatched move src metadata" )); - goto done; - } - - garbage_sz += FD_VINYL_BSTREAM_BLOCK_SZ + fd_vinyl_bstream_pair_sz( src_val_esz ); - - fd_vinyl_meta_remove_fast( ele0, ele_max, lock, lock_shift, line, line_cnt, ele_idx ); - - pair_cnt--; - - } - - /* At this point, we've handled the "erase old src if any" part of - the move. The next iteration will handle the "erase old dst if - any" and the "insert new dst" part of the move. We know there - will be a next iteration for a type pair object with the - appropriate mojo because of the checks we've already done. 
So - moves behave atomically from the point of view of the - application when fully recovered. */ - - seq += FD_VINYL_BSTREAM_BLOCK_SZ; - break; - - } - - case FD_VINYL_BSTREAM_CTL_TYPE_PART: { - - if( FD_UNLIKELY( fd_vinyl_seq_ne( block->part.seq, seq ) ) ) { - FD_LOG_WARNING(( "%016lx: unexpected part seq", seq )); - goto done; - } - - char const * _err = fd_vinyl_bstream_part_test( io_seed, seq, block ); - if( FD_UNLIKELY( _err ) ) { - FD_LOG_WARNING(( "%016lx: %s", seq, _err )); - goto done; - } - - garbage_sz += FD_VINYL_BSTREAM_BLOCK_SZ; - seq += FD_VINYL_BSTREAM_BLOCK_SZ; - break; - - } - - case FD_VINYL_BSTREAM_CTL_TYPE_ZPAD: { - - char const * _err = fd_vinyl_bstream_zpad_test( io_seed, seq, block ); - if( FD_UNLIKELY( _err ) ) { - FD_LOG_WARNING(( "%016lx: %s", seq, _err )); - goto done; - } - - /* Note: zpad blocks aren't included in garbage_sz because we - don't control when they get created (and thus can't easily - update garbage_sz to account for them when they are created). */ - - seq += FD_VINYL_BSTREAM_BLOCK_SZ; - break; - - } - - default: - FD_LOG_WARNING(( "%016lx: unknown type (%x)", seq, (uint)type )); - goto done; - } - } - -done: - - /* At this point, the meta is populated appropriately up to seq. - Update the vinyl state and return. If we did not get to - seq_present, we log a warning. */ - - vinyl->pair_cnt = pair_cnt; - vinyl->garbage_sz = garbage_sz; - - if( FD_UNLIKELY( fd_vinyl_seq_ne( seq, seq_present ) ) ) - FD_LOG_WARNING(( "recovery failed, recovered [%016lx,%016lx)/%lu, unrecovered [%016lx,%016lx)/%lu", - seq_past, seq, seq-seq_past, seq, seq_present, seq_present-seq )); - - return seq; -} diff --git a/src/vinyl/line/fd_vinyl_line.h b/src/vinyl/line/fd_vinyl_line.h index 6a7302ccc6f..cac1e5ab8cf 100644 --- a/src/vinyl/line/fd_vinyl_line.h +++ b/src/vinyl/line/fd_vinyl_line.h @@ -83,7 +83,10 @@ acquired-for-read ref times. 
*/ struct __attribute__((aligned(32))) fd_vinyl_line { - fd_vinyl_data_obj_t * obj; /* location in the data cache of the data_obj storing val, NULL if not caching a pair */ + union { + fd_vinyl_data_obj_t * obj; /* location in the data cache of the data_obj storing val, NULL if not caching a pair */ + ulong obj_gaddr; /* same, as a global address for cross-address-space use */ + }; ulong ele_idx; /* map element storing key and the pair metadata (app and key), in [0,map_cnt) */ ulong ctl; /* packs the line version and line reference count */ uint line_idx_older; /* older line in eviction sequence, in [0,line_cnt) */ @@ -92,6 +95,8 @@ struct __attribute__((aligned(32))) fd_vinyl_line { typedef struct fd_vinyl_line fd_vinyl_line_t; +FD_STATIC_ASSERT( sizeof(fd_vinyl_line_t)==32UL, layout ); + FD_PROTOTYPES_BEGIN /* fd_vinyl_line_ctl returns ver and ref encoded as a line ctl. ver is diff --git a/src/vinyl/test_vinyl_req.c b/src/vinyl/test_vinyl_req.c deleted file mode 100644 index 5fc56dd8094..00000000000 --- a/src/vinyl/test_vinyl_req.c +++ /dev/null @@ -1,1101 +0,0 @@ -#include "fd_vinyl.h" - -#define PAIR_MAX (4UL) - -struct pair { - int creating; - long acq; - ulong ver; - ulong val_max; - fd_vinyl_key_t key [ 1 ]; - fd_vinyl_info_t info [ 1 ]; - fd_vinyl_info_t backup_info[ 1 ]; - uchar val [ FD_VINYL_VAL_MAX ]; - uchar backup_val [ FD_VINYL_VAL_MAX ]; -}; - -typedef struct pair pair_t; - -static struct { - ulong quota_rem; - ulong used; - pair_t pair[ PAIR_MAX ]; -} ref; - -static int -req( int type, /* request type */ - ulong flags, /* request flags */ - ulong val_max, /* for acquire-with-modify */ - fd_vinyl_key_t const * key, /* Key for req */ - pair_t ** _pair, /* Location of pair on successful acquire and/or successful try */ - ulong * _ver ) { /* Version of pair on try, has try version on test, ignored otherwise */ - - switch( type ) { - - case FD_VINYL_REQ_TYPE_ACQUIRE: { - if( !ref.quota_rem ) return FD_VINYL_ERR_FULL; /* (comp err) client quota exhausted 
*/ - - if( fd_vinyl_req_flag_modify( flags ) && (val_max>FD_VINYL_VAL_MAX) ) return FD_VINYL_ERR_INVAL; /* bad req val_max */ - - ulong idx = 0UL; - pair_t * pair = NULL; - for( ; idx> idx) & 1UL) && fd_vinyl_key_eq( key, ref.pair[ idx ].key ) ) { - pair = &ref.pair[ idx ]; - break; - } - } - - if( pair ) { - - if( !fd_vinyl_req_flag_modify( flags ) ) { - - /* start blocking read of an existing pair */ - - if( pair->acq < 0L ) return FD_VINYL_ERR_AGAIN; /* key acquired for modify */ - - ulong ref_szc = fd_vinyl_data_szc( (ulong)pair->info->val_sz ); - pair->val_max = fd_vinyl_data_szc_val_max( ref_szc ); - - FD_TEST( !pair->creating ); - pair->acq++; - //pair->ver unchanged - - ref.quota_rem--; - - *_pair = pair; - return FD_VINYL_SUCCESS; - - } - - /* start modify of an existing pair */ - - if( pair->acq ) return FD_VINYL_ERR_AGAIN; /* key acquired at least once */ - if( fd_vinyl_req_flag_excl( flags ) ) return FD_VINYL_ERR_INVAL; /* not allowed to modify existing */ - - pair->backup_info[0] = pair->info[0]; - if( pair->info->val_sz ) memcpy( pair->backup_val, pair->val, (ulong)pair->info->val_sz ); - - if( fd_vinyl_req_flag_ignore( flags ) ) pair->info->val_sz = 0U; - - ulong ref_szc = fd_vinyl_data_szc( fd_ulong_max( val_max, (ulong)pair->info->val_sz ) ); - pair->val_max = fd_vinyl_data_szc_val_max( ref_szc ); - - FD_TEST( !pair->creating ); - pair->acq = -1L; - pair->ver++; - - ref.quota_rem--; - - *_pair = pair; - return FD_VINYL_SUCCESS; - - } - - /* start creating a pair */ - - if( !(fd_vinyl_req_flag_modify( flags ) && fd_vinyl_req_flag_create( flags )) ) return FD_VINYL_ERR_KEY; - - FD_TEST( (ulong)fd_ulong_popcnt( ref.used ) < PAIR_MAX ); - idx = (ulong)fd_ulong_find_lsb( ~ref.used ); - pair = &ref.pair[ idx ]; - ref.used |= (1UL << idx); - - pair->key[0] = key[0]; - memset( pair->info, 0UL, sizeof(fd_vinyl_info_t) ); - - ulong ref_szc = fd_vinyl_data_szc( val_max ); - pair->val_max = fd_vinyl_data_szc_val_max( ref_szc ); - - pair->creating = 1; - 
pair->acq = -1L; - pair->ver++; - - ref.quota_rem--; - - *_pair = pair; - return FD_VINYL_SUCCESS; - } - - case FD_VINYL_REQ_TYPE_RELEASE: { - ulong idx = 0UL; - pair_t * pair = NULL; - for( ; idx> idx) & 1UL) && fd_vinyl_key_eq( key, ref.pair[ idx ].key ) ) { - pair = &ref.pair[ idx ]; - break; - } - } - - if( !pair ) return FD_VINYL_ERR_INVAL; /* Key does not exist and is not being created (cannot have been acquired) */ - if( !pair->acq ) return FD_VINYL_ERR_INVAL; /* Key is not acquired */ - - if( pair->acq > 0L ) { - - /* finish blocking read */ - - if( fd_vinyl_req_flag_modify( flags ) ) FD_LOG_CRIT(( "modify read only" )); - - FD_TEST( !pair->creating ); - - pair->acq--; - - ref.quota_rem++; - return FD_VINYL_SUCCESS; - - } - - if( pair->creating ) { - - if( ((!fd_vinyl_req_flag_modify( flags )) | fd_vinyl_req_flag_erase( flags )) ) { - - /* cancel / erase a create */ - - pair->ver++; - - ref.used &= ~(1UL << idx); - - ref.quota_rem++; - return FD_VINYL_SUCCESS; - - } - - /* finish a create */ - - if( (ulong)pair->info->val_sz > pair->val_max ) FD_LOG_CRIT(( "val buffer overrun" )); - - ulong ref_szc = fd_vinyl_data_szc( (ulong)pair->info->val_sz ); - pair->val_max = fd_vinyl_data_szc_val_max( ref_szc ); - - pair->creating = 0; - pair->acq = 0L; - pair->ver++; - - ref.quota_rem++; - return FD_VINYL_SUCCESS; - - } - - if( !fd_vinyl_req_flag_modify( flags ) ) { - - if( !fd_vinyl_req_flag_ignore( flags ) ) { - - /* cancel modify existing (info/val were not clobbered during the modify attempt) */ - - pair->acq = 0L; - pair->ver--; - - ref.quota_rem++; - return FD_VINYL_SUCCESS; - - } - - /* cancel modify existing (info/val were potentially clobbered during the modify attempt) */ - - pair->info[0] = pair->backup_info[0]; - if( pair->backup_info->val_sz ) memcpy( pair->val, pair->backup_val, (ulong)pair->backup_info->val_sz ); - - ulong ref_szc = fd_vinyl_data_szc( (ulong)pair->info->val_sz ); - pair->val_max = fd_vinyl_data_szc_val_max( ref_szc ); - - pair->acq = 
0L; - pair->ver++; - - ref.quota_rem++; - return FD_VINYL_SUCCESS; - - } - - if( fd_vinyl_req_flag_erase( flags ) ) { - - /* erase existing */ - - pair->ver++; - - ref.used &= ~(1UL<info->val_sz > pair->val_max ) FD_LOG_CRIT(( "val buffer overrun" )); - - ulong ref_szc = fd_vinyl_data_szc( (ulong)pair->info->val_sz ); - pair->val_max = fd_vinyl_data_szc_val_max( ref_szc ); - - pair->acq = 0L; - pair->ver++; - - ref.quota_rem++; - return FD_VINYL_SUCCESS; - } - - case FD_VINYL_REQ_TYPE_ERASE: { - ulong idx = 0UL; - pair_t * pair = NULL; - for( ; idx> idx) & 1UL) && fd_vinyl_key_eq( key, ref.pair[ idx ].key ) ) { - pair = &ref.pair[ idx ]; - break; - } - } - - if( !pair ) return FD_VINYL_ERR_KEY; /* Key does not exist */ - if( pair->acq ) return FD_VINYL_ERR_AGAIN; /* Key acquired at least once */ - - pair->ver++; - - ref.used &= ~(1UL<> dst_idx) & 1UL) && fd_vinyl_key_eq( dst_key, ref.pair[ dst_idx ].key ) ) { - dst_pair = &ref.pair[ dst_idx ]; - break; - } - } - - if( dst_pair && dst_pair->acq ) return FD_VINYL_ERR_AGAIN; - - /* Lookup pair src_key. If it doesn't exist, fail with KEY. If it - is acquired, fail with AGAIN. */ - - ulong src_idx = 0UL; - pair_t * src_pair = NULL; - for( ; src_idx> src_idx) & 1UL) && fd_vinyl_key_eq( src_key, ref.pair[ src_idx ].key ) ) { - src_pair = &ref.pair[ src_idx ]; - break; - } - } - - if( !src_pair ) return FD_VINYL_ERR_KEY; - if( src_pair->acq ) return FD_VINYL_ERR_AGAIN; - - /* At this point: - - pair dst_key may or may not exist. If it exists it is not - acquired. - - pair src_key exists and is not acquired. - Thus we are clear to move. Erase pair dst_key if it exists. Then - rename pair src_key to pair dst_key. 
*/ - - if( dst_pair ) { - dst_pair->ver++; - ref.used &= ~(1UL<key[0] = dst_key[0]; - src_pair->ver++; - - return FD_VINYL_SUCCESS; - } - - case FD_VINYL_REQ_TYPE_FETCH: { - return FD_VINYL_SUCCESS; - } - - case FD_VINYL_REQ_TYPE_FLUSH: { - return FD_VINYL_SUCCESS; - } - - case FD_VINYL_REQ_TYPE_TRY: { - ulong idx = 0UL; - pair_t * pair = NULL; - for( ; idx> idx) & 1UL) && fd_vinyl_key_eq( key, ref.pair[ idx ].key ) ) { - pair = &ref.pair[ idx ]; - break; - } - } - - if( !pair ) return FD_VINYL_ERR_KEY; /* Key does not exist */ - if( pair->acq<0L ) return FD_VINYL_ERR_AGAIN; /* Key acquired-for-modify */ - - *_pair = pair; - *_ver = pair->ver; - - return FD_VINYL_SUCCESS; - } - - case FD_VINYL_REQ_TYPE_TEST: { - - if( (*_pair)->ver != (*_ver) ) return FD_VINYL_ERR_CORRUPT; /* Key modified during the try */ - - return FD_VINYL_SUCCESS; - } - - default: - break; - } - - return FD_VINYL_ERR_INVAL; -} - -static int -fd_vinyl_tile( int argc, - char ** argv ) { - (void)argc; - fd_vinyl_exec( (fd_vinyl_t *)argv ); - return 0; -} - -static void -client_tile( ulong iter_max, - fd_cnc_t * cnc, - ulong link_id, - fd_vinyl_rq_t * rq, - fd_vinyl_cq_t * cq, - fd_wksp_t * wksp, - void * _scratch ) { - fd_rng_t _rng[1]; fd_rng_t * rng = fd_rng_join( fd_rng_new( _rng, 0U, 0UL ) ); - - uchar * top = (uchar *)_scratch; - - fd_vinyl_key_t * _key = (fd_vinyl_key_t *) top; top += sizeof(fd_vinyl_key_t)*PAIR_MAX; - ulong * _try_gaddr = (ulong *) top; top += sizeof(ulong)*2UL *PAIR_MAX; - - fd_vinyl_comp_t * comp = (fd_vinyl_comp_t *)top; top += sizeof(fd_vinyl_comp_t); - ulong * val_gaddr = (ulong *) top; top += sizeof(ulong); - schar * err = (schar *) top; top += sizeof(schar); - - ulong comp_gaddr = fd_wksp_gaddr( wksp, comp ); - ulong val_gaddr_gaddr = fd_wksp_gaddr( wksp, val_gaddr ); - ulong err_gaddr = fd_wksp_gaddr( wksp, err ); - - ulong cq_seq = fd_vinyl_cq_seq( cq ); - -# define WAIT do { \ - if( oob ) { \ - while( !FD_VOLATILE_CONST( comp->seq ) ) FD_SPIN_PAUSE(); \ - FD_TEST( 
comp->seq==1UL ); \ - } else { \ - while( fd_vinyl_cq_recv( cq, cq_seq, comp ) ) FD_SPIN_PAUSE(); \ - FD_TEST( comp->seq==cq_seq ); \ - cq_seq++; \ - } \ - FD_TEST( comp->req_id ==req_id ); \ - FD_TEST( comp->link_id==link_id ); \ - } while(0) - - ulong val_max_bad = FD_VINYL_VAL_MAX+1UL; - - long acq [ PAIR_MAX ]; for( ulong idx=0UL; idx>= 2; - ulong dst_idx = (r & 3UL); r >>= 2; - int op = (int)(r & 63UL); r >>= 6; - int by_key = (int)(r & 1UL); r >>= 1; - int do_mod = (int)(r & 1UL); r >>= 1; - ulong oob = (int)(r & 1UL) ? comp_gaddr : 0UL; r >>= 1; - ulong flags = (r & 255UL); r >>= 8; - ulong val_max = (r & (ulong)UINT_MAX) % (FD_VINYL_VAL_MAX+1UL); r >>= 32; - int pat = (int)(r & 255UL); r >>= 8; - - fd_vinyl_key_t * src_key = _key + src_idx; - fd_vinyl_key_t * dst_key = _key + dst_idx; - ulong * try_gaddr = _try_gaddr + 2UL*src_idx; - - ulong src_key_gaddr = fd_wksp_gaddr( wksp, src_key ); - ulong dst_key_gaddr = fd_wksp_gaddr( wksp, dst_key ); - ulong try_gaddr_gaddr = fd_wksp_gaddr( wksp, try_gaddr ); - - comp->seq = 0UL; - - switch( op ) { - - case 0: /* mismatched link id (dropped and ticks the DROP_LINK counter) */ - val_gaddr[0] = val_max; - fd_vinyl_rq_send( rq, req_id, ~link_id, FD_VINYL_REQ_TYPE_ACQUIRE, flags, 1UL, - src_key_gaddr, val_gaddr_gaddr, err_gaddr, oob ); - break; - - case 1: /* unmappable oob completion (dropped and ticks the DROP_COMP counter) */ - val_gaddr[0] = val_max; - fd_vinyl_rq_send( rq, req_id, link_id, FD_VINYL_REQ_TYPE_ACQUIRE, flags, 1UL, - src_key_gaddr, val_gaddr_gaddr, err_gaddr, ULONG_MAX ); - break; - - case 2: /* bad request type */ - fd_vinyl_rq_send( rq, req_id, link_id, -1, flags, 1UL, - src_key_gaddr, val_gaddr_gaddr, err_gaddr, oob ); WAIT; - FD_TEST( comp->err ==FD_VINYL_ERR_INVAL ); FD_TEST( comp->batch_cnt==(ushort)1 ); - FD_TEST( comp->fail_cnt ==(ushort)0 ); FD_TEST( comp->quota_rem==(ushort)ref.quota_rem ); - break; - - /* acquire tests */ - - case 3: /* acquire with unmappable key */ - val_gaddr[0] = 
val_max; - fd_vinyl_rq_send( rq, req_id, link_id, FD_VINYL_REQ_TYPE_ACQUIRE, flags, 1UL, - 0UL, val_gaddr_gaddr, err_gaddr, oob ); WAIT; - FD_TEST( comp->err ==FD_VINYL_ERR_INVAL ); FD_TEST( comp->batch_cnt==(ushort)1 ); - FD_TEST( comp->fail_cnt ==(ushort)0 ); FD_TEST( comp->quota_rem==(ushort)ref.quota_rem ); - break; - - case 4: /* acquire with unmappable val */ - val_gaddr[0] = val_max; - fd_vinyl_rq_send( rq, req_id, link_id, FD_VINYL_REQ_TYPE_ACQUIRE, flags, 1UL, - src_key_gaddr, 0UL, err_gaddr, oob ); WAIT; - FD_TEST( comp->err ==FD_VINYL_ERR_INVAL ); FD_TEST( comp->batch_cnt==(ushort)1 ); - FD_TEST( comp->fail_cnt ==(ushort)0 ); FD_TEST( comp->quota_rem==(ushort)ref.quota_rem ); - break; - - case 5: /* acquire with unmappable err */ - val_gaddr[0] = val_max; - fd_vinyl_rq_send( rq, req_id, link_id, FD_VINYL_REQ_TYPE_ACQUIRE, flags, 1UL, - src_key_gaddr, val_gaddr_gaddr, 0UL, oob ); WAIT; - FD_TEST( comp->err ==FD_VINYL_ERR_INVAL ); FD_TEST( comp->batch_cnt==(ushort)1 ); - FD_TEST( comp->fail_cnt ==(ushort)0 ); FD_TEST( comp->quota_rem==(ushort)ref.quota_rem ); - break; - - case 6: /* acquire with zero batch */ - val_gaddr[0] = val_max; - fd_vinyl_rq_send( rq, req_id, link_id, FD_VINYL_REQ_TYPE_ACQUIRE, flags, 0UL, - 0UL, 0UL, 0UL, oob ); WAIT; - FD_TEST( comp->err ==FD_VINYL_SUCCESS ); FD_TEST( comp->batch_cnt==(ushort)0 ); - FD_TEST( comp->fail_cnt ==(ushort)0 ); FD_TEST( comp->quota_rem==(ushort)ref.quota_rem ); - break; - - case 7: { /* acquire with bad val_max */ - pair_t * pair; - int ref_err = req( FD_VINYL_REQ_TYPE_ACQUIRE, flags | FD_VINYL_REQ_FLAG_MODIFY, val_max_bad, src_key, &pair, NULL ); - val_gaddr[0] = val_max_bad; - fd_vinyl_rq_send( rq, req_id, link_id, FD_VINYL_REQ_TYPE_ACQUIRE, flags | FD_VINYL_REQ_FLAG_MODIFY, 1UL, - src_key_gaddr, val_gaddr_gaddr, err_gaddr, oob ); WAIT; - if( ref_err==FD_VINYL_ERR_FULL ) { - FD_TEST( comp->err ==FD_VINYL_ERR_FULL ); FD_TEST( comp->batch_cnt==(ushort)1 ); - FD_TEST( comp->fail_cnt ==(ushort)0 ); 
FD_TEST( comp->quota_rem==(ushort)ref.quota_rem ); - break; - } - FD_TEST( comp->err ==FD_VINYL_SUCCESS ); FD_TEST( comp->batch_cnt==(ushort)1 ); - FD_TEST( comp->fail_cnt ==(ushort)1 ); FD_TEST( comp->quota_rem==(ushort)ref.quota_rem ); - FD_TEST( err[0]==(schar)FD_VINYL_ERR_INVAL ); - break; - } - - case 8: { /* acquire */ - pair_t * pair; - int ref_err = req( FD_VINYL_REQ_TYPE_ACQUIRE, flags, val_max, src_key, &pair, NULL ); - val_gaddr[0] = val_max; - fd_vinyl_rq_send( rq, req_id, link_id, FD_VINYL_REQ_TYPE_ACQUIRE, flags, 1UL, - src_key_gaddr, val_gaddr_gaddr, err_gaddr, oob ); WAIT; - if( ref_err==FD_VINYL_ERR_FULL ) { - FD_TEST( comp->err ==FD_VINYL_ERR_FULL ); FD_TEST( comp->batch_cnt==(ushort)1 ); - FD_TEST( comp->fail_cnt ==(ushort)0 ); FD_TEST( comp->quota_rem==(ushort)ref.quota_rem ); - break; - } - FD_TEST( comp->err ==FD_VINYL_SUCCESS ); FD_TEST( comp->batch_cnt==(ushort)1 ); - FD_TEST( comp->fail_cnt ==(ushort)!!ref_err ); FD_TEST( comp->quota_rem==(ushort)ref.quota_rem ); - FD_TEST( err[0]==(schar)ref_err ); - - if( !ref_err ) { - acq_gaddr[ src_idx ] = val_gaddr[0]; - - void * val = fd_wksp_laddr_fast( wksp, val_gaddr[0] ); - fd_vinyl_info_t * info = fd_vinyl_data_info( val ); - ulong val_sz = (ulong)info->val_sz; - ulong val_max = fd_vinyl_data_val_max( val ); - - FD_TEST( val_max==pair->val_max ); - - FD_TEST( !memcmp( info, pair->info, sizeof(fd_vinyl_info_t) ) ); - - if( val_sz ) FD_TEST( !memcmp( val, pair->val, val_sz ) ); - - /* FIXME: TEST [VAL_SZ,VAL_MAX) ZPAD? 
*/ - - if( fd_vinyl_req_flag_modify( flags ) ) { - acq [ src_idx ] = -1L; - acq_modified[ src_idx ] = fd_vinyl_req_flag_ignore( flags ); - if( do_mod ) { - val_sz = fd_rng_ulong_roll( rng, val_max + 1UL ); - memset( info, pat, sizeof(fd_vinyl_info_t) ); memset( pair->info, pat, sizeof(fd_vinyl_info_t) ); - memset( val, pat, val_sz ); memset( pair->val, pat, val_sz ); - info->val_sz = (uint)val_sz; pair->info->val_sz = (uint)val_sz; - acq_modified[ src_idx ] |= 1; - } - } else { - FD_TEST( !memcmp( info, pair->info, sizeof(fd_vinyl_info_t) ) ); - FD_TEST( !memcmp( val, pair->val, val_sz ) ); - acq [ src_idx ]++; - acq_modified[ src_idx ] = 0; - } - } - break; - } - - /* release tests */ - - case 9: /* release with unmappable key */ - fd_vinyl_rq_send( rq, req_id, link_id, FD_VINYL_REQ_TYPE_RELEASE, flags | FD_VINYL_REQ_FLAG_BY_KEY, 1UL, - 0UL, val_gaddr_gaddr, err_gaddr, oob ); WAIT; - FD_TEST( comp->err ==FD_VINYL_ERR_INVAL ); FD_TEST( comp->batch_cnt==(ushort)1 ); - FD_TEST( comp->fail_cnt ==(ushort)0 ); FD_TEST( comp->quota_rem==(ushort)ref.quota_rem ); - break; - - case 10: /* release with unmappable val */ - fd_vinyl_rq_send( rq, req_id, link_id, FD_VINYL_REQ_TYPE_RELEASE, flags & ~FD_VINYL_REQ_FLAG_BY_KEY, 1UL, - src_key_gaddr, 0UL, err_gaddr, oob ); WAIT; - FD_TEST( comp->err ==FD_VINYL_ERR_INVAL ); FD_TEST( comp->batch_cnt==(ushort)1 ); - FD_TEST( comp->fail_cnt ==(ushort)0 ); FD_TEST( comp->quota_rem==(ushort)ref.quota_rem ); - break; - - case 11: /* release with unmappable err */ - fd_vinyl_rq_send( rq, req_id, link_id, FD_VINYL_REQ_TYPE_RELEASE, flags, 1UL, - src_key_gaddr, val_gaddr_gaddr, 0UL, oob ); WAIT; - FD_TEST( comp->err ==FD_VINYL_ERR_INVAL ); FD_TEST( comp->batch_cnt==(ushort)1 ); - FD_TEST( comp->fail_cnt ==(ushort)0 ); FD_TEST( comp->quota_rem==(ushort)ref.quota_rem ); - break; - - case 12: /* release with zero batch */ - fd_vinyl_rq_send( rq, req_id, link_id, FD_VINYL_REQ_TYPE_RELEASE, flags, 0UL, - 0UL, 0UL, 0UL, oob ); WAIT; - FD_TEST( 
comp->err ==FD_VINYL_SUCCESS ); FD_TEST( comp->batch_cnt==(ushort)0 ); - FD_TEST( comp->fail_cnt ==(ushort)0 ); FD_TEST( comp->quota_rem==(ushort)ref.quota_rem ); - break; - - case 13: { /* release */ - if( acq[ src_idx ] > 0L ) flags &= ~FD_VINYL_REQ_FLAG_MODIFY; /* can't say modify on an acquire-for-read */ - if( ((acq[ src_idx ] < 0L) & (!fd_vinyl_req_flag_modify( flags )) & acq_modified[ src_idx ]) ) - flags |= FD_VINYL_REQ_FLAG_IGNORE; - int ref_err = req( FD_VINYL_REQ_TYPE_RELEASE, flags, val_max_bad, src_key, NULL, NULL ); - if( by_key || !acq[ src_idx ] ) { - fd_vinyl_rq_send( rq, req_id, link_id, FD_VINYL_REQ_TYPE_RELEASE, flags | FD_VINYL_REQ_FLAG_BY_KEY, 1UL, - src_key_gaddr, 0UL, err_gaddr, oob ); - } else { - val_gaddr[0] = acq_gaddr[ src_idx ]; - fd_vinyl_rq_send( rq, req_id, link_id, FD_VINYL_REQ_TYPE_RELEASE, flags & ~FD_VINYL_REQ_FLAG_BY_KEY, 1UL, - 0UL, val_gaddr_gaddr, err_gaddr, oob ); - } - WAIT; - FD_TEST( comp->err ==FD_VINYL_SUCCESS ); FD_TEST( comp->batch_cnt==(ushort)1 ); - FD_TEST( comp->fail_cnt ==(ushort)!!ref_err ); FD_TEST( comp->quota_rem==(ushort)ref.quota_rem ); - FD_TEST( err[0]==(schar)ref_err ); - if( !ref_err ) acq[ src_idx ] = (acq[ src_idx ] > 0L) ? 
(acq[ src_idx ]-1L) : 0L; - break; - } - - /* erase tests */ - - case 14: /* erase with unmappable key */ - fd_vinyl_rq_send( rq, req_id, link_id, FD_VINYL_REQ_TYPE_ERASE, flags, 1UL, - 0UL, 0UL, err_gaddr, oob ); WAIT; - FD_TEST( comp->err ==FD_VINYL_ERR_INVAL ); FD_TEST( comp->batch_cnt==(ushort)1 ); - FD_TEST( comp->fail_cnt ==(ushort)0 ); FD_TEST( comp->quota_rem==(ushort)ref.quota_rem ); - break; - - case 15: /* erase with unmappable err */ - fd_vinyl_rq_send( rq, req_id, link_id, FD_VINYL_REQ_TYPE_ERASE, flags, 1UL, - src_key_gaddr, 0UL, 0UL, oob ); WAIT; - FD_TEST( comp->err ==FD_VINYL_ERR_INVAL ); FD_TEST( comp->batch_cnt==(ushort)1 ); - FD_TEST( comp->fail_cnt ==(ushort)0 ); FD_TEST( comp->quota_rem==(ushort)ref.quota_rem ); - break; - - case 16: /* erase with zero batch */ - fd_vinyl_rq_send( rq, req_id, link_id, FD_VINYL_REQ_TYPE_ERASE, flags, 0UL, - src_key_gaddr, 0UL, err_gaddr, oob ); WAIT; - FD_TEST( comp->err ==FD_VINYL_SUCCESS ); FD_TEST( comp->batch_cnt==(ushort)0 ); - FD_TEST( comp->fail_cnt ==(ushort)0 ); FD_TEST( comp->quota_rem==(ushort)ref.quota_rem ); - break; - - case 17: { /* erase */ - int ref_err = req( FD_VINYL_REQ_TYPE_ERASE, flags, val_max_bad, src_key, NULL, NULL ); - fd_vinyl_rq_send( rq, req_id, link_id, FD_VINYL_REQ_TYPE_ERASE, flags, 1UL, - src_key_gaddr, 1UL, err_gaddr, oob ); WAIT; - FD_TEST( comp->err ==FD_VINYL_SUCCESS ); FD_TEST( comp->batch_cnt==(ushort)1 ); - FD_TEST( comp->fail_cnt ==(ushort)!!ref_err ); FD_TEST( comp->quota_rem==(ushort)ref.quota_rem ); - FD_TEST( err[0]==(schar)ref_err ); - break; - } - - /* move tests */ - - case 18: /* move with unmappable src */ - fd_vinyl_rq_send( rq, req_id, link_id, FD_VINYL_REQ_TYPE_MOVE, flags, 1UL, - 0UL, src_key_gaddr, err_gaddr, oob ); WAIT; - FD_TEST( comp->err ==FD_VINYL_ERR_INVAL ); FD_TEST( comp->batch_cnt==(ushort)1 ); - FD_TEST( comp->fail_cnt ==(ushort)0 ); FD_TEST( comp->quota_rem==(ushort)ref.quota_rem ); - break; - - case 19: /* move with unmappable dst */ - 
fd_vinyl_rq_send( rq, req_id, link_id, FD_VINYL_REQ_TYPE_MOVE, flags, 1UL, - src_key_gaddr, 0UL, err_gaddr, oob ); WAIT; - FD_TEST( comp->err ==FD_VINYL_ERR_INVAL ); FD_TEST( comp->batch_cnt==(ushort)1 ); - FD_TEST( comp->fail_cnt ==(ushort)0 ); FD_TEST( comp->quota_rem==(ushort)ref.quota_rem ); - break; - - case 20: /* move with unmappable err */ - fd_vinyl_rq_send( rq, req_id, link_id, FD_VINYL_REQ_TYPE_MOVE, flags, 1UL, - src_key_gaddr, dst_key_gaddr, 0UL, oob ); WAIT; - FD_TEST( comp->err ==FD_VINYL_ERR_INVAL ); FD_TEST( comp->batch_cnt==(ushort)1 ); - FD_TEST( comp->fail_cnt ==(ushort)0 ); FD_TEST( comp->quota_rem==(ushort)ref.quota_rem ); - break; - - case 21: /* move with zero batch */ - fd_vinyl_rq_send( rq, req_id, link_id, FD_VINYL_REQ_TYPE_MOVE, flags, 0UL, - src_key_gaddr, dst_key_gaddr, err_gaddr, oob ); WAIT; - FD_TEST( comp->err ==FD_VINYL_SUCCESS ); FD_TEST( comp->batch_cnt==(ushort)0 ); - FD_TEST( comp->fail_cnt ==(ushort)0 ); FD_TEST( comp->quota_rem==(ushort)ref.quota_rem ); - break; - - case 22: { /* move */ - int ref_err = req( FD_VINYL_REQ_TYPE_MOVE, flags, val_max_bad, src_key, (pair_t **)dst_key, NULL ); - fd_vinyl_rq_send( rq, req_id, link_id, FD_VINYL_REQ_TYPE_MOVE, flags, 1UL, - src_key_gaddr, dst_key_gaddr, err_gaddr, oob ); WAIT; - FD_TEST( comp->err ==FD_VINYL_SUCCESS ); FD_TEST( comp->batch_cnt==(ushort)1 ); - FD_TEST( comp->fail_cnt ==(ushort)!!ref_err ); FD_TEST( comp->quota_rem==(ushort)ref.quota_rem ); - FD_TEST( err[0]==(schar)ref_err ); - break; - } - - /* fetch tests (these are logical no-op / hints and don't generate completions) */ - - case 23: /* fetch with unmappable key */ - fd_vinyl_rq_send( rq, req_id, link_id, FD_VINYL_REQ_TYPE_FETCH, flags, 1UL, 0UL, 0UL, 0UL, oob ); - break; - - case 24: /* fetch with zero batch cnt */ - fd_vinyl_rq_send( rq, req_id, link_id, FD_VINYL_REQ_TYPE_FETCH, flags, 0UL, 0UL, 0UL, 0UL, oob ); - break; - - case 25: /* fetch */ - FD_TEST( !req( FD_VINYL_REQ_TYPE_FETCH, 0UL, 0UL, src_key, NULL, 
NULL ) ); - fd_vinyl_rq_send( rq, req_id, link_id, FD_VINYL_REQ_TYPE_FETCH, flags, 1UL, src_key_gaddr, 0UL, 0UL, oob ); - break; - - /* flush tests (these are logical no-ops / hints and don't generate completions) */ - - case 26: /* flush with unmappable key */ - fd_vinyl_rq_send( rq, req_id, link_id, FD_VINYL_REQ_TYPE_FLUSH, flags, 1UL, 0UL, 0UL, 0UL, oob ); - break; - - case 27: /* flush with zero batch cnt */ - fd_vinyl_rq_send( rq, req_id, link_id, FD_VINYL_REQ_TYPE_FLUSH, flags, 0UL, 0UL, 0UL, 0UL, oob ); - break; - - case 28: /* flush */ - FD_TEST( !req( FD_VINYL_REQ_TYPE_FLUSH, 0UL, 0UL, src_key, NULL, NULL ) ); - fd_vinyl_rq_send( rq, req_id, link_id, FD_VINYL_REQ_TYPE_FLUSH, flags, 1UL, src_key_gaddr, 0UL, 0UL, oob ); - break; - - /* try tests */ - - case 29: /* try with unmappable key */ - fd_vinyl_rq_send( rq, req_id, link_id, FD_VINYL_REQ_TYPE_TRY, flags, 1UL, - 0UL, try_gaddr_gaddr, err_gaddr, oob ); WAIT; - FD_TEST( comp->err ==FD_VINYL_ERR_INVAL ); FD_TEST( comp->batch_cnt==(ushort)1 ); - FD_TEST( comp->fail_cnt ==(ushort)0 ); FD_TEST( comp->quota_rem==(ushort)ref.quota_rem ); - break; - - case 30: /* try with unmappable try */ - fd_vinyl_rq_send( rq, req_id, link_id, FD_VINYL_REQ_TYPE_TRY, flags, 1UL, - src_key_gaddr, 0UL, err_gaddr, oob ); WAIT; - FD_TEST( comp->err ==FD_VINYL_ERR_INVAL ); FD_TEST( comp->batch_cnt==(ushort)1 ); - FD_TEST( comp->fail_cnt ==(ushort)0 ); FD_TEST( comp->quota_rem==(ushort)ref.quota_rem ); - break; - - case 31: /* try with unmappable err */ - fd_vinyl_rq_send( rq, req_id, link_id, FD_VINYL_REQ_TYPE_TRY, flags, 1UL, - src_key_gaddr, try_gaddr_gaddr, 0UL, oob ); WAIT; - FD_TEST( comp->err ==FD_VINYL_ERR_INVAL ); FD_TEST( comp->batch_cnt==(ushort)1 ); - FD_TEST( comp->fail_cnt ==(ushort)0 ); FD_TEST( comp->quota_rem==(ushort)ref.quota_rem ); - break; - - case 32: /* try with zero batch */ - fd_vinyl_rq_send( rq, req_id, link_id, FD_VINYL_REQ_TYPE_TRY, flags, 0UL, - src_key_gaddr, try_gaddr_gaddr, err_gaddr, oob ); WAIT; - 
FD_TEST( comp->err ==FD_VINYL_SUCCESS ); FD_TEST( comp->batch_cnt==(ushort)0 ); - FD_TEST( comp->fail_cnt ==(ushort)0 ); FD_TEST( comp->quota_rem==(ushort)ref.quota_rem ); - break; - - case 33: { /* try */ - int ref_err = req( FD_VINYL_REQ_TYPE_TRY, flags, val_max_bad, src_key, &try_pair[ src_idx ], &try_ver[ src_idx ] ); - fd_vinyl_rq_send( rq, req_id, link_id, FD_VINYL_REQ_TYPE_TRY, flags, 1UL, - src_key_gaddr, try_gaddr_gaddr, err_gaddr, oob ); WAIT; - FD_TEST( comp->err ==FD_VINYL_SUCCESS ); FD_TEST( comp->batch_cnt==(ushort)1 ); - FD_TEST( comp->fail_cnt ==(ushort)!!ref_err ); FD_TEST( comp->quota_rem==(ushort)ref.quota_rem ); - FD_TEST( err[0]==(schar)ref_err ); - if( !ref_err ) try_live[ src_idx ] = 1; - break; - } - - /* test tests */ - - case 34: /* test with unmappable try */ - fd_vinyl_rq_send( rq, req_id, link_id, FD_VINYL_REQ_TYPE_TEST, flags, 1UL, - 0UL, 0UL, err_gaddr, oob ); WAIT; - FD_TEST( comp->err ==FD_VINYL_ERR_INVAL ); FD_TEST( comp->batch_cnt==(ushort)1 ); - FD_TEST( comp->fail_cnt ==(ushort)0 ); FD_TEST( comp->quota_rem==(ushort)ref.quota_rem ); - break; - - case 35: /* test with unmappable err */ - fd_vinyl_rq_send( rq, req_id, link_id, FD_VINYL_REQ_TYPE_TEST, flags, 1UL, - 0UL, try_gaddr_gaddr, 0UL, oob ); WAIT; - FD_TEST( comp->err ==FD_VINYL_ERR_INVAL ); FD_TEST( comp->batch_cnt==(ushort)1 ); - FD_TEST( comp->fail_cnt ==(ushort)0 ); FD_TEST( comp->quota_rem==(ushort)ref.quota_rem ); - break; - - case 36: /* test with zero batch */ - fd_vinyl_rq_send( rq, req_id, link_id, FD_VINYL_REQ_TYPE_TEST, flags, 0UL, - 0UL, try_gaddr_gaddr, err_gaddr, oob ); WAIT; - FD_TEST( comp->err ==FD_VINYL_SUCCESS ); FD_TEST( comp->batch_cnt==(ushort)0 ); - FD_TEST( comp->fail_cnt ==(ushort)0 ); FD_TEST( comp->quota_rem==(ushort)ref.quota_rem ); - break; - - case 37: { /* test */ - if( !try_live[ src_idx ] ) break; - void * try_val = fd_wksp_laddr_fast( wksp, try_gaddr[0] ); - fd_vinyl_info_t * try_info = fd_vinyl_data_info( try_val ); - ulong try_val_sz = 
fd_ulong_min( (ulong)try_info->val_sz, FD_VINYL_VAL_MAX ); - int try_cmp = (!memcmp( try_info, try_pair[ src_idx ]->info, sizeof(fd_vinyl_info_t) )) && - (!memcmp( try_val, try_pair[ src_idx ]->val, try_val_sz )); - int ref_err = req( FD_VINYL_REQ_TYPE_TEST, flags, val_max_bad, NULL, &try_pair[ src_idx ], &try_ver[ src_idx ] ); - fd_vinyl_rq_send( rq, req_id, link_id, FD_VINYL_REQ_TYPE_TEST, flags, 1UL, - 0UL, try_gaddr_gaddr, err_gaddr, oob ); WAIT; - try_live[ src_idx ] = 0; - FD_TEST( comp->err ==FD_VINYL_SUCCESS ); FD_TEST( comp->batch_cnt==(ushort)1 ); - /**/ FD_TEST( comp->quota_rem==(ushort)ref.quota_rem ); - /* Note that it is possible for a try to work in the ref version - and fail in the full version (e.g. cache line flushing, val - resizing in the background, etc). */ - if( ((!ref_err) & (!!err[0])) ) break; - FD_TEST( comp->fail_cnt ==(ushort)!!ref_err ); - FD_TEST( (!!err[0]) | try_cmp ); - break; - } - - case 38: { /* sync */ - FD_TEST( !fd_vinyl_sync( cnc ) ); - break; - } - - case 39: { /* randomly toggle data compression on and off */ - int new_style = (r & 1UL) ? 
(ulong)FD_VINYL_BSTREAM_CTL_STYLE_RAW : (ulong)FD_VINYL_BSTREAM_CTL_STYLE_LZ4; - FD_TEST( !fd_vinyl_set( cnc, FD_VINYL_OPT_STYLE, (ulong)new_style, NULL ) ); - break; - } - - default: break; - } - } - - /* Clean up */ - - for( ulong src_idx=0UL; src_idxerr ==FD_VINYL_SUCCESS ); FD_TEST( comp->batch_cnt==(ushort)1 ); - FD_TEST( comp->fail_cnt ==(ushort)!!ref_err ); FD_TEST( comp->quota_rem==(ushort)ref.quota_rem ); - FD_TEST( err[0]==(schar)ref_err ); - } - - } - - fd_rng_delete( fd_rng_leave( rng ) ); -} - -int -main( int argc, - char ** argv ) { - fd_boot( &argc, &argv ); - - if( FD_UNLIKELY( fd_tile_cnt() < 2UL ) ) FD_LOG_ERR(( "This test requires at least 2 tiles" )); - - char const * _wksp = fd_env_strip_cmdline_cstr ( &argc, &argv, "--wksp", NULL, NULL ); - char const * _page_sz = fd_env_strip_cmdline_cstr ( &argc, &argv, "--page-sz", NULL, "gigantic" ); - ulong page_cnt = fd_env_strip_cmdline_ulong( &argc, &argv, "--page-cnt", NULL, 8UL ); - ulong near_cpu = fd_env_strip_cmdline_ulong( &argc, &argv, "--near-cpu", NULL, fd_log_cpu_id() ); - ulong tag = fd_env_strip_cmdline_ulong( &argc, &argv, "--tag", NULL, 1UL ); - - ulong spad_max = fd_env_strip_cmdline_ulong( &argc, &argv, "--spad-max", NULL, fd_vinyl_io_spad_est() ); - ulong dev_sz = fd_env_strip_cmdline_ulong( &argc, &argv, "--dev-sz", NULL, 1UL << 30 ); - ulong io_seed = fd_env_strip_cmdline_ulong( &argc, &argv, "--io-seed", NULL, 1234UL ); - - ulong line_cnt = fd_env_strip_cmdline_ulong( &argc, &argv, "--line-cnt", NULL, 7UL ); - - ulong ele_max = fd_env_strip_cmdline_ulong( &argc, &argv, "--ele-max", NULL, 8UL ); - ulong lock_cnt = fd_env_strip_cmdline_ulong( &argc, &argv, "--lock_cnt", NULL, 8UL ); - ulong probe_max = ele_max; - ulong seed = fd_env_strip_cmdline_ulong( &argc, &argv, "--seed", NULL, 5678UL ); - - ulong obj_sz = fd_env_strip_cmdline_ulong( &argc, &argv, "--obj-sz", NULL, 6UL << 30 ); - - ulong async_min = fd_env_strip_cmdline_ulong( &argc, &argv, "--async-min", NULL, 5UL ); - ulong 
async_max = fd_env_strip_cmdline_ulong( &argc, &argv, "--async-max", NULL, 2UL*async_min ); - ulong part_thresh = fd_env_strip_cmdline_ulong( &argc, &argv, "--part-thresh", NULL, 64UL << 20 ); - ulong gc_thresh = fd_env_strip_cmdline_ulong( &argc, &argv, "--gc-thresh", NULL, 128UL << 20 ); - int gc_eager = fd_env_strip_cmdline_int ( &argc, &argv, "--gc-eager", NULL, 2 ); - char const * _style = fd_env_strip_cmdline_cstr ( &argc, &argv, "--style", NULL, "lz4" ); - int level = fd_env_strip_cmdline_int ( &argc, &argv, "--level", NULL, 0 ); - - ulong rq_max = fd_env_strip_cmdline_ulong( &argc, &argv, "--rq-max", NULL, 32UL ); - ulong cq_max = fd_env_strip_cmdline_ulong( &argc, &argv, "--cq-max", NULL, 32UL ); - ulong link_id = fd_env_strip_cmdline_ulong( &argc, &argv, "--link-id", NULL, 2345UL ); - ulong burst_max = fd_env_strip_cmdline_ulong( &argc, &argv, "--burst-max", NULL, 1UL ); - ulong quota_max = fd_env_strip_cmdline_ulong( &argc, &argv, "--quota-max", NULL, 2UL ); - ulong scratch_sz = fd_env_strip_cmdline_ulong( &argc, &argv, "--scratch-sz", NULL, 4096UL ); - - ulong iter_max = fd_env_strip_cmdline_ulong( &argc, &argv, "--iter-max", NULL, (ulong)1e7 ); - - int style = fd_cstr_to_vinyl_bstream_ctl_style( _style ); - - fd_wksp_t * wksp; - if( _wksp ) { - FD_LOG_NOTICE(( "Attaching to --wksp %s", _wksp )); - wksp = fd_wksp_attach( _wksp ); - } else { - FD_LOG_NOTICE(( "--wksp not specified, using an anonymous local workspace (--page-sz %s --page-cnt %lu --near-cpu %lu)", - _page_sz, page_cnt, near_cpu )); - wksp = fd_wksp_new_anonymous( fd_cstr_to_shmem_page_sz( _page_sz ), page_cnt, near_cpu, "wksp", 0UL ); - } - FD_TEST( wksp ); - - FD_LOG_NOTICE(( "Creating vinyl tile" )); - - ulong io_footprint = fd_vinyl_io_mm_footprint( spad_max ); FD_TEST( io_footprint ); - ulong dev_footprint = fd_ulong_align_dn( dev_sz, FD_VINYL_BSTREAM_BLOCK_SZ ); FD_TEST( dev_footprint ); - ulong vinyl_footprint = fd_vinyl_footprint(); FD_TEST( vinyl_footprint ); - ulong cnc_footprint 
= fd_cnc_footprint( FD_VINYL_CNC_APP_SZ ); FD_TEST( cnc_footprint ); - ulong meta_footprint = fd_vinyl_meta_footprint( ele_max, lock_cnt, probe_max ); FD_TEST( meta_footprint ); - ulong line_footprint = sizeof(fd_vinyl_line_t) * line_cnt; FD_TEST( line_footprint ); - ulong ele_footprint = sizeof(fd_vinyl_meta_ele_t) * ele_max; FD_TEST( ele_footprint ); - ulong obj_footprint = fd_ulong_align_dn( obj_sz, alignof(fd_vinyl_data_obj_t) ); FD_TEST( obj_footprint ); - ulong rq_footprint = fd_vinyl_rq_footprint( rq_max ); FD_TEST( rq_footprint ); - ulong cq_footprint = fd_vinyl_cq_footprint( cq_max ); FD_TEST( cq_footprint ); - - void * _io = fd_wksp_alloc_laddr( wksp, fd_vinyl_io_mm_align(), io_footprint, tag ); FD_TEST( _io ); - void * _dev = fd_wksp_alloc_laddr( wksp, FD_VINYL_BSTREAM_BLOCK_SZ, dev_footprint, tag ); FD_TEST( _dev ); - void * _vinyl = fd_wksp_alloc_laddr( wksp, fd_vinyl_align(), vinyl_footprint, tag ); FD_TEST( _vinyl ); - void * _cnc = fd_wksp_alloc_laddr( wksp, fd_cnc_align(), cnc_footprint, tag ); FD_TEST( _cnc ); - void * _meta = fd_wksp_alloc_laddr( wksp, fd_vinyl_meta_align(), meta_footprint, tag ); FD_TEST( _meta ); - void * _line = fd_wksp_alloc_laddr( wksp, alignof(fd_vinyl_line_t), line_footprint, tag ); FD_TEST( _line ); - void * _ele = fd_wksp_alloc_laddr( wksp, alignof(fd_vinyl_meta_ele_t), ele_footprint, tag ); FD_TEST( _ele ); - void * _obj = fd_wksp_alloc_laddr( wksp, alignof(fd_vinyl_data_obj_t), obj_footprint, tag ); FD_TEST( _obj ); - void * _rq = fd_wksp_alloc_laddr( wksp, fd_vinyl_rq_align(), rq_footprint, tag ); FD_TEST( _rq ); - void * _cq = fd_wksp_alloc_laddr( wksp, fd_vinyl_cq_align(), cq_footprint, tag ); FD_TEST( _cq ); - void * _scratch = fd_wksp_alloc_laddr( wksp, 4096UL, scratch_sz, tag ); FD_TEST( _scratch ); - - fd_vinyl_io_t * io = fd_vinyl_io_mm_init( _io, spad_max, _dev, dev_footprint, 1, "test", 5UL, io_seed ); FD_TEST( io ); - - fd_tpool_t * tpool = NULL; - - ulong thread_cnt = fd_tile_cnt(); - - if( FD_LIKELY( 
thread_cnt>1UL ) ) { - FD_LOG_NOTICE(( "Creating temporary tpool from all %lu tiles for thread parallel init", thread_cnt )); - - static uchar _tpool[ FD_TPOOL_FOOTPRINT( FD_TILE_MAX ) ] __attribute__((aligned(FD_TPOOL_ALIGN))); - - tpool = fd_tpool_init( _tpool, thread_cnt, 0UL ); /* logs details */ - if( FD_UNLIKELY( !tpool ) ) FD_LOG_ERR(( "fd_tpool_init failed" )); - - for( ulong thread_idx=1UL; thread_idx