diff --git a/book/api/metrics-generated.md b/book/api/metrics-generated.md
index aa72b7ed776..1ffbe0c75b7 100644
--- a/book/api/metrics-generated.md
+++ b/book/api/metrics-generated.md
@@ -610,6 +610,12 @@
| execle_transaction_landed
{transaction_landed="landed_failed"} | counter | Whether a transaction landed in the block or not (Transaction landed, but failed to execute) |
| execle_transaction_landed
{transaction_landed="unlanded"} | counter | Whether a transaction landed in the block or not (Transaction did not land) |
| execle_compute_units_total | counter | Estimated number of compute units executed since tile start |
+| execle_accdb_lookup_funk | counter | Number of account lookups resolved from funk (in-memory fork store) |
+| execle_accdb_lookup_specrd | counter | Number of account lookups resolved from speculative read (vinyl cache) |
+| execle_accdb_lookup_accdb | counter | Number of account lookups sent to accdb tile (vinyl rq/cq) |
+| execle_accdb_dt_funk | counter | Cumulative time spent in funk (in-memory) account lookups |
+| execle_accdb_dt_specrd | counter | Cumulative time spent in speculative read (vinyl cache) account lookups |
+| execle_accdb_dt_vinyl | counter | Cumulative time spent waiting for vinyl rq/cq account lookups |
@@ -925,15 +931,12 @@
| replay_progcache_gc_root | counter | Number of program cache entries garbage collected while rooting |
| replay_accdb_created | counter | Number of account database records created |
| replay_accdb_reverted | counter | Number of account database records reverted |
-| replay_accdb_rooted | counter | Number of account database entries rooted |
-| replay_accdb_rooted_bytes | counter | Number of bytes in account database entries rooted (including overhead) |
-| replay_accdb_gc_root | counter | Number of account database entries garbage collected |
-| replay_accdb_reclaimed | counter | Number of account database entries reclaimed (deletion rooted) |
-| replay_root_slot_duration_seconds | histogram | Time in seconds spent updating the rooted account store (one sample per block) |
-| replay_root_account_duration_seconds | histogram | Time in seconds spent updating the rooted account store (one sample per block, normalized by account count) |
-| replay_root_elapsed_seconds
{root_phase="db"} | counter | Total time in seconds spent rooting accounts (Waiting on database server) |
-| replay_root_elapsed_seconds
{root_phase="copy"} | counter | Total time in seconds spent rooting accounts (Copying account data) |
-| replay_root_elapsed_seconds
{root_phase="gc"} | counter | Total time in seconds spent rooting accounts (Garbage collecting old account data) |
+| replay_accdb_lookup_funk | counter | Number of account lookups resolved from funk (in-memory fork store) |
+| replay_accdb_lookup_specrd | counter | Number of account lookups resolved from speculative read (vinyl cache) |
+| replay_accdb_lookup_accdb | counter | Number of account lookups sent to accdb tile (vinyl rq/cq) |
+| replay_accdb_dt_funk | counter | Cumulative time spent in funk (in-memory) account lookups |
+| replay_accdb_dt_specrd | counter | Cumulative time spent in speculative read (vinyl cache) account lookups |
+| replay_accdb_dt_vinyl | counter | Cumulative time spent waiting for vinyl rq/cq account lookups |
@@ -951,6 +954,12 @@
| execrp_progcache_dup_inserts | counter | Number of time two tiles raced to insert the same cache entry |
| execrp_progcache_invalidations | counter | Number of program cache invalidations |
| execrp_accdb_created | counter | Number of account database records created |
+| execrp_accdb_lookup_funk | counter | Number of account lookups resolved from funk (in-memory fork store) |
+| execrp_accdb_lookup_specrd | counter | Number of account lookups resolved from speculative read (vinyl cache) |
+| execrp_accdb_lookup_accdb | counter | Number of account lookups sent to accdb tile (vinyl rq/cq) |
+| execrp_accdb_dt_funk | counter | Cumulative time spent in funk (in-memory) account lookups |
+| execrp_accdb_dt_specrd | counter | Cumulative time spent in speculative read (vinyl cache) account lookups |
+| execrp_accdb_dt_vinyl | counter | Cumulative time spent waiting for vinyl rq/cq account lookups |
| execrp_txn_regime
{txn_regime="setup"} | counter | Mutually exclusive and exhaustive duration of time spent in transaction execution regimes (Transaction setup) |
| execrp_txn_regime
{txn_regime="exec"} | counter | Mutually exclusive and exhaustive duration of time spent in transaction execution regimes (Transaction execution (includes VM setup/execution)) |
| execrp_txn_regime
{txn_regime="commit"} | counter | Mutually exclusive and exhaustive duration of time spent in transaction execution regimes (Transaction result commit) |
@@ -994,9 +1003,7 @@
| accdb_bstream_seq
{bstream_seq="present"} | gauge | Current bstream sequence number (Blocks between present and future are being written (write only)) |
| accdb_bstream_seq
{bstream_seq="future"} | gauge | Current bstream sequence number (Blocks between future and ancient have not been written (no read, no write)) |
| accdb_request_batches | counter | Number of request batches processed |
-| accdb_requests
{vinyl_request="acquire"} | counter | Number of requests processed (Acquire record) |
-| accdb_requests
{vinyl_request="release"} | counter | Number of requests processed (Release record) |
-| accdb_requests
{vinyl_request="erase"} | counter | Number of requests processed (Erase record) |
+| accdb_requests | counter | Number of requests processed |
| accdb_blocks
{vinyl_blocks="pair"} | counter | Number of blocks written to bstream (Record) |
| accdb_blocks
{vinyl_blocks="dead"} | counter | Number of blocks written to bstream (Record deletion) |
| accdb_blocks
{vinyl_blocks="part"} | counter | Number of blocks written to bstream (Partition/divider) |
diff --git a/src/app/firedancer-dev/commands/backtest.c b/src/app/firedancer-dev/commands/backtest.c
index 9113f1409b6..f25d01c2d8b 100644
--- a/src/app/firedancer-dev/commands/backtest.c
+++ b/src/app/firedancer-dev/commands/backtest.c
@@ -23,6 +23,7 @@
#include "../../../disco/topo/fd_topob_vinyl.h"
#include "../../../util/pod/fd_pod_format.h"
#include "../../../discof/genesis/fd_genesi_tile.h"
+#include "../../../funk/fd_funk_base.h"
#include "../../../discof/replay/fd_replay_tile.h"
#include "../../../discof/restore/fd_snapin_tile_private.h"
#include "../../../discof/restore/fd_snaplv_tile_private.h"
@@ -90,6 +91,11 @@ backtest_topo( config_t * config ) {
ulong funk_locks_obj_id; FD_TEST( (funk_locks_obj_id = fd_pod_query_ulong( topo->props, "funk_locks", ULONG_MAX ) )!=ULONG_MAX );
fd_topob_tile_uses( topo, replay_tile, &topo->objs[ funk_obj_id ], FD_SHMEM_JOIN_MODE_READ_WRITE );
fd_topob_tile_uses( topo, replay_tile, &topo->objs[ funk_locks_obj_id ], FD_SHMEM_JOIN_MODE_READ_WRITE );
+ if( vinyl_enabled ) {
+ fd_topo_tile_t * accdb_tile = &topo->tiles[ fd_topo_find_tile( topo, "accdb", 0UL ) ];
+ fd_topob_tile_uses( topo, accdb_tile, &topo->objs[ funk_obj_id ], FD_SHMEM_JOIN_MODE_READ_WRITE );
+ fd_topob_tile_uses( topo, accdb_tile, &topo->objs[ funk_locks_obj_id ], FD_SHMEM_JOIN_MODE_READ_WRITE );
+ }
fd_topob_wksp( topo, "progcache" );
setup_topo_progcache( topo, "progcache",
@@ -349,8 +355,10 @@ backtest_topo( config_t * config ) {
setup_topo_accdb_meta( topo, &config->firedancer );
ulong vinyl_map_obj_id = fd_pod_query_ulong( topo->props, "accdb.meta_map", ULONG_MAX ); FD_TEST( vinyl_map_obj_id !=ULONG_MAX );
ulong vinyl_pool_obj_id = fd_pod_query_ulong( topo->props, "accdb.meta_pool", ULONG_MAX ); FD_TEST( vinyl_pool_obj_id!=ULONG_MAX );
+ ulong vinyl_line_obj_id = fd_pod_query_ulong( topo->props, "accdb.line", ULONG_MAX ); FD_TEST( vinyl_line_obj_id!=ULONG_MAX );
fd_topo_obj_t * accdb_map_obj = &topo->objs[ vinyl_map_obj_id ];
fd_topo_obj_t * accdb_pool_obj = &topo->objs[ vinyl_pool_obj_id ];
+ fd_topo_obj_t * accdb_line_obj = &topo->objs[ vinyl_line_obj_id ];
fd_topo_obj_t * accdb_data = setup_topo_accdb_cache( topo, &config->firedancer );
@@ -359,13 +367,20 @@ backtest_topo( config_t * config ) {
fd_topob_tile_uses( topo, accdb_tile, accdb_data, FD_SHMEM_JOIN_MODE_READ_WRITE );
fd_topob_tile_uses( topo, accdb_tile, accdb_map_obj, FD_SHMEM_JOIN_MODE_READ_WRITE );
fd_topob_tile_uses( topo, accdb_tile, accdb_pool_obj, FD_SHMEM_JOIN_MODE_READ_WRITE );
+ fd_topob_tile_uses( topo, accdb_tile, accdb_line_obj, FD_SHMEM_JOIN_MODE_READ_WRITE );
- fd_topob_tile_uses( topo, replay_tile, accdb_data, FD_SHMEM_JOIN_MODE_READ_WRITE );
+ fd_topob_tile_uses( topo, replay_tile, accdb_data, FD_SHMEM_JOIN_MODE_READ_WRITE );
+ fd_topob_tile_uses( topo, replay_tile, accdb_line_obj, FD_SHMEM_JOIN_MODE_READ_WRITE );
for( ulong i=0UL; itiles[ fd_topo_find_tile( topo, "execrp", i ) ], accdb_data, FD_SHMEM_JOIN_MODE_READ_WRITE );
+ fd_topo_tile_t * t = &topo->tiles[ fd_topo_find_tile( topo, "execrp", i ) ];
+ fd_topob_tile_uses( topo, t, accdb_data, FD_SHMEM_JOIN_MODE_READ_WRITE );
+ fd_topob_tile_uses( topo, t, accdb_line_obj, FD_SHMEM_JOIN_MODE_READ_WRITE );
}
fd_topob_wksp( topo, "accdb_replay" );
+
+ fd_topob_wksp( topo, "replay_accdb" );
+ fd_topob_link( topo, "replay_accdb", "replay_accdb", 128UL, sizeof(fd_funk_txn_xid_t), 1UL );
}
/**********************************************************************/
@@ -415,6 +430,10 @@ backtest_topo( config_t * config ) {
fd_topob_wksp( topo, "replay_execrp" );
fd_topob_link( topo, "replay_execrp", "replay_execrp", 16384UL, 2240UL, 1UL );
fd_topob_tile_out( topo, "replay", 0UL, "replay_execrp", 0UL );
+ if( vinyl_enabled ) {
+ fd_topob_tile_out( topo, "replay", 0UL, "replay_accdb", 0UL );
+ fd_topob_tile_in( topo, "accdb", 0UL, "metric_in", "replay_accdb", 0UL, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED );
+ }
for( ulong i=0UL; iid ), 0, vinyl_line_footprint( topo, obj ) );
+}
+
+fd_topo_obj_callbacks_t fd_obj_cb_vinyl_line = { /* callback table for topology objects named "vinyl_line" */
+ .name = "vinyl_line",
+ .footprint = vinyl_line_footprint,
+ .align = vinyl_line_align,
+ .new = vinyl_line_new,
+};
+
/* vinyl_data: memory arena for data cache entries */
static ulong
diff --git a/src/app/firedancer/config/default.toml b/src/app/firedancer/config/default.toml
index 2acb3d2350c..1602125f88f 100644
--- a/src/app/firedancer/config/default.toml
+++ b/src/app/firedancer/config/default.toml
@@ -543,7 +543,7 @@ telemetry = true
# how much memory is reserved for such account changes.
# If the amount of inflight account changes exceeds this limit, the
# validator will crash.
- max_unrooted_account_size_gib = 16
+ max_unrooted_account_size_gib = 64
# Keep frequently accessed accounts in memory to improve
# performance.
@@ -579,7 +579,7 @@ telemetry = true
# The expected mean size of recently accessed accounts.
#
# This heuristic is used to pack accounts into caches optimally.
- mean_account_footprint = 256
+ mean_account_footprint = 64
# io_uring specific options
#
diff --git a/src/app/firedancer/config/mainnet.toml b/src/app/firedancer/config/mainnet.toml
index 7c6509b06d6..e85d0ac3fb8 100644
--- a/src/app/firedancer/config/mainnet.toml
+++ b/src/app/firedancer/config/mainnet.toml
@@ -9,8 +9,8 @@
[consensus]
expected_genesis_hash = "5eykt4UsFv8P8NJdTREpY1vzqKqZKvdpKuc147dw2N9d"
[accounts]
- file_size_gib = 600
- max_accounts = 1_100_000_000
+ file_size_gib = 700
+ max_accounts = 1_300_000_000
[snapshots]
[snapshots.sources]
servers = [ "http://solana-mainnet-rpc.jumpisolated.com:8899" ]
diff --git a/src/app/firedancer/main.c b/src/app/firedancer/main.c
index 7445f0dd0f5..40776ee742f 100644
--- a/src/app/firedancer/main.c
+++ b/src/app/firedancer/main.c
@@ -27,6 +27,7 @@ extern fd_topo_obj_callbacks_t fd_obj_cb_rnonce_ss;
extern fd_topo_obj_callbacks_t fd_obj_cb_vinyl_meta;
extern fd_topo_obj_callbacks_t fd_obj_cb_vinyl_meta_ele;
+extern fd_topo_obj_callbacks_t fd_obj_cb_vinyl_line;
extern fd_topo_obj_callbacks_t fd_obj_cb_vinyl_data;
extern fd_topo_obj_callbacks_t fd_obj_cb_vinyl_req_pool;
extern fd_topo_obj_callbacks_t fd_obj_cb_vinyl_rq;
@@ -53,6 +54,7 @@ fd_topo_obj_callbacks_t * CALLBACKS[] = {
&fd_obj_cb_acc_pool,
&fd_obj_cb_vinyl_meta,
&fd_obj_cb_vinyl_meta_ele,
+ &fd_obj_cb_vinyl_line,
&fd_obj_cb_vinyl_data,
&fd_obj_cb_acc_pool,
&fd_obj_cb_vinyl_req_pool,
diff --git a/src/app/firedancer/topology.c b/src/app/firedancer/topology.c
index 0f562d5b9b8..d3dda80850c 100644
--- a/src/app/firedancer/topology.c
+++ b/src/app/firedancer/topology.c
@@ -28,6 +28,7 @@
#include "../../discof/restore/utils/fd_ssctrl.h"
#include "../../discof/restore/utils/fd_ssmsg.h"
#include "../../flamenco/capture/fd_solcap_writer.h"
+#include "../../funk/fd_funk_base.h"
#include "../../flamenco/progcache/fd_progcache_admin.h"
#include "../../flamenco/runtime/fd_acc_pool.h"
#include "../../flamenco/accdb/fd_accdb_lineage.h"
@@ -216,8 +217,11 @@ setup_topo_accdb_meta( fd_topo_t * topo,
fd_topo_obj_t * meta_pool_obj = fd_topob_obj( topo, "vinyl_meta_e", "accdb_meta" );
fd_pod_insertf_ulong( topo->props, meta_max, "obj.%lu.cnt", meta_pool_obj->id );
+ fd_topo_obj_t * line_obj = fd_topob_obj( topo, "vinyl_line", "accdb_meta" );
+
fd_pod_insert_ulong( topo->props, "accdb.meta_map", map_obj->id );
fd_pod_insert_ulong( topo->props, "accdb.meta_pool", meta_pool_obj->id );
+ fd_pod_insert_ulong( topo->props, "accdb.line", line_obj->id );
}
fd_topo_obj_t *
@@ -617,6 +621,10 @@ fd_topo_initialize( config_t * config ) {
/**/ fd_topob_link( topo, "replay_epoch", "replay_epoch", 128UL, FD_EPOCH_OUT_MTU, 1UL ); /* TODO: This should be 2 but requires fixing STEM_BURST */
/**/ fd_topob_link( topo, "replay_out", "replay_out", 65536UL, sizeof(fd_replay_message_t), 1UL );
fd_topob_link( topo, "replay_execrp", "replay_execrp", 16384UL, sizeof(fd_execrp_task_msg_t), 1UL );
+ if( !config->firedancer.accounts.in_memory_only ) {
+ fd_topob_wksp( topo, "replay_accdb" );
+ fd_topob_link( topo, "replay_accdb", "replay_accdb", 128UL, sizeof(fd_funk_txn_xid_t), 1UL );
+ }
if( leader_enabled ) {
/**/ fd_topob_link( topo, "dedup_resolv", "dedup_resolv", 65536UL, FD_TPU_PARSED_MTU, 1UL );
FOR(resolv_tile_cnt) fd_topob_link( topo, "resolv_pack", "resolv_pack", 65536UL, FD_TPU_RESOLVED_MTU, 1UL );
@@ -748,32 +756,48 @@ fd_topo_initialize( config_t * config ) {
ulong vinyl_map_obj_id = fd_pod_query_ulong( topo->props, "accdb.meta_map", ULONG_MAX ); FD_TEST( vinyl_map_obj_id !=ULONG_MAX );
ulong vinyl_pool_obj_id = fd_pod_query_ulong( topo->props, "accdb.meta_pool", ULONG_MAX ); FD_TEST( vinyl_pool_obj_id!=ULONG_MAX );
+ ulong vinyl_line_obj_id = fd_pod_query_ulong( topo->props, "accdb.line", ULONG_MAX ); FD_TEST( vinyl_line_obj_id!=ULONG_MAX );
fd_topo_obj_t * accdb_map_obj = &topo->objs[ vinyl_map_obj_id ];
fd_topo_obj_t * accdb_pool_obj = &topo->objs[ vinyl_pool_obj_id ];
+ fd_topo_obj_t * accdb_line_obj = &topo->objs[ vinyl_line_obj_id ];
fd_topob_wksp( topo, "accdb" );
fd_topo_tile_t * accdb_tile = fd_topob_tile( topo, "accdb", "accdb", "metric_in", tile_to_cpu[ topo->tile_cnt ], 0, 0, 0 );
fd_topob_tile_uses( topo, accdb_tile, accdb_data, FD_SHMEM_JOIN_MODE_READ_WRITE );
fd_topob_tile_uses( topo, accdb_tile, accdb_map_obj, FD_SHMEM_JOIN_MODE_READ_WRITE );
fd_topob_tile_uses( topo, accdb_tile, accdb_pool_obj, FD_SHMEM_JOIN_MODE_READ_WRITE );
+ fd_topob_tile_uses( topo, accdb_tile, accdb_line_obj, FD_SHMEM_JOIN_MODE_READ_WRITE );
fd_topob_tile_uses( topo, &topo->tiles[ fd_topo_find_tile( topo, "genesi", 0UL ) ], accdb_data, FD_SHMEM_JOIN_MODE_READ_WRITE );
- fd_topob_tile_uses( topo, &topo->tiles[ fd_topo_find_tile( topo, "replay", 0UL ) ], accdb_data, FD_SHMEM_JOIN_MODE_READ_WRITE );
+ fd_topo_tile_t * replay_tile = &topo->tiles[ fd_topo_find_tile( topo, "replay", 0UL ) ];
+ fd_topob_tile_uses( topo, replay_tile, accdb_data, FD_SHMEM_JOIN_MODE_READ_WRITE );
+ fd_topob_tile_uses( topo, replay_tile, accdb_line_obj, FD_SHMEM_JOIN_MODE_READ_WRITE );
for( ulong i=0UL; itiles[ fd_topo_find_tile( topo, "execrp", i ) ], accdb_data, FD_SHMEM_JOIN_MODE_READ_WRITE );
+ fd_topo_tile_t * t = &topo->tiles[ fd_topo_find_tile( topo, "execrp", i ) ];
+ fd_topob_tile_uses( topo, t, accdb_data, FD_SHMEM_JOIN_MODE_READ_WRITE );
+ fd_topob_tile_uses( topo, t, accdb_line_obj, FD_SHMEM_JOIN_MODE_READ_WRITE );
}
for( ulong i=0UL; itiles[ fd_topo_find_tile( topo, "execle", i ) ], accdb_data, FD_SHMEM_JOIN_MODE_READ_WRITE );
+ fd_topo_tile_t * t = &topo->tiles[ fd_topo_find_tile( topo, "execle", i ) ];
+ fd_topob_tile_uses( topo, t, accdb_data, FD_SHMEM_JOIN_MODE_READ_WRITE );
+ fd_topob_tile_uses( topo, t, accdb_line_obj, FD_SHMEM_JOIN_MODE_READ_WRITE );
+ }
+ fd_topo_tile_t * tower_tile = &topo->tiles[ fd_topo_find_tile( topo, "tower", 0UL ) ];
+ fd_topob_tile_uses( topo, tower_tile, accdb_data, FD_SHMEM_JOIN_MODE_READ_ONLY );
+ fd_topob_tile_uses( topo, tower_tile, accdb_line_obj, FD_SHMEM_JOIN_MODE_READ_WRITE );
+ FOR(resolv_tile_cnt) {
+ fd_topo_tile_t * t = &topo->tiles[ fd_topo_find_tile( topo, "resolv", i ) ];
+ fd_topob_tile_uses( topo, t, accdb_data, FD_SHMEM_JOIN_MODE_READ_ONLY );
+ fd_topob_tile_uses( topo, t, accdb_line_obj, FD_SHMEM_JOIN_MODE_READ_WRITE );
}
- fd_topob_tile_uses( topo, &topo->tiles[ fd_topo_find_tile( topo, "tower", 0UL ) ], accdb_data, FD_SHMEM_JOIN_MODE_READ_ONLY );
- FOR(resolv_tile_cnt) fd_topob_tile_uses( topo, &topo->tiles[ fd_topo_find_tile( topo, "resolv", i ) ], accdb_data, FD_SHMEM_JOIN_MODE_READ_ONLY );
if( rpc_enabled ) {
- fd_topob_tile_uses( topo, &topo->tiles[ fd_topo_find_tile( topo, "rpc", 0UL ) ], accdb_data, FD_SHMEM_JOIN_MODE_READ_ONLY );
+ fd_topo_tile_t * t = &topo->tiles[ fd_topo_find_tile( topo, "rpc", 0UL ) ];
+ fd_topob_tile_uses( topo, t, accdb_data, FD_SHMEM_JOIN_MODE_READ_ONLY );
+ fd_topob_tile_uses( topo, t, accdb_line_obj, FD_SHMEM_JOIN_MODE_READ_WRITE );
}
- fd_topob_wksp( topo, "accdb_genesi" );
fd_topob_wksp( topo, "accdb_replay" );
fd_topob_wksp( topo, "accdb_execrp" );
if( leader_enabled ) fd_topob_wksp( topo, "accdb_execle" );
@@ -929,6 +953,10 @@ fd_topo_initialize( config_t * config ) {
/**/ fd_topob_tile_out( topo, "replay", 0UL, "replay_out", 0UL );
/**/ fd_topob_tile_out( topo, "replay", 0UL, "replay_epoch", 0UL );
/**/ fd_topob_tile_out( topo, "replay", 0UL, "replay_execrp", 0UL );
+ if( vinyl_enabled ) {
+ fd_topob_tile_out( topo, "replay", 0UL, "replay_accdb", 0UL );
+ fd_topob_tile_in( topo, "accdb", 0UL, "metric_in", "replay_accdb", 0UL, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED );
+ }
FOR(execrp_tile_cnt) fd_topob_tile_in ( topo, "replay", 0UL, "metric_in", "execrp_replay", i, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED );
if(leader_enabled) {fd_topob_tile_in ( topo, "replay", 0UL, "metric_in", "poh_replay", 0UL, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED );}
/**/ fd_topob_tile_in ( topo, "replay", 0UL, "metric_in", "tower_out", 0UL, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED );
@@ -1233,12 +1261,14 @@ fd_topo_initialize( config_t * config ) {
FOR(execrp_tile_cnt) fd_topob_tile_uses( topo, &topo->tiles[ fd_topo_find_tile( topo, "execrp", i ) ], funk_obj, FD_SHMEM_JOIN_MODE_READ_WRITE );
FOR(execle_tile_cnt) fd_topob_tile_uses( topo, &topo->tiles[ fd_topo_find_tile( topo, "execle", i ) ], funk_obj, FD_SHMEM_JOIN_MODE_READ_WRITE );
FOR(resolv_tile_cnt) fd_topob_tile_uses( topo, &topo->tiles[ fd_topo_find_tile( topo, "resolv", i ) ], funk_obj, FD_SHMEM_JOIN_MODE_READ_ONLY );
+ if( vinyl_enabled ) fd_topob_tile_uses( topo, &topo->tiles[ fd_topo_find_tile( topo, "accdb", 0UL ) ], funk_obj, FD_SHMEM_JOIN_MODE_READ_WRITE );
/**/ fd_topob_tile_uses( topo, &topo->tiles[ fd_topo_find_tile( topo, "replay", 0UL ) ], funk_locks_obj, FD_SHMEM_JOIN_MODE_READ_WRITE );
/**/ fd_topob_tile_uses( topo, &topo->tiles[ fd_topo_find_tile( topo, "tower", 0UL ) ], funk_locks_obj, FD_SHMEM_JOIN_MODE_READ_WRITE );
FOR(execrp_tile_cnt) fd_topob_tile_uses( topo, &topo->tiles[ fd_topo_find_tile( topo, "execrp", i ) ], funk_locks_obj, FD_SHMEM_JOIN_MODE_READ_WRITE );
FOR(execle_tile_cnt) fd_topob_tile_uses( topo, &topo->tiles[ fd_topo_find_tile( topo, "execle", i ) ], funk_locks_obj, FD_SHMEM_JOIN_MODE_READ_WRITE );
FOR(resolv_tile_cnt) fd_topob_tile_uses( topo, &topo->tiles[ fd_topo_find_tile( topo, "resolv", i ) ], funk_locks_obj, FD_SHMEM_JOIN_MODE_READ_WRITE );
+ if( vinyl_enabled ) fd_topob_tile_uses( topo, &topo->tiles[ fd_topo_find_tile( topo, "accdb", 0UL ) ], funk_locks_obj, FD_SHMEM_JOIN_MODE_READ_WRITE );
fd_topo_obj_t * banks_obj = setup_topo_banks( topo, "banks", config->firedancer.runtime.max_live_slots, config->firedancer.runtime.max_fork_width, config->development.bench.larger_max_cost_per_block );
/**/ fd_topob_tile_uses( topo, &topo->tiles[ fd_topo_find_tile( topo, "replay", 0UL ) ], banks_obj, FD_SHMEM_JOIN_MODE_READ_WRITE );
@@ -1387,7 +1417,6 @@ fd_topo_initialize( config_t * config ) {
fd_pod_insert_int( topo->props, "sandbox", config->development.sandbox ? 1 : 0 );
if( vinyl_enabled ) {
- fd_topob_vinyl_rq( topo, "genesi", 0UL, "accdb_genesi", "genesi", 4UL, 1024UL, 1024UL );
fd_topob_vinyl_rq( topo, "replay", 0UL, "accdb_replay", "replay", 4UL, 1024UL, 1024UL );
for( ulong i=0UL; iaccdb.meta_map_obj_id = fd_pod_query_ulong( config->topo.props, "accdb.meta_map", ULONG_MAX );
tile->accdb.meta_pool_obj_id = fd_pod_query_ulong( config->topo.props, "accdb.meta_pool", ULONG_MAX );
+ tile->accdb.line_obj_id = fd_pod_query_ulong( config->topo.props, "accdb.line", ULONG_MAX );
tile->accdb.line_max = (config->firedancer.accounts.cache_size_gib << 30) / config->firedancer.accounts.mean_account_footprint;
tile->accdb.data_obj_id = fd_pod_query_ulong( config->topo.props, "accdb.data", ULONG_MAX );
fd_cstr_ncpy( tile->accdb.bstream_path, config->paths.accounts, sizeof(tile->accdb.bstream_path) );
@@ -1893,6 +1923,7 @@ fd_topo_configure_tile( fd_topo_tile_t * tile,
tile->accdb.io_type = !strcmp(config->firedancer.accounts.io_provider, "io_uring") ?
FD_VINYL_IO_TYPE_UR : FD_VINYL_IO_TYPE_BD;
tile->accdb.uring_depth = config->firedancer.accounts.io_uring.queue_depth;
+ tile->accdb.write_delay_slots = config->firedancer.accounts.write_delay_slots;
/* Minimum bound for cache entry count */
ulong required_cache_entries = 0UL;
@@ -1908,6 +1939,7 @@ fd_topo_configure_tile( fd_topo_tile_t * tile,
if( FD_UNLIKELY( required_cache_entries > tile->accdb.line_max ) ) {
tile->accdb.line_max = required_cache_entries;
}
+ fd_pod_insertf_ulong( config->topo.props, tile->accdb.line_max, "obj.%lu.line_cnt", tile->accdb.line_obj_id );
} else if( FD_UNLIKELY( !strcmp( tile->name, "solcap" ) ) ) {
diff --git a/src/app/shared/commands/mem.c b/src/app/shared/commands/mem.c
index 8ac10f447f7..aad09d4323a 100644
--- a/src/app/shared/commands/mem.c
+++ b/src/app/shared/commands/mem.c
@@ -20,7 +20,7 @@ reconstruct_topo( config_t * config,
if( !topo_name[0] ) return; /* keep default action topo */
action_t const * selected = NULL;
- for( action_t ** a=ACTIONS; a; a++ ) {
+ for( action_t ** a=ACTIONS; *a; a++ ) {
action_t const * action = *a;
if( 0==strcmp( action->name, topo_name ) ) {
selected = action;
diff --git a/src/app/shared/commands/metrics.c b/src/app/shared/commands/metrics.c
index 3f175d2649c..883a689c6ec 100644
--- a/src/app/shared/commands/metrics.c
+++ b/src/app/shared/commands/metrics.c
@@ -27,7 +27,7 @@ reconstruct_topo( config_t * config,
if( !topo_name[0] ) return; /* keep default action topo */
action_t const * selected = NULL;
- for( action_t ** a=ACTIONS; a; a++ ) {
+ for( action_t ** a=ACTIONS; *a; a++ ) {
action_t const * action = *a;
if( 0==strcmp( action->name, topo_name ) ) {
selected = action;
diff --git a/src/app/shared/commands/monitor/monitor.c b/src/app/shared/commands/monitor/monitor.c
index 35c20a00b10..4582b2538f1 100644
--- a/src/app/shared/commands/monitor/monitor.c
+++ b/src/app/shared/commands/monitor/monitor.c
@@ -22,6 +22,8 @@
#include
#include "generated/monitor_seccomp.h"
+extern action_t * ACTIONS[];
+
void
monitor_cmd_args( int * pargc,
char *** pargv,
@@ -35,6 +37,12 @@ monitor_cmd_args( int * pargc,
args->monitor.with_bench = fd_env_strip_cmdline_contains( pargc, pargv, "--bench" );
args->monitor.with_sankey = fd_env_strip_cmdline_contains( pargc, pargv, "--sankey" );
+ char const * topo_name = fd_env_strip_cmdline_cstr( pargc, pargv, "--topo", NULL, "" );
+
+ ulong topo_name_len = strlen( topo_name );
+ if( FD_UNLIKELY( topo_name_len > sizeof(args->monitor.topo)-1 ) ) FD_LOG_ERR(( "Unknown --topo %s", topo_name ));
+ fd_cstr_fini( fd_cstr_append_text( fd_cstr_init( args->monitor.topo ), topo_name, topo_name_len ) );
+
if( FD_UNLIKELY( args->monitor.dt_min<0L ) ) FD_LOG_ERR(( "--dt-min should be positive" ));
if( FD_UNLIKELY( args->monitor.dt_max<args->monitor.dt_min ) ) FD_LOG_ERR(( "--dt-max should be at least --dt-min" ));
if( FD_UNLIKELY( args->monitor.duration<0L ) ) FD_LOG_ERR(( "--duration should be non-negative" ));
@@ -498,9 +506,31 @@ signal1( int sig ) {
exit( 0 ); /* gracefully exit */
}
+static void /* reconstruct_topo: run the topo callback of the action named topo_name on config */
+reconstruct_topo( config_t * config,
+ char const * topo_name ) {
+ if( !topo_name[0] ) return; /* empty name: keep default action topo */
+ /* Walk ACTIONS up to its first NULL entry, looking for an exact name match */
+ action_t const * selected = NULL;
+ for( action_t ** a=ACTIONS; *a; a++ ) {
+ action_t const * action = *a;
+ if( 0==strcmp( action->name, topo_name ) ) {
+ selected = action;
+ break;
+ }
+ }
+ /* NOTE(review): selected is dereferenced right after these checks, so FD_LOG_ERR presumably does not return -- confirm */
+ if( !selected ) FD_LOG_ERR(( "Unknown --topo %s", topo_name ));
+ if( !selected->topo ) FD_LOG_ERR(( "Cannot recover topology for --topo %s", topo_name ));
+ /* Hand config to the matched action's topo callback */
+ selected->topo( config );
+}
+
void
monitor_cmd_fn( args_t * args,
config_t * config ) {
+ reconstruct_topo( config, args->monitor.topo );
+
if( FD_UNLIKELY( args->monitor.with_bench ) ) {
add_bench_topo( &config->topo,
config->development.bench.affinity,
diff --git a/src/app/shared/commands/watch/watch.c b/src/app/shared/commands/watch/watch.c
index d9bbf6e3c20..c899a0ecf1c 100644
--- a/src/app/shared/commands/watch/watch.c
+++ b/src/app/shared/commands/watch/watch.c
@@ -428,7 +428,8 @@ write_snapshots( config_t const * config,
static uint
write_accdb( config_t const * config,
- ulong const * cur_tile ) {
+ ulong const * cur_tile,
+ ulong const * prev_tile ) {
ulong accdb_tile_idx = fd_topo_find_tile( &config->topo, "accdb", 0UL );
ulong snapwm_tile_idx = fd_topo_find_tile( &config->topo, "snapwm", 0UL );
ulong snapwr_tile_idx = fd_topo_find_tile( &config->topo, "snapwr", 0UL );
@@ -455,11 +456,25 @@ write_accdb( config_t const * config,
for( ulong i=0UL; i0L ? 100.0*(double)(lookup_funk+lookup_specrd)/(double)lookup_total : 100.0;
+
PRINT( "💾 " BOLD GREEN "ACCOUNTS...." RESET UNBOLD
" " BOLD "DATA" UNBOLD " %4.1f%% (%.1f GB) "
" " BOLD "INDEX" UNBOLD " %4.1f%% (%.1fM) "
- " " BOLD "RPS" UNBOLD " %s" CLEARLN "\n",
- data_pct, used_gb, index_pct, (double)acct_cnt/1e6, rps_str );
+ " " BOLD "RPS" UNBOLD " %s"
+ " " BOLD "CACHE" UNBOLD " %4.1f%%" CLEARLN "\n",
+ data_pct, used_gb, index_pct, (double)acct_cnt/1e6, rps_str, cache_hit_pct );
return 1;
}
@@ -720,7 +735,7 @@ write_summary( config_t const * config,
write_snapshots( config, cur_tile, prev_tile );
}
- lines_printed += write_accdb( config, cur_tile );
+ lines_printed += write_accdb( config, cur_tile, prev_tile );
lines_printed += write_gossip( config, cur_tile, prev_tile, cur_link, prev_link );
lines_printed += write_repair( config, cur_tile, cur_link, prev_link );
lines_printed += write_replay( config, cur_tile );
@@ -836,16 +851,9 @@ run( config_t const * config,
event_bytes_read_samples[ event_bytes_read_samples_idx%(sizeof(event_bytes_read_samples)/sizeof(event_bytes_read_samples[0])) ] = (ulong)diff_tile( config, "event", tiles+(1UL-last_snap)*tile_cnt*FD_METRICS_TOTAL_SZ, tiles+last_snap*tile_cnt*FD_METRICS_TOTAL_SZ, MIDX( COUNTER, EVENT, BYTES_READ ) );
event_bytes_read_samples_idx++;
rps_samples[ rps_samples_idx%(sizeof(rps_samples)/sizeof(rps_samples[0])) ] = (ulong)(
- diff_tile( config, "execrp", tiles+(1UL-last_snap)*tile_cnt*FD_METRICS_TOTAL_SZ, tiles+last_snap*tile_cnt*FD_METRICS_TOTAL_SZ, MIDX( COUNTER, EXECRP, TXN_ACCOUNT_CHANGES_UNCHANGED_NONEXIST ) ) +
- diff_tile( config, "execrp", tiles+(1UL-last_snap)*tile_cnt*FD_METRICS_TOTAL_SZ, tiles+last_snap*tile_cnt*FD_METRICS_TOTAL_SZ, MIDX( COUNTER, EXECRP, TXN_ACCOUNT_CHANGES_CREATED ) ) +
- diff_tile( config, "execrp", tiles+(1UL-last_snap)*tile_cnt*FD_METRICS_TOTAL_SZ, tiles+last_snap*tile_cnt*FD_METRICS_TOTAL_SZ, MIDX( COUNTER, EXECRP, TXN_ACCOUNT_CHANGES_DELETE ) ) +
- diff_tile( config, "execrp", tiles+(1UL-last_snap)*tile_cnt*FD_METRICS_TOTAL_SZ, tiles+last_snap*tile_cnt*FD_METRICS_TOTAL_SZ, MIDX( COUNTER, EXECRP, TXN_ACCOUNT_CHANGES_MODIFY ) ) +
- diff_tile( config, "execrp", tiles+(1UL-last_snap)*tile_cnt*FD_METRICS_TOTAL_SZ, tiles+last_snap*tile_cnt*FD_METRICS_TOTAL_SZ, MIDX( COUNTER, EXECRP, TXN_ACCOUNT_CHANGES_UNCHANGED ) ) +
- diff_tile( config, "replay", tiles+(1UL-last_snap)*tile_cnt*FD_METRICS_TOTAL_SZ, tiles+last_snap*tile_cnt*FD_METRICS_TOTAL_SZ, MIDX( COUNTER, REPLAY, ACCDB_CREATED ) ) +
- diff_tile( config, "replay", tiles+(1UL-last_snap)*tile_cnt*FD_METRICS_TOTAL_SZ, tiles+last_snap*tile_cnt*FD_METRICS_TOTAL_SZ, MIDX( COUNTER, REPLAY, ACCDB_ROOTED ) ) +
- diff_tile( config, "replay", tiles+(1UL-last_snap)*tile_cnt*FD_METRICS_TOTAL_SZ, tiles+last_snap*tile_cnt*FD_METRICS_TOTAL_SZ, MIDX( COUNTER, REPLAY, ACCDB_REVERTED ) ) +
- diff_tile( config, "replay", tiles+(1UL-last_snap)*tile_cnt*FD_METRICS_TOTAL_SZ, tiles+last_snap*tile_cnt*FD_METRICS_TOTAL_SZ, MIDX( COUNTER, REPLAY, ACCDB_GC_ROOT ) ) +
- diff_tile( config, "replay", tiles+(1UL-last_snap)*tile_cnt*FD_METRICS_TOTAL_SZ, tiles+last_snap*tile_cnt*FD_METRICS_TOTAL_SZ, MIDX( COUNTER, REPLAY, ACCDB_RECLAIMED ) ) );
+ diff_tile( config, "replay", tiles+(1UL-last_snap)*tile_cnt*FD_METRICS_TOTAL_SZ, tiles+last_snap*tile_cnt*FD_METRICS_TOTAL_SZ, MIDX( COUNTER, REPLAY, ACCDB_LOOKUP_ACCDB ) ) +
+ diff_tile( config, "execrp", tiles+(1UL-last_snap)*tile_cnt*FD_METRICS_TOTAL_SZ, tiles+last_snap*tile_cnt*FD_METRICS_TOTAL_SZ, MIDX( COUNTER, EXECRP, ACCDB_LOOKUP_ACCDB ) ) +
+ diff_tile( config, "execle", tiles+(1UL-last_snap)*tile_cnt*FD_METRICS_TOTAL_SZ, tiles+last_snap*tile_cnt*FD_METRICS_TOTAL_SZ, MIDX( COUNTER, EXECLE, ACCDB_LOOKUP_ACCDB ) ) );
rps_samples_idx++;
/* Move cursor to top of dashboard and overwrite in place.
diff --git a/src/app/shared/fd_action.h b/src/app/shared/fd_action.h
index e922c2245fe..bab0f203b8e 100644
--- a/src/app/shared/fd_action.h
+++ b/src/app/shared/fd_action.h
@@ -18,6 +18,7 @@ union fdctl_args {
int drain_output_fd;
int with_bench;
int with_sankey;
+ char topo[ 64 ];
} monitor;
struct {
diff --git a/src/disco/metrics/generated/fd_metrics_accdb.c b/src/disco/metrics/generated/fd_metrics_accdb.c
index 1253e16ec32..1da9ca3d3ad 100644
--- a/src/disco/metrics/generated/fd_metrics_accdb.c
+++ b/src/disco/metrics/generated/fd_metrics_accdb.c
@@ -22,9 +22,7 @@ const fd_metrics_meta_t FD_METRICS_ACCDB[FD_METRICS_ACCDB_TOTAL] = {
DECLARE_METRIC_ENUM( ACCDB_BSTREAM_SEQ, GAUGE, BSTREAM_SEQ, PRESENT ),
DECLARE_METRIC_ENUM( ACCDB_BSTREAM_SEQ, GAUGE, BSTREAM_SEQ, FUTURE ),
DECLARE_METRIC( ACCDB_REQUEST_BATCHES, COUNTER ),
- DECLARE_METRIC_ENUM( ACCDB_REQUESTS, COUNTER, VINYL_REQUEST, ACQUIRE ),
- DECLARE_METRIC_ENUM( ACCDB_REQUESTS, COUNTER, VINYL_REQUEST, RELEASE ),
- DECLARE_METRIC_ENUM( ACCDB_REQUESTS, COUNTER, VINYL_REQUEST, ERASE ),
+ DECLARE_METRIC( ACCDB_REQUESTS, COUNTER ),
DECLARE_METRIC_ENUM( ACCDB_BLOCKS, COUNTER, VINYL_BLOCKS, PAIR ),
DECLARE_METRIC_ENUM( ACCDB_BLOCKS, COUNTER, VINYL_BLOCKS, DEAD ),
DECLARE_METRIC_ENUM( ACCDB_BLOCKS, COUNTER, VINYL_BLOCKS, PART ),
diff --git a/src/disco/metrics/generated/fd_metrics_accdb.h b/src/disco/metrics/generated/fd_metrics_accdb.h
index 77aa3bd188d..31ae2985c9a 100644
--- a/src/disco/metrics/generated/fd_metrics_accdb.h
+++ b/src/disco/metrics/generated/fd_metrics_accdb.h
@@ -91,42 +91,37 @@
#define FD_METRICS_COUNTER_ACCDB_REQUESTS_TYPE (FD_METRICS_TYPE_COUNTER)
#define FD_METRICS_COUNTER_ACCDB_REQUESTS_DESC "Number of requests processed"
#define FD_METRICS_COUNTER_ACCDB_REQUESTS_CVT (FD_METRICS_CONVERTER_NONE)
-#define FD_METRICS_COUNTER_ACCDB_REQUESTS_CNT (3UL)
-#define FD_METRICS_COUNTER_ACCDB_REQUESTS_ACQUIRE_OFF (43UL)
-#define FD_METRICS_COUNTER_ACCDB_REQUESTS_RELEASE_OFF (44UL)
-#define FD_METRICS_COUNTER_ACCDB_REQUESTS_ERASE_OFF (45UL)
-
-#define FD_METRICS_COUNTER_ACCDB_BLOCKS_OFF (46UL)
+#define FD_METRICS_COUNTER_ACCDB_BLOCKS_OFF (44UL)
#define FD_METRICS_COUNTER_ACCDB_BLOCKS_NAME "accdb_blocks"
#define FD_METRICS_COUNTER_ACCDB_BLOCKS_TYPE (FD_METRICS_TYPE_COUNTER)
#define FD_METRICS_COUNTER_ACCDB_BLOCKS_DESC "Number of blocks written to bstream"
#define FD_METRICS_COUNTER_ACCDB_BLOCKS_CVT (FD_METRICS_CONVERTER_NONE)
#define FD_METRICS_COUNTER_ACCDB_BLOCKS_CNT (3UL)
-#define FD_METRICS_COUNTER_ACCDB_BLOCKS_PAIR_OFF (46UL)
-#define FD_METRICS_COUNTER_ACCDB_BLOCKS_DEAD_OFF (47UL)
-#define FD_METRICS_COUNTER_ACCDB_BLOCKS_PART_OFF (48UL)
+#define FD_METRICS_COUNTER_ACCDB_BLOCKS_PAIR_OFF (44UL)
+#define FD_METRICS_COUNTER_ACCDB_BLOCKS_DEAD_OFF (45UL)
+#define FD_METRICS_COUNTER_ACCDB_BLOCKS_PART_OFF (46UL)
-#define FD_METRICS_GAUGE_ACCDB_GARBAGE_BYTES_OFF (49UL)
+#define FD_METRICS_GAUGE_ACCDB_GARBAGE_BYTES_OFF (47UL)
#define FD_METRICS_GAUGE_ACCDB_GARBAGE_BYTES_NAME "accdb_garbage_bytes"
#define FD_METRICS_GAUGE_ACCDB_GARBAGE_BYTES_TYPE (FD_METRICS_TYPE_GAUGE)
#define FD_METRICS_GAUGE_ACCDB_GARBAGE_BYTES_DESC ""
#define FD_METRICS_GAUGE_ACCDB_GARBAGE_BYTES_CVT (FD_METRICS_CONVERTER_NONE)
-#define FD_METRICS_COUNTER_ACCDB_CUM_GC_BYTES_OFF (50UL)
+#define FD_METRICS_COUNTER_ACCDB_CUM_GC_BYTES_OFF (48UL)
#define FD_METRICS_COUNTER_ACCDB_CUM_GC_BYTES_NAME "accdb_cum_gc_bytes"
#define FD_METRICS_COUNTER_ACCDB_CUM_GC_BYTES_TYPE (FD_METRICS_TYPE_COUNTER)
#define FD_METRICS_COUNTER_ACCDB_CUM_GC_BYTES_DESC "Total number of record bytes that were garbage collected"
#define FD_METRICS_COUNTER_ACCDB_CUM_GC_BYTES_CVT (FD_METRICS_CONVERTER_NONE)
-#define FD_METRICS_GAUGE_ACCDB_ACCOUNT_INDEX_REMAINING_FREE_OFF (51UL)
+#define FD_METRICS_GAUGE_ACCDB_ACCOUNT_INDEX_REMAINING_FREE_OFF (49UL)
#define FD_METRICS_GAUGE_ACCDB_ACCOUNT_INDEX_REMAINING_FREE_NAME "accdb_account_index_remaining_free"
#define FD_METRICS_GAUGE_ACCDB_ACCOUNT_INDEX_REMAINING_FREE_TYPE (FD_METRICS_TYPE_GAUGE)
#define FD_METRICS_GAUGE_ACCDB_ACCOUNT_INDEX_REMAINING_FREE_DESC "Remaining free slots in the account database index (validator crashes when this number reaches zero)"
#define FD_METRICS_GAUGE_ACCDB_ACCOUNT_INDEX_REMAINING_FREE_CVT (FD_METRICS_CONVERTER_NONE)
-#define FD_METRICS_ACCDB_TOTAL (29UL)
+#define FD_METRICS_ACCDB_TOTAL (27UL)
extern const fd_metrics_meta_t FD_METRICS_ACCDB[FD_METRICS_ACCDB_TOTAL];
#endif /* HEADER_fd_src_disco_metrics_generated_fd_metrics_accdb_h */
diff --git a/src/disco/metrics/generated/fd_metrics_enums.h b/src/disco/metrics/generated/fd_metrics_enums.h
index d5a3faab4d9..e095e990f1b 100644
--- a/src/disco/metrics/generated/fd_metrics_enums.h
+++ b/src/disco/metrics/generated/fd_metrics_enums.h
@@ -828,15 +828,6 @@
#define FD_METRICS_ENUM_ACCOUNT_CHANGE_V_UNCHANGED_IDX 4
#define FD_METRICS_ENUM_ACCOUNT_CHANGE_V_UNCHANGED_NAME "unchanged"
-#define FD_METRICS_ENUM_VINYL_REQUEST_NAME "vinyl_request"
-#define FD_METRICS_ENUM_VINYL_REQUEST_CNT (3UL)
-#define FD_METRICS_ENUM_VINYL_REQUEST_V_ACQUIRE_IDX 0
-#define FD_METRICS_ENUM_VINYL_REQUEST_V_ACQUIRE_NAME "acquire"
-#define FD_METRICS_ENUM_VINYL_REQUEST_V_RELEASE_IDX 1
-#define FD_METRICS_ENUM_VINYL_REQUEST_V_RELEASE_NAME "release"
-#define FD_METRICS_ENUM_VINYL_REQUEST_V_ERASE_IDX 2
-#define FD_METRICS_ENUM_VINYL_REQUEST_V_ERASE_NAME "erase"
-
#define FD_METRICS_ENUM_VINYL_BLOCKS_NAME "vinyl_blocks"
#define FD_METRICS_ENUM_VINYL_BLOCKS_CNT (3UL)
#define FD_METRICS_ENUM_VINYL_BLOCKS_V_PAIR_IDX 0
diff --git a/src/disco/metrics/generated/fd_metrics_execle.c b/src/disco/metrics/generated/fd_metrics_execle.c
index e499ff0e54e..004954ac7f3 100644
--- a/src/disco/metrics/generated/fd_metrics_execle.c
+++ b/src/disco/metrics/generated/fd_metrics_execle.c
@@ -33,4 +33,10 @@ const fd_metrics_meta_t FD_METRICS_EXECLE[FD_METRICS_EXECLE_TOTAL] = {
DECLARE_METRIC_ENUM( EXECLE_TRANSACTION_LANDED, COUNTER, TRANSACTION_LANDED, LANDED_FAILED ),
DECLARE_METRIC_ENUM( EXECLE_TRANSACTION_LANDED, COUNTER, TRANSACTION_LANDED, UNLANDED ),
DECLARE_METRIC( EXECLE_COMPUTE_UNITS_TOTAL, COUNTER ),
+ DECLARE_METRIC( EXECLE_ACCDB_LOOKUP_FUNK, COUNTER ),
+ DECLARE_METRIC( EXECLE_ACCDB_LOOKUP_SPECRD, COUNTER ),
+ DECLARE_METRIC( EXECLE_ACCDB_LOOKUP_ACCDB, COUNTER ),
+ DECLARE_METRIC( EXECLE_ACCDB_DT_FUNK, COUNTER ),
+ DECLARE_METRIC( EXECLE_ACCDB_DT_SPECRD, COUNTER ),
+ DECLARE_METRIC( EXECLE_ACCDB_DT_VINYL, COUNTER ),
};
diff --git a/src/disco/metrics/generated/fd_metrics_execle.h b/src/disco/metrics/generated/fd_metrics_execle.h
index 12f5692cc1d..b9a0ff7194e 100644
--- a/src/disco/metrics/generated/fd_metrics_execle.h
+++ b/src/disco/metrics/generated/fd_metrics_execle.h
@@ -58,7 +58,43 @@
#define FD_METRICS_COUNTER_EXECLE_COMPUTE_UNITS_TOTAL_DESC "Estimated number of compute units executed since tile start"
#define FD_METRICS_COUNTER_EXECLE_COMPUTE_UNITS_TOTAL_CVT (FD_METRICS_CONVERTER_NONE)
-#define FD_METRICS_EXECLE_TOTAL (31UL)
+#define FD_METRICS_COUNTER_EXECLE_ACCDB_LOOKUP_FUNK_OFF (54UL)
+#define FD_METRICS_COUNTER_EXECLE_ACCDB_LOOKUP_FUNK_NAME "execle_accdb_lookup_funk"
+#define FD_METRICS_COUNTER_EXECLE_ACCDB_LOOKUP_FUNK_TYPE (FD_METRICS_TYPE_COUNTER)
+#define FD_METRICS_COUNTER_EXECLE_ACCDB_LOOKUP_FUNK_DESC "Number of account lookups resolved from funk (in-memory fork store)"
+#define FD_METRICS_COUNTER_EXECLE_ACCDB_LOOKUP_FUNK_CVT (FD_METRICS_CONVERTER_NONE)
+
+#define FD_METRICS_COUNTER_EXECLE_ACCDB_LOOKUP_SPECRD_OFF (55UL)
+#define FD_METRICS_COUNTER_EXECLE_ACCDB_LOOKUP_SPECRD_NAME "execle_accdb_lookup_specrd"
+#define FD_METRICS_COUNTER_EXECLE_ACCDB_LOOKUP_SPECRD_TYPE (FD_METRICS_TYPE_COUNTER)
+#define FD_METRICS_COUNTER_EXECLE_ACCDB_LOOKUP_SPECRD_DESC "Number of account lookups resolved from speculative read (vinyl cache)"
+#define FD_METRICS_COUNTER_EXECLE_ACCDB_LOOKUP_SPECRD_CVT (FD_METRICS_CONVERTER_NONE)
+
+#define FD_METRICS_COUNTER_EXECLE_ACCDB_LOOKUP_ACCDB_OFF (56UL)
+#define FD_METRICS_COUNTER_EXECLE_ACCDB_LOOKUP_ACCDB_NAME "execle_accdb_lookup_accdb"
+#define FD_METRICS_COUNTER_EXECLE_ACCDB_LOOKUP_ACCDB_TYPE (FD_METRICS_TYPE_COUNTER)
+#define FD_METRICS_COUNTER_EXECLE_ACCDB_LOOKUP_ACCDB_DESC "Number of account lookups sent to accdb tile (vinyl rq/cq)"
+#define FD_METRICS_COUNTER_EXECLE_ACCDB_LOOKUP_ACCDB_CVT (FD_METRICS_CONVERTER_NONE)
+
+#define FD_METRICS_COUNTER_EXECLE_ACCDB_DT_FUNK_OFF (57UL)
+#define FD_METRICS_COUNTER_EXECLE_ACCDB_DT_FUNK_NAME "execle_accdb_dt_funk"
+#define FD_METRICS_COUNTER_EXECLE_ACCDB_DT_FUNK_TYPE (FD_METRICS_TYPE_COUNTER)
+#define FD_METRICS_COUNTER_EXECLE_ACCDB_DT_FUNK_DESC "Cumulative time spent in funk (in-memory) account lookups"
+#define FD_METRICS_COUNTER_EXECLE_ACCDB_DT_FUNK_CVT (FD_METRICS_CONVERTER_SECONDS)
+
+#define FD_METRICS_COUNTER_EXECLE_ACCDB_DT_SPECRD_OFF (58UL)
+#define FD_METRICS_COUNTER_EXECLE_ACCDB_DT_SPECRD_NAME "execle_accdb_dt_specrd"
+#define FD_METRICS_COUNTER_EXECLE_ACCDB_DT_SPECRD_TYPE (FD_METRICS_TYPE_COUNTER)
+#define FD_METRICS_COUNTER_EXECLE_ACCDB_DT_SPECRD_DESC "Cumulative time spent in speculative read (vinyl cache) account lookups"
+#define FD_METRICS_COUNTER_EXECLE_ACCDB_DT_SPECRD_CVT (FD_METRICS_CONVERTER_SECONDS)
+
+#define FD_METRICS_COUNTER_EXECLE_ACCDB_DT_VINYL_OFF (59UL)
+#define FD_METRICS_COUNTER_EXECLE_ACCDB_DT_VINYL_NAME "execle_accdb_dt_vinyl"
+#define FD_METRICS_COUNTER_EXECLE_ACCDB_DT_VINYL_TYPE (FD_METRICS_TYPE_COUNTER)
+#define FD_METRICS_COUNTER_EXECLE_ACCDB_DT_VINYL_DESC "Cumulative time spent waiting for vinyl rq/cq account lookups"
+#define FD_METRICS_COUNTER_EXECLE_ACCDB_DT_VINYL_CVT (FD_METRICS_CONVERTER_SECONDS)
+
+#define FD_METRICS_EXECLE_TOTAL (37UL)
extern const fd_metrics_meta_t FD_METRICS_EXECLE[FD_METRICS_EXECLE_TOTAL];
#endif /* HEADER_fd_src_disco_metrics_generated_fd_metrics_execle_h */
diff --git a/src/disco/metrics/generated/fd_metrics_execrp.c b/src/disco/metrics/generated/fd_metrics_execrp.c
index 7da8fbd6e99..b8c9ab1d211 100644
--- a/src/disco/metrics/generated/fd_metrics_execrp.c
+++ b/src/disco/metrics/generated/fd_metrics_execrp.c
@@ -10,6 +10,12 @@ const fd_metrics_meta_t FD_METRICS_EXECRP[FD_METRICS_EXECRP_TOTAL] = {
DECLARE_METRIC( EXECRP_PROGCACHE_DUP_INSERTS, COUNTER ),
DECLARE_METRIC( EXECRP_PROGCACHE_INVALIDATIONS, COUNTER ),
DECLARE_METRIC( EXECRP_ACCDB_CREATED, COUNTER ),
+ DECLARE_METRIC( EXECRP_ACCDB_LOOKUP_FUNK, COUNTER ),
+ DECLARE_METRIC( EXECRP_ACCDB_LOOKUP_SPECRD, COUNTER ),
+ DECLARE_METRIC( EXECRP_ACCDB_LOOKUP_ACCDB, COUNTER ),
+ DECLARE_METRIC( EXECRP_ACCDB_DT_FUNK, COUNTER ),
+ DECLARE_METRIC( EXECRP_ACCDB_DT_SPECRD, COUNTER ),
+ DECLARE_METRIC( EXECRP_ACCDB_DT_VINYL, COUNTER ),
DECLARE_METRIC_ENUM( EXECRP_TXN_REGIME, COUNTER, TXN_REGIME, SETUP ),
DECLARE_METRIC_ENUM( EXECRP_TXN_REGIME, COUNTER, TXN_REGIME, EXEC ),
DECLARE_METRIC_ENUM( EXECRP_TXN_REGIME, COUNTER, TXN_REGIME, COMMIT ),
diff --git a/src/disco/metrics/generated/fd_metrics_execrp.h b/src/disco/metrics/generated/fd_metrics_execrp.h
index 4d454832b1e..1eb0dcdf01b 100644
--- a/src/disco/metrics/generated/fd_metrics_execrp.h
+++ b/src/disco/metrics/generated/fd_metrics_execrp.h
@@ -54,50 +54,86 @@
#define FD_METRICS_COUNTER_EXECRP_ACCDB_CREATED_DESC "Number of account database records created"
#define FD_METRICS_COUNTER_EXECRP_ACCDB_CREATED_CVT (FD_METRICS_CONVERTER_NONE)
-#define FD_METRICS_COUNTER_EXECRP_TXN_REGIME_OFF (31UL)
+#define FD_METRICS_COUNTER_EXECRP_ACCDB_LOOKUP_FUNK_OFF (31UL)
+#define FD_METRICS_COUNTER_EXECRP_ACCDB_LOOKUP_FUNK_NAME "execrp_accdb_lookup_funk"
+#define FD_METRICS_COUNTER_EXECRP_ACCDB_LOOKUP_FUNK_TYPE (FD_METRICS_TYPE_COUNTER)
+#define FD_METRICS_COUNTER_EXECRP_ACCDB_LOOKUP_FUNK_DESC "Number of account lookups resolved from funk (in-memory fork store)"
+#define FD_METRICS_COUNTER_EXECRP_ACCDB_LOOKUP_FUNK_CVT (FD_METRICS_CONVERTER_NONE)
+
+#define FD_METRICS_COUNTER_EXECRP_ACCDB_LOOKUP_SPECRD_OFF (32UL)
+#define FD_METRICS_COUNTER_EXECRP_ACCDB_LOOKUP_SPECRD_NAME "execrp_accdb_lookup_specrd"
+#define FD_METRICS_COUNTER_EXECRP_ACCDB_LOOKUP_SPECRD_TYPE (FD_METRICS_TYPE_COUNTER)
+#define FD_METRICS_COUNTER_EXECRP_ACCDB_LOOKUP_SPECRD_DESC "Number of account lookups resolved from speculative read (vinyl cache)"
+#define FD_METRICS_COUNTER_EXECRP_ACCDB_LOOKUP_SPECRD_CVT (FD_METRICS_CONVERTER_NONE)
+
+#define FD_METRICS_COUNTER_EXECRP_ACCDB_LOOKUP_ACCDB_OFF (33UL)
+#define FD_METRICS_COUNTER_EXECRP_ACCDB_LOOKUP_ACCDB_NAME "execrp_accdb_lookup_accdb"
+#define FD_METRICS_COUNTER_EXECRP_ACCDB_LOOKUP_ACCDB_TYPE (FD_METRICS_TYPE_COUNTER)
+#define FD_METRICS_COUNTER_EXECRP_ACCDB_LOOKUP_ACCDB_DESC "Number of account lookups sent to accdb tile (vinyl rq/cq)"
+#define FD_METRICS_COUNTER_EXECRP_ACCDB_LOOKUP_ACCDB_CVT (FD_METRICS_CONVERTER_NONE)
+
+#define FD_METRICS_COUNTER_EXECRP_ACCDB_DT_FUNK_OFF (34UL)
+#define FD_METRICS_COUNTER_EXECRP_ACCDB_DT_FUNK_NAME "execrp_accdb_dt_funk"
+#define FD_METRICS_COUNTER_EXECRP_ACCDB_DT_FUNK_TYPE (FD_METRICS_TYPE_COUNTER)
+#define FD_METRICS_COUNTER_EXECRP_ACCDB_DT_FUNK_DESC "Cumulative time spent in funk (in-memory) account lookups"
+#define FD_METRICS_COUNTER_EXECRP_ACCDB_DT_FUNK_CVT (FD_METRICS_CONVERTER_SECONDS)
+
+#define FD_METRICS_COUNTER_EXECRP_ACCDB_DT_SPECRD_OFF (35UL)
+#define FD_METRICS_COUNTER_EXECRP_ACCDB_DT_SPECRD_NAME "execrp_accdb_dt_specrd"
+#define FD_METRICS_COUNTER_EXECRP_ACCDB_DT_SPECRD_TYPE (FD_METRICS_TYPE_COUNTER)
+#define FD_METRICS_COUNTER_EXECRP_ACCDB_DT_SPECRD_DESC "Cumulative time spent in speculative read (vinyl cache) account lookups"
+#define FD_METRICS_COUNTER_EXECRP_ACCDB_DT_SPECRD_CVT (FD_METRICS_CONVERTER_SECONDS)
+
+#define FD_METRICS_COUNTER_EXECRP_ACCDB_DT_VINYL_OFF (36UL)
+#define FD_METRICS_COUNTER_EXECRP_ACCDB_DT_VINYL_NAME "execrp_accdb_dt_vinyl"
+#define FD_METRICS_COUNTER_EXECRP_ACCDB_DT_VINYL_TYPE (FD_METRICS_TYPE_COUNTER)
+#define FD_METRICS_COUNTER_EXECRP_ACCDB_DT_VINYL_DESC "Cumulative time spent waiting for vinyl rq/cq account lookups"
+#define FD_METRICS_COUNTER_EXECRP_ACCDB_DT_VINYL_CVT (FD_METRICS_CONVERTER_SECONDS)
+
+#define FD_METRICS_COUNTER_EXECRP_TXN_REGIME_OFF (37UL)
#define FD_METRICS_COUNTER_EXECRP_TXN_REGIME_NAME "execrp_txn_regime"
#define FD_METRICS_COUNTER_EXECRP_TXN_REGIME_TYPE (FD_METRICS_TYPE_COUNTER)
#define FD_METRICS_COUNTER_EXECRP_TXN_REGIME_DESC "Mutually exclusive and exhaustive duration of time spent in transaction execution regimes"
-#define FD_METRICS_COUNTER_EXECRP_TXN_REGIME_CVT (FD_METRICS_CONVERTER_NANOSECONDS)
+#define FD_METRICS_COUNTER_EXECRP_TXN_REGIME_CVT (FD_METRICS_CONVERTER_SECONDS)
#define FD_METRICS_COUNTER_EXECRP_TXN_REGIME_CNT (3UL)
-#define FD_METRICS_COUNTER_EXECRP_TXN_REGIME_SETUP_OFF (31UL)
-#define FD_METRICS_COUNTER_EXECRP_TXN_REGIME_EXEC_OFF (32UL)
-#define FD_METRICS_COUNTER_EXECRP_TXN_REGIME_COMMIT_OFF (33UL)
+#define FD_METRICS_COUNTER_EXECRP_TXN_REGIME_SETUP_OFF (37UL)
+#define FD_METRICS_COUNTER_EXECRP_TXN_REGIME_EXEC_OFF (38UL)
+#define FD_METRICS_COUNTER_EXECRP_TXN_REGIME_COMMIT_OFF (39UL)
-#define FD_METRICS_COUNTER_EXECRP_VM_REGIME_OFF (34UL)
+#define FD_METRICS_COUNTER_EXECRP_VM_REGIME_OFF (40UL)
#define FD_METRICS_COUNTER_EXECRP_VM_REGIME_NAME "execrp_vm_regime"
#define FD_METRICS_COUNTER_EXECRP_VM_REGIME_TYPE (FD_METRICS_TYPE_COUNTER)
#define FD_METRICS_COUNTER_EXECRP_VM_REGIME_DESC "Mutually exclusive and exhaustive duration of time spent in virtual machine execution regimes"
-#define FD_METRICS_COUNTER_EXECRP_VM_REGIME_CVT (FD_METRICS_CONVERTER_NANOSECONDS)
+#define FD_METRICS_COUNTER_EXECRP_VM_REGIME_CVT (FD_METRICS_CONVERTER_SECONDS)
#define FD_METRICS_COUNTER_EXECRP_VM_REGIME_CNT (5UL)
-#define FD_METRICS_COUNTER_EXECRP_VM_REGIME_SETUP_OFF (34UL)
-#define FD_METRICS_COUNTER_EXECRP_VM_REGIME_COMMIT_OFF (35UL)
-#define FD_METRICS_COUNTER_EXECRP_VM_REGIME_SETUP_CPI_OFF (36UL)
-#define FD_METRICS_COUNTER_EXECRP_VM_REGIME_COMMIT_CPI_OFF (37UL)
-#define FD_METRICS_COUNTER_EXECRP_VM_REGIME_INTERPRETER_OFF (38UL)
+#define FD_METRICS_COUNTER_EXECRP_VM_REGIME_SETUP_OFF (40UL)
+#define FD_METRICS_COUNTER_EXECRP_VM_REGIME_COMMIT_OFF (41UL)
+#define FD_METRICS_COUNTER_EXECRP_VM_REGIME_SETUP_CPI_OFF (42UL)
+#define FD_METRICS_COUNTER_EXECRP_VM_REGIME_COMMIT_CPI_OFF (43UL)
+#define FD_METRICS_COUNTER_EXECRP_VM_REGIME_INTERPRETER_OFF (44UL)
-#define FD_METRICS_COUNTER_EXECRP_TXN_ACCOUNT_CHANGES_OFF (39UL)
+#define FD_METRICS_COUNTER_EXECRP_TXN_ACCOUNT_CHANGES_OFF (45UL)
#define FD_METRICS_COUNTER_EXECRP_TXN_ACCOUNT_CHANGES_NAME "execrp_txn_account_changes"
#define FD_METRICS_COUNTER_EXECRP_TXN_ACCOUNT_CHANGES_TYPE (FD_METRICS_TYPE_COUNTER)
#define FD_METRICS_COUNTER_EXECRP_TXN_ACCOUNT_CHANGES_DESC "Transaction account change event counters"
#define FD_METRICS_COUNTER_EXECRP_TXN_ACCOUNT_CHANGES_CVT (FD_METRICS_CONVERTER_NONE)
#define FD_METRICS_COUNTER_EXECRP_TXN_ACCOUNT_CHANGES_CNT (5UL)
-#define FD_METRICS_COUNTER_EXECRP_TXN_ACCOUNT_CHANGES_UNCHANGED_NONEXIST_OFF (39UL)
-#define FD_METRICS_COUNTER_EXECRP_TXN_ACCOUNT_CHANGES_CREATED_OFF (40UL)
-#define FD_METRICS_COUNTER_EXECRP_TXN_ACCOUNT_CHANGES_DELETE_OFF (41UL)
-#define FD_METRICS_COUNTER_EXECRP_TXN_ACCOUNT_CHANGES_MODIFY_OFF (42UL)
-#define FD_METRICS_COUNTER_EXECRP_TXN_ACCOUNT_CHANGES_UNCHANGED_OFF (43UL)
+#define FD_METRICS_COUNTER_EXECRP_TXN_ACCOUNT_CHANGES_UNCHANGED_NONEXIST_OFF (45UL)
+#define FD_METRICS_COUNTER_EXECRP_TXN_ACCOUNT_CHANGES_CREATED_OFF (46UL)
+#define FD_METRICS_COUNTER_EXECRP_TXN_ACCOUNT_CHANGES_DELETE_OFF (47UL)
+#define FD_METRICS_COUNTER_EXECRP_TXN_ACCOUNT_CHANGES_MODIFY_OFF (48UL)
+#define FD_METRICS_COUNTER_EXECRP_TXN_ACCOUNT_CHANGES_UNCHANGED_OFF (49UL)
-#define FD_METRICS_COUNTER_EXECRP_COMPUTE_UNITS_TOTAL_OFF (44UL)
+#define FD_METRICS_COUNTER_EXECRP_COMPUTE_UNITS_TOTAL_OFF (50UL)
#define FD_METRICS_COUNTER_EXECRP_COMPUTE_UNITS_TOTAL_NAME "execrp_compute_units_total"
#define FD_METRICS_COUNTER_EXECRP_COMPUTE_UNITS_TOTAL_TYPE (FD_METRICS_TYPE_COUNTER)
#define FD_METRICS_COUNTER_EXECRP_COMPUTE_UNITS_TOTAL_DESC "Estimated number of compute units executed since tile start"
#define FD_METRICS_COUNTER_EXECRP_COMPUTE_UNITS_TOTAL_CVT (FD_METRICS_CONVERTER_NONE)
-#define FD_METRICS_EXECRP_TOTAL (22UL)
+#define FD_METRICS_EXECRP_TOTAL (28UL)
extern const fd_metrics_meta_t FD_METRICS_EXECRP[FD_METRICS_EXECRP_TOTAL];
#endif /* HEADER_fd_src_disco_metrics_generated_fd_metrics_execrp_h */
diff --git a/src/disco/metrics/generated/fd_metrics_replay.c b/src/disco/metrics/generated/fd_metrics_replay.c
index a1e565b97de..0dcede771c4 100644
--- a/src/disco/metrics/generated/fd_metrics_replay.c
+++ b/src/disco/metrics/generated/fd_metrics_replay.c
@@ -31,13 +31,10 @@ const fd_metrics_meta_t FD_METRICS_REPLAY[FD_METRICS_REPLAY_TOTAL] = {
DECLARE_METRIC( REPLAY_PROGCACHE_GC_ROOT, COUNTER ),
DECLARE_METRIC( REPLAY_ACCDB_CREATED, COUNTER ),
DECLARE_METRIC( REPLAY_ACCDB_REVERTED, COUNTER ),
- DECLARE_METRIC( REPLAY_ACCDB_ROOTED, COUNTER ),
- DECLARE_METRIC( REPLAY_ACCDB_ROOTED_BYTES, COUNTER ),
- DECLARE_METRIC( REPLAY_ACCDB_GC_ROOT, COUNTER ),
- DECLARE_METRIC( REPLAY_ACCDB_RECLAIMED, COUNTER ),
- DECLARE_METRIC_HISTOGRAM_SECONDS( REPLAY_ROOT_SLOT_DURATION_SECONDS ),
- DECLARE_METRIC_HISTOGRAM_SECONDS( REPLAY_ROOT_ACCOUNT_DURATION_SECONDS ),
- DECLARE_METRIC_ENUM( REPLAY_ROOT_ELAPSED_SECONDS, COUNTER, ROOT_PHASE, DB ),
- DECLARE_METRIC_ENUM( REPLAY_ROOT_ELAPSED_SECONDS, COUNTER, ROOT_PHASE, COPY ),
- DECLARE_METRIC_ENUM( REPLAY_ROOT_ELAPSED_SECONDS, COUNTER, ROOT_PHASE, GC ),
+ DECLARE_METRIC( REPLAY_ACCDB_LOOKUP_FUNK, COUNTER ),
+ DECLARE_METRIC( REPLAY_ACCDB_LOOKUP_SPECRD, COUNTER ),
+ DECLARE_METRIC( REPLAY_ACCDB_LOOKUP_ACCDB, COUNTER ),
+ DECLARE_METRIC( REPLAY_ACCDB_DT_FUNK, COUNTER ),
+ DECLARE_METRIC( REPLAY_ACCDB_DT_SPECRD, COUNTER ),
+ DECLARE_METRIC( REPLAY_ACCDB_DT_VINYL, COUNTER ),
};
diff --git a/src/disco/metrics/generated/fd_metrics_replay.h b/src/disco/metrics/generated/fd_metrics_replay.h
index 150b5e03342..acb80fddd9c 100644
--- a/src/disco/metrics/generated/fd_metrics_replay.h
+++ b/src/disco/metrics/generated/fd_metrics_replay.h
@@ -184,58 +184,43 @@
#define FD_METRICS_COUNTER_REPLAY_ACCDB_REVERTED_DESC "Number of account database records reverted"
#define FD_METRICS_COUNTER_REPLAY_ACCDB_REVERTED_CVT (FD_METRICS_CONVERTER_NONE)
-#define FD_METRICS_COUNTER_REPLAY_ACCDB_ROOTED_OFF (84UL)
-#define FD_METRICS_COUNTER_REPLAY_ACCDB_ROOTED_NAME "replay_accdb_rooted"
-#define FD_METRICS_COUNTER_REPLAY_ACCDB_ROOTED_TYPE (FD_METRICS_TYPE_COUNTER)
-#define FD_METRICS_COUNTER_REPLAY_ACCDB_ROOTED_DESC "Number of account database entries rooted"
-#define FD_METRICS_COUNTER_REPLAY_ACCDB_ROOTED_CVT (FD_METRICS_CONVERTER_NONE)
-
-#define FD_METRICS_COUNTER_REPLAY_ACCDB_ROOTED_BYTES_OFF (85UL)
-#define FD_METRICS_COUNTER_REPLAY_ACCDB_ROOTED_BYTES_NAME "replay_accdb_rooted_bytes"
-#define FD_METRICS_COUNTER_REPLAY_ACCDB_ROOTED_BYTES_TYPE (FD_METRICS_TYPE_COUNTER)
-#define FD_METRICS_COUNTER_REPLAY_ACCDB_ROOTED_BYTES_DESC "Number of bytes in account database entries rooted (including overhead)"
-#define FD_METRICS_COUNTER_REPLAY_ACCDB_ROOTED_BYTES_CVT (FD_METRICS_CONVERTER_NONE)
-
-#define FD_METRICS_COUNTER_REPLAY_ACCDB_GC_ROOT_OFF (86UL)
-#define FD_METRICS_COUNTER_REPLAY_ACCDB_GC_ROOT_NAME "replay_accdb_gc_root"
-#define FD_METRICS_COUNTER_REPLAY_ACCDB_GC_ROOT_TYPE (FD_METRICS_TYPE_COUNTER)
-#define FD_METRICS_COUNTER_REPLAY_ACCDB_GC_ROOT_DESC "Number of account database entries garbage collected"
-#define FD_METRICS_COUNTER_REPLAY_ACCDB_GC_ROOT_CVT (FD_METRICS_CONVERTER_NONE)
-
-#define FD_METRICS_COUNTER_REPLAY_ACCDB_RECLAIMED_OFF (87UL)
-#define FD_METRICS_COUNTER_REPLAY_ACCDB_RECLAIMED_NAME "replay_accdb_reclaimed"
-#define FD_METRICS_COUNTER_REPLAY_ACCDB_RECLAIMED_TYPE (FD_METRICS_TYPE_COUNTER)
-#define FD_METRICS_COUNTER_REPLAY_ACCDB_RECLAIMED_DESC "Number of account database entries reclaimed (deletion rooted)"
-#define FD_METRICS_COUNTER_REPLAY_ACCDB_RECLAIMED_CVT (FD_METRICS_CONVERTER_NONE)
-
-#define FD_METRICS_HISTOGRAM_REPLAY_ROOT_SLOT_DURATION_SECONDS_OFF (88UL)
-#define FD_METRICS_HISTOGRAM_REPLAY_ROOT_SLOT_DURATION_SECONDS_NAME "replay_root_slot_duration_seconds"
-#define FD_METRICS_HISTOGRAM_REPLAY_ROOT_SLOT_DURATION_SECONDS_TYPE (FD_METRICS_TYPE_HISTOGRAM)
-#define FD_METRICS_HISTOGRAM_REPLAY_ROOT_SLOT_DURATION_SECONDS_DESC "Time in seconds spent updating the rooted account store (one sample per block)"
-#define FD_METRICS_HISTOGRAM_REPLAY_ROOT_SLOT_DURATION_SECONDS_CVT (FD_METRICS_CONVERTER_SECONDS)
-#define FD_METRICS_HISTOGRAM_REPLAY_ROOT_SLOT_DURATION_SECONDS_MIN (0.0005)
-#define FD_METRICS_HISTOGRAM_REPLAY_ROOT_SLOT_DURATION_SECONDS_MAX (1.0)
-
-#define FD_METRICS_HISTOGRAM_REPLAY_ROOT_ACCOUNT_DURATION_SECONDS_OFF (105UL)
-#define FD_METRICS_HISTOGRAM_REPLAY_ROOT_ACCOUNT_DURATION_SECONDS_NAME "replay_root_account_duration_seconds"
-#define FD_METRICS_HISTOGRAM_REPLAY_ROOT_ACCOUNT_DURATION_SECONDS_TYPE (FD_METRICS_TYPE_HISTOGRAM)
-#define FD_METRICS_HISTOGRAM_REPLAY_ROOT_ACCOUNT_DURATION_SECONDS_DESC "Time in seconds spent updating the rooted account store (one sample per block, normalized by account count)"
-#define FD_METRICS_HISTOGRAM_REPLAY_ROOT_ACCOUNT_DURATION_SECONDS_CVT (FD_METRICS_CONVERTER_SECONDS)
-#define FD_METRICS_HISTOGRAM_REPLAY_ROOT_ACCOUNT_DURATION_SECONDS_MIN (1e-07)
-#define FD_METRICS_HISTOGRAM_REPLAY_ROOT_ACCOUNT_DURATION_SECONDS_MAX (0.1)
-
-#define FD_METRICS_COUNTER_REPLAY_ROOT_ELAPSED_SECONDS_OFF (122UL)
-#define FD_METRICS_COUNTER_REPLAY_ROOT_ELAPSED_SECONDS_NAME "replay_root_elapsed_seconds"
-#define FD_METRICS_COUNTER_REPLAY_ROOT_ELAPSED_SECONDS_TYPE (FD_METRICS_TYPE_COUNTER)
-#define FD_METRICS_COUNTER_REPLAY_ROOT_ELAPSED_SECONDS_DESC "Total time in seconds spent rooting accounts"
-#define FD_METRICS_COUNTER_REPLAY_ROOT_ELAPSED_SECONDS_CVT (FD_METRICS_CONVERTER_SECONDS)
-#define FD_METRICS_COUNTER_REPLAY_ROOT_ELAPSED_SECONDS_CNT (3UL)
-
-#define FD_METRICS_COUNTER_REPLAY_ROOT_ELAPSED_SECONDS_DB_OFF (122UL)
-#define FD_METRICS_COUNTER_REPLAY_ROOT_ELAPSED_SECONDS_COPY_OFF (123UL)
-#define FD_METRICS_COUNTER_REPLAY_ROOT_ELAPSED_SECONDS_GC_OFF (124UL)
-
-#define FD_METRICS_REPLAY_TOTAL (38UL)
+#define FD_METRICS_COUNTER_REPLAY_ACCDB_LOOKUP_FUNK_OFF (84UL)
+#define FD_METRICS_COUNTER_REPLAY_ACCDB_LOOKUP_FUNK_NAME "replay_accdb_lookup_funk"
+#define FD_METRICS_COUNTER_REPLAY_ACCDB_LOOKUP_FUNK_TYPE (FD_METRICS_TYPE_COUNTER)
+#define FD_METRICS_COUNTER_REPLAY_ACCDB_LOOKUP_FUNK_DESC "Number of account lookups resolved from funk (in-memory fork store)"
+#define FD_METRICS_COUNTER_REPLAY_ACCDB_LOOKUP_FUNK_CVT (FD_METRICS_CONVERTER_NONE)
+
+#define FD_METRICS_COUNTER_REPLAY_ACCDB_LOOKUP_SPECRD_OFF (85UL)
+#define FD_METRICS_COUNTER_REPLAY_ACCDB_LOOKUP_SPECRD_NAME "replay_accdb_lookup_specrd"
+#define FD_METRICS_COUNTER_REPLAY_ACCDB_LOOKUP_SPECRD_TYPE (FD_METRICS_TYPE_COUNTER)
+#define FD_METRICS_COUNTER_REPLAY_ACCDB_LOOKUP_SPECRD_DESC "Number of account lookups resolved from speculative read (vinyl cache)"
+#define FD_METRICS_COUNTER_REPLAY_ACCDB_LOOKUP_SPECRD_CVT (FD_METRICS_CONVERTER_NONE)
+
+#define FD_METRICS_COUNTER_REPLAY_ACCDB_LOOKUP_ACCDB_OFF (86UL)
+#define FD_METRICS_COUNTER_REPLAY_ACCDB_LOOKUP_ACCDB_NAME "replay_accdb_lookup_accdb"
+#define FD_METRICS_COUNTER_REPLAY_ACCDB_LOOKUP_ACCDB_TYPE (FD_METRICS_TYPE_COUNTER)
+#define FD_METRICS_COUNTER_REPLAY_ACCDB_LOOKUP_ACCDB_DESC "Number of account lookups sent to accdb tile (vinyl rq/cq)"
+#define FD_METRICS_COUNTER_REPLAY_ACCDB_LOOKUP_ACCDB_CVT (FD_METRICS_CONVERTER_NONE)
+
+#define FD_METRICS_COUNTER_REPLAY_ACCDB_DT_FUNK_OFF (87UL)
+#define FD_METRICS_COUNTER_REPLAY_ACCDB_DT_FUNK_NAME "replay_accdb_dt_funk"
+#define FD_METRICS_COUNTER_REPLAY_ACCDB_DT_FUNK_TYPE (FD_METRICS_TYPE_COUNTER)
+#define FD_METRICS_COUNTER_REPLAY_ACCDB_DT_FUNK_DESC "Cumulative time spent in funk (in-memory) account lookups"
+#define FD_METRICS_COUNTER_REPLAY_ACCDB_DT_FUNK_CVT (FD_METRICS_CONVERTER_SECONDS)
+
+#define FD_METRICS_COUNTER_REPLAY_ACCDB_DT_SPECRD_OFF (88UL)
+#define FD_METRICS_COUNTER_REPLAY_ACCDB_DT_SPECRD_NAME "replay_accdb_dt_specrd"
+#define FD_METRICS_COUNTER_REPLAY_ACCDB_DT_SPECRD_TYPE (FD_METRICS_TYPE_COUNTER)
+#define FD_METRICS_COUNTER_REPLAY_ACCDB_DT_SPECRD_DESC "Cumulative time spent in speculative read (vinyl cache) account lookups"
+#define FD_METRICS_COUNTER_REPLAY_ACCDB_DT_SPECRD_CVT (FD_METRICS_CONVERTER_SECONDS)
+
+#define FD_METRICS_COUNTER_REPLAY_ACCDB_DT_VINYL_OFF (89UL)
+#define FD_METRICS_COUNTER_REPLAY_ACCDB_DT_VINYL_NAME "replay_accdb_dt_vinyl"
+#define FD_METRICS_COUNTER_REPLAY_ACCDB_DT_VINYL_TYPE (FD_METRICS_TYPE_COUNTER)
+#define FD_METRICS_COUNTER_REPLAY_ACCDB_DT_VINYL_DESC "Cumulative time spent waiting for vinyl rq/cq account lookups"
+#define FD_METRICS_COUNTER_REPLAY_ACCDB_DT_VINYL_CVT (FD_METRICS_CONVERTER_SECONDS)
+
+#define FD_METRICS_REPLAY_TOTAL (35UL)
extern const fd_metrics_meta_t FD_METRICS_REPLAY[FD_METRICS_REPLAY_TOTAL];
#endif /* HEADER_fd_src_disco_metrics_generated_fd_metrics_replay_h */
diff --git a/src/disco/metrics/metrics.xml b/src/disco/metrics/metrics.xml
index 7412eb578e5..548f2613fa0 100644
--- a/src/disco/metrics/metrics.xml
+++ b/src/disco/metrics/metrics.xml
@@ -671,6 +671,12 @@ metric introduced.
+
+
+
+
+
+
@@ -798,20 +804,13 @@ metric introduced.
-
-
-
-
-
- Time in seconds spent updating the rooted account store (one sample per block)
-
-
-
- Time in seconds spent updating the rooted account store (one sample per block, normalized by account count)
-
-
-
+
+
+
+
+
+
@@ -1170,9 +1169,15 @@ metric introduced.
+
+
+
+
+
+
-
-
+
+
@@ -1270,12 +1275,6 @@ metric introduced.
-
-
-
-
-
-
@@ -1308,7 +1307,7 @@ metric introduced.
-
+
diff --git a/src/disco/topo/fd_topo.h b/src/disco/topo/fd_topo.h
index b9cf010f507..7bb6dc924d6 100644
--- a/src/disco/topo/fd_topo.h
+++ b/src/disco/topo/fd_topo.h
@@ -648,6 +648,7 @@ struct fd_topo_tile {
struct {
ulong meta_map_obj_id;
ulong meta_pool_obj_id;
+ ulong line_obj_id;
ulong line_max;
ulong data_obj_id;
char bstream_path[ PATH_MAX ];
@@ -655,6 +656,8 @@ struct fd_topo_tile {
int io_type; /* FD_VINYL_IO_TYPE_* */
uint uring_depth;
+
+ ulong write_delay_slots;
} accdb;
struct {
diff --git a/src/disco/topo/fd_topob_vinyl.h b/src/disco/topo/fd_topob_vinyl.h
index cf53ba60712..e77c01d7b6d 100644
--- a/src/disco/topo/fd_topob_vinyl.h
+++ b/src/disco/topo/fd_topob_vinyl.h
@@ -59,6 +59,16 @@ fd_topob_vinyl_rq( fd_topo_t * topo,
fd_topob_tile_uses( topo, client_tile, rq_obj, FD_SHMEM_JOIN_MODE_READ_WRITE );
fd_topob_tile_uses( topo, client_tile, cq_obj, FD_SHMEM_JOIN_MODE_READ_ONLY );
+ /* Grant read-only access to meta map and element pool for speculative
+ reads (pin-based direct cache access). If the pod keys are absent,
+ specread is simply not available for this client. */
+ ulong meta_obj_id = fd_pod_query_ulong( topo->props, "accdb.meta_map", ULONG_MAX );
+ ulong ele_obj_id = fd_pod_query_ulong( topo->props, "accdb.meta_pool", ULONG_MAX );
+ if( meta_obj_id!=ULONG_MAX && ele_obj_id!=ULONG_MAX ) {
+ fd_topob_tile_uses( topo, client_tile, &topo->objs[ meta_obj_id ], FD_SHMEM_JOIN_MODE_READ_ONLY );
+ fd_topob_tile_uses( topo, client_tile, &topo->objs[ ele_obj_id ], FD_SHMEM_JOIN_MODE_READ_ONLY );
+ }
+
FD_TEST( rq_obj->label_idx==req_pool_obj->label_idx ); /* keep rq and req_pool in sync */
return rq_obj;
}
diff --git a/src/discof/accdb/fd_accdb_case_acquire.c b/src/discof/accdb/fd_accdb_case_acquire.c
new file mode 100644
index 00000000000..e517f5cd57a
--- /dev/null
+++ b/src/discof/accdb/fd_accdb_case_acquire.c
@@ -0,0 +1,220 @@
+ case FD_VINYL_REQ_TYPE_ACQUIRE: {
+ FD_MCNT_INC( ACCDB, REQUEST_BATCHES, 1UL );
+ FD_MCNT_INC( ACCDB, REQUESTS, batch_cnt );
+
+ ulong req_flags = (ulong)req->flags;
+ fd_vinyl_key_t const * req_key = MAP_REQ_GADDR( req->key_gaddr, fd_vinyl_key_t, batch_cnt );
+ ulong * req_val_gaddr = MAP_REQ_GADDR( req->val_gaddr_gaddr, ulong, batch_cnt );
+ schar * req_err = MAP_REQ_GADDR( req->err_gaddr, schar, batch_cnt );
+
+ int req_evict_prio = fd_vinyl_req_evict_prio( req_flags );
+
+ int bad_gaddr = (!!batch_cnt) & ((!req_key) | (!req_val_gaddr) | (!req_err));
+
+ if( FD_UNLIKELY( bad_gaddr ) ) {
+ comp_err = FD_VINYL_ERR_INVAL;
+ break;
+ }
+
+ for( ulong batch_idx=0UL; batch_idxline_idx==line_idx, "corruption detected" );
+
+ ulong line_ctl = line[ line_idx ].ctl;
+
+ long ref = fd_accdb_line_ctl_ref( line_ctl );
+
+ /* At this point, we are acquiring a cached pair for read.
+ If the line is acquired for modify, fail with AGAIN. If
+ there are too many acquires for read on this pair, CRIT
+ (could consider AGAIN here). Otherwise, we update the
+ ref count (don't change the ver), point the client at the
+ line caching pair key to finish the acquire. Note that
+ we don't validate the pair header if we detect that an
+ earlier acquire in this batch started fetching the pair
+ because the read might still be in progress (see note
+ below for more details). */
+
+ if( FD_UNLIKELY( ref<0L ) ) DONE( FD_VINYL_ERR_AGAIN );
+ if( FD_UNLIKELY( ref>=FD_VINYL_LINE_REF_MAX ) ) FD_LOG_CRIT(( "too many acquires for read on this pair" ));
+
+ if( FD_LIKELY( !obj->rd_active ) ) {
+ fd_vinyl_bstream_phdr_t * phdr = fd_vinyl_data_obj_phdr( obj );
+
+ FD_CRIT( fd_vinyl_data_obj_val_max( obj ) >= val_sz, "corruption detected" );
+ FD_CRIT( phdr->ctl==fd_vinyl_bstream_ctl( FD_VINYL_BSTREAM_CTL_TYPE_PAIR,
+ FD_VINYL_BSTREAM_CTL_STYLE_RAW, val_sz ), "corruption detected" );
+ FD_CRIT( fd_vinyl_key_eq( &phdr->key, key ), "corruption detected" );
+ FD_CRIT( !memcmp( &phdr->info, &ele0[ ele_idx ].phdr.info, sizeof(fd_vinyl_info_t) ), "corruption detected" );
+ }
+
+ FD_ATOMIC_FETCH_AND_ADD( &line[ line_idx ].ctl, 1UL );
+
+ req_val_gaddr[ batch_idx ] = fd_vinyl_data_gaddr( fd_vinyl_data_obj_val( obj ), data_laddr0 );
+
+ DONE( FD_VINYL_SUCCESS );
+
+ } /* pair key data cached */
+
+ /* At this point, pair key is not cached. If we are not allowed
+ to acquire this pair, fail. Otherwise, evict the least
+ recently used evictable line (this should always be possible
+ if quotas are configured correctly) to make room to cache this
+ pair. Connect this line to meta element ele_idx, set the
+ line's reference count appropriately, bump the line's version
+ and move the line to the desired location in the eviction
+ sequence. We don't modify any shared fields in meta element
+ ele_idx so we can do the modification fast.
+
+ We do this upfront to free data cache for the alloc if the
+ LRU line is in use and to handle the same pair appearing
+ multiple times in an acquire.
+
+ That is, if req_key appears multiple times in an acquire to
+ modify, the trailing redundant acquires will see the object
+ as cached with ref==-1 and fail with AGAIN. If the key
+ appears multiple times in an acquire for read, the trailing
+ redundant acquires will see the object as cached with ref>0
+ and rd_active==1, conclude that the first redundant acquire
+ is in the process of reading the pair into cache, skip any
+ racy metadata checks, increase the ref count and succeed.
+
+ IMPORTANT SAFETY TIP! Note that this implies that a client
+ doing an acquire-for-read with redundant keys and with
+ speculative processing will see req_err transition to success
+ for the trailing redundant items for a key before the leading
+ item of that key transitions to success (and thus before the
+ object is fully read / verified and/or decoded). It is up to
+ the client doing speculative cut through processing to avoid
+ redundant keys or react accordingly. */
+
+ line_idx = fd_accdb_clock_evict( ctx, line, line_cnt, ele0, ele_max, data );
+
+ line[ line_idx ].ele_idx = ele_idx; ele0[ ele_idx ].line_idx = line_idx;
+ FD_ATOMIC_FETCH_AND_ADD( &line[ line_idx ].ctl, 1UL );
+ if( req_evict_prio<=FD_VINYL_LINE_EVICT_PRIO_MRU ) {
+ FD_ATOMIC_FETCH_AND_OR( &line[ line_idx ].ctl, FD_ACCDB_LINE_CTL_CHANCE );
+ }
+
+ /* Allocate an appropriately sized object to hold this pair,
+ connect it to this line and report the location to the client. */
+
+ ulong val_max = val_sz;
+
+ ulong szc = fd_vinyl_data_szc( val_max );
+
+ fd_vinyl_data_obj_t * obj = fd_vinyl_data_alloc( data, szc );
+ if( FD_UNLIKELY( !obj ) ) FD_LOG_CRIT(( "increase data cache size" ));
+
+ line[ line_idx ].obj_gaddr = fd_vinyl_data_gaddr( obj, data_laddr0 ); obj->line_idx = line_idx;
+
+ void * val = fd_vinyl_data_obj_val( obj );
+
+ req_val_gaddr[ batch_idx ] = fd_vinyl_data_gaddr( val, data_laddr0 );
+
+ /* If we need to do I/O, start reading encoded pair data and
+ defer the data integrity and decoding to later (and then in
+ whatever order the I/O layer sees fit). */
+
+ obj->rd_active = (short)1;
+
+ int style = fd_vinyl_bstream_ctl_style( pair_ctl );
+ ulong val_esz = fd_vinyl_bstream_ctl_sz ( pair_ctl );
+
+ FD_CRIT( val_esz<=FD_VINYL_VAL_MAX, "corruption detected" );
+ FD_CRIT( (style!=FD_VINYL_BSTREAM_CTL_STYLE_RAW) | (val_sz==val_esz), "corruption detected" );
+
+ fd_vinyl_data_obj_t * cobj;
+
+ if( FD_LIKELY( style==FD_VINYL_BSTREAM_CTL_STYLE_RAW ) ) cobj = obj;
+ else {
+ cobj = fd_vinyl_data_alloc( data, fd_vinyl_data_szc( val_esz ) );
+ if( FD_UNLIKELY( !cobj ) ) FD_LOG_CRIT(( "increase data cache size" ));
+ }
+
+ cobj->rd->ctx = (ulong)obj;
+ cobj->rd->seq = ele0[ ele_idx ].seq;
+ cobj->rd->dst = fd_vinyl_data_obj_phdr( cobj );
+ cobj->rd->sz = fd_vinyl_bstream_pair_sz( val_esz );
+
+ cobj->rd_err = req_err + batch_idx;
+
+ fd_vinyl_io_read( io, cobj->rd );
+ read_cnt++;
+
+ goto next_acquire;
+
+ } /* pair key meta cached */
+
+ /* At this point, pair key does not exist at bstream seq_present
+ and is not in the process of being created. */
+
+ DONE( FD_VINYL_ERR_KEY );
+
+ next_acquire: /* silly language restriction */;
+
+# undef DONE
+
+ } /* for batch_idx */
+
+ FD_CRIT( !read_cnt, "corruption detected" );
+
+ comp_err = FD_VINYL_SUCCESS;
+ break;
+ }
diff --git a/src/discof/accdb/fd_accdb_line_ctl.h b/src/discof/accdb/fd_accdb_line_ctl.h
new file mode 100644
index 00000000000..38fa03b3abc
--- /dev/null
+++ b/src/discof/accdb/fd_accdb_line_ctl.h
@@ -0,0 +1,38 @@
+#ifndef HEADER_fd_src_discof_accdb_fd_accdb_line_ctl_h
+#define HEADER_fd_src_discof_accdb_fd_accdb_line_ctl_h
+
+/* fd_accdb_line_ctl.h provides the ctl field encoding for accdb cache
+ lines. This header is shared between the accdb tile
+ (fd_accdb_tile_private.h) and specread clients (fd_accdb_specread.h).
+
+ Layout:
+ bits [32,64) version (same as fd_vinyl_line_ctl)
+ bit 25 EVICTING
+ bit 24 CHANCE
+ bits [0,24) ref + 1 (combined client + specread ref count)
+
+ Specread pin: FETCH_AND_ADD(&ctl, 1UL), check old & EVICTING
+ Specread unpin: FETCH_AND_SUB(&ctl, 1UL)
+ CHANCE set: FETCH_AND_OR(&ctl, FD_ACCDB_LINE_CTL_CHANCE)
+ CHANCE clear: FETCH_AND_AND(&ctl, ~FD_ACCDB_LINE_CTL_CHANCE)
+ EVICTING set: CAS or FETCH_AND_OR on ctl
+ Version bump: CAS loop (preserves in-flight specread refs) */
+
+#include "../../util/fd_util_base.h"
+
+#define FD_ACCDB_LINE_CTL_CHANCE (1UL << 24)
+#define FD_ACCDB_LINE_CTL_EVICTING (1UL << 25)
+
+FD_PROTOTYPES_BEGIN
+
+FD_FN_CONST static inline ulong
+fd_accdb_line_ctl( ulong ver, long ref ) { /* ver -> bits [32,64); ref+1 -> low bits (not masked to 24 bits here) */
+  return ((ulong)(ref + 1L)) | (ver << 32);
+}
+
+FD_FN_CONST static inline ulong fd_accdb_line_ctl_ver( ulong ctl ) { ulong ver = ctl >> 32; return ver; } /* bits [32,64) */
+FD_FN_CONST static inline long fd_accdb_line_ctl_ref( ulong ctl ) { return (long)(ctl & (FD_ACCDB_LINE_CTL_CHANCE-1UL)) - 1L; } /* low 24 bits hold ref+1 */
+
+FD_PROTOTYPES_END
+
+#endif /* HEADER_fd_src_discof_accdb_fd_accdb_line_ctl_h */
diff --git a/src/discof/accdb/fd_accdb_tile.c b/src/discof/accdb/fd_accdb_tile.c
index 6bd722083de..593b70c38d8 100644
--- a/src/discof/accdb/fd_accdb_tile.c
+++ b/src/discof/accdb/fd_accdb_tile.c
@@ -8,11 +8,9 @@
- Sandboxing */
#define _GNU_SOURCE
+#include "fd_accdb_tile_private.h"
#include "../../disco/topo/fd_topo.h"
#include "../../disco/metrics/fd_metrics.h"
-#include "../../discof/restore/fd_snapct_tile.h"
-#include "../../vinyl/fd_vinyl.h"
-#include "../../vinyl/fd_vinyl_base.h"
#include "../../vinyl/io/ur/fd_vinyl_io_ur.h"
#include "../../util/pod/fd_pod_format.h"
#include "../../util/io_uring/fd_io_uring_setup.h"
@@ -28,29 +26,14 @@
#define NAME "accdb"
#define MAX_INS 8
+#include "fd_accdb_tile_cache.c"
+#include "fd_accdb_tile_root.c"
+
/* For io_ur backend, this controls the size of the write-back cache.
This should be larger than the cumulative record size of all unique
changed accounts in a slot. */
#define IO_SPAD_MAX (128UL<<20)
-#define FD_VINYL_CLIENT_MAX (1024UL)
-#define FD_VINYL_REQ_MAX (1024UL)
-
-struct fd_vinyl_client {
- fd_vinyl_rq_t * rq; /* Channel for requests from this client (could be shared by multiple vinyl instances) */
- fd_vinyl_cq_t * cq; /* Channel for completions from this client to this vinyl instance
- (could be shared by multiple receivers of completions from this vinyl instance). */
- ulong burst_max; /* Max requests receive from this client at a time */
- ulong seq; /* Sequence number of the next request to receive in the rq */
- ulong link_id; /* Identifies requests from this client to this vinyl instance in the rq */
- ulong laddr0; /* A valid non-zero gaddr from this client maps to the vinyl instance's laddr laddr0 + gaddr ... */
- ulong laddr1; /* ... and thus is in (laddr0,laddr1). A zero gaddr maps to laddr NULL. */
- ulong quota_rem; /* Num of remaining acquisitions this client is allowed on this vinyl instance */
- ulong quota_max; /* Max quota */
-};
-
-typedef struct fd_vinyl_client fd_vinyl_client_t;
-
/* MAP_REQ_GADDR maps a request global address req_gaddr to an array of
cnt T's into the local address space as a T * pointer. If the result
is not properly aligned or the entire range does not completely fall
@@ -74,81 +57,6 @@ fd_vinyl_laddr( ulong req_gaddr,
req_laddr0, 0UL );
}
-struct fd_vinyl_tile {
-
- /* Vinyl objects */
-
- fd_vinyl_t vinyl[1];
- void * io_mem;
-
- /* Tile architecture */
-
- uint booted : 1;
- uint shutdown : 1;
- struct {
- ulong state_expected;
- ulong volatile const * state;
- ulong volatile const * pair_cnt;
- /* When booting from genesis only */
- struct {
- ulong io_seed;
- } from_genesis;
- } boot;
-
- /* I/O */
-
- int bstream_fd;
- ulong bstream_file_sz;
-
- /* io_uring */
-
- fd_io_uring_t ring[1];
- void * ioring_shmem; /* shared between kernel and user */
-
- /* Clients */
-
- fd_vinyl_client_t _client[ FD_VINYL_CLIENT_MAX ];
- ulong client_cnt;
- ulong client_idx;
-
- /* Received requests */
-
- fd_vinyl_req_t _req[ FD_VINYL_REQ_MAX ];
- ulong req_head; /* Requests [0,req_head) have been processed */
- ulong req_tail; /* Requests [req_head,req_tail) are pending */
- /* Requests [req_tail,ULONG_MAX) have not been received */
- ulong exec_max;
-
- /* accum_dead_cnt is the number of dead blocks that have been
- written since the last partition block.
-
- accum_move_cnt is the number of move blocks that have been
- written since this last partition block.
-
- accum_garbage_cnt / sz is the number of items / bytes garbage in
- the bstream that have accumulated since the last time we compacted
- the bstream. We use this to estimate the number of rounds of
- compaction to do in async handling. */
-
- ulong accum_dead_cnt;
- ulong accum_garbage_cnt;
- ulong accum_garbage_sz;
-
- /* Run loop state */
-
- ulong seq_part;
-
- /* Periodic syncing */
-
- long sync_next_ns;
-
- /* Vinyl limit on the number of pairs the meta map will accept.
- Exceeding this limit will trigger a LOG_ERR. */
- ulong pair_cnt_limit;
-};
-
-typedef struct fd_vinyl_tile fd_vinyl_tile_t;
-
/* Vinyl state object */
static ulong
@@ -160,7 +68,6 @@ struct fd_accdb_tile_layout {
ulong footprint;
ulong io_off;
ulong io_uring_shmem_off;
- ulong vinyl_line_off;
};
typedef struct fd_accdb_tile_layout fd_accdb_tile_layout_t;
@@ -171,7 +78,7 @@ fd_accdb_tile_layout( fd_accdb_tile_layout_t * layout,
memset( layout, 0, sizeof(fd_accdb_tile_layout_t) );
FD_SCRATCH_ALLOC_INIT( l, NULL );
- ulong ctx_off = (ulong)FD_SCRATCH_ALLOC_APPEND( l, alignof(fd_vinyl_tile_t), sizeof(fd_vinyl_tile_t) );
+ ulong ctx_off = (ulong)FD_SCRATCH_ALLOC_APPEND( l, alignof(fd_accdb_tile_t), sizeof(fd_accdb_tile_t) );
FD_TEST( ctx_off==0UL );
switch( tile->accdb.io_type ) {
@@ -189,8 +96,6 @@ fd_accdb_tile_layout( fd_accdb_tile_layout_t * layout,
FD_LOG_CRIT(( "invalid tile->accdb.io_type %d", tile->accdb.io_type ));
}
- layout->vinyl_line_off = (ulong)FD_SCRATCH_ALLOC_APPEND(
- l, alignof(fd_vinyl_line_t), sizeof(fd_vinyl_line_t)*tile->accdb.line_max );
layout->footprint = FD_SCRATCH_ALLOC_FINI( l, scratch_align() );
}
@@ -216,7 +121,7 @@ populate_allowed_fds( fd_topo_t const * topo,
void * scratch = fd_topo_obj_laddr( topo, tile->tile_obj_id );
FD_SCRATCH_ALLOC_INIT( l, scratch );
- fd_vinyl_tile_t * ctx = FD_SCRATCH_ALLOC_APPEND( l, alignof(fd_vinyl_tile_t), sizeof(fd_vinyl_tile_t) );
+ fd_accdb_tile_t * ctx = FD_SCRATCH_ALLOC_APPEND( l, alignof(fd_accdb_tile_t), sizeof(fd_accdb_tile_t) );
out_fds[ out_cnt++ ] = ctx->bstream_fd;
@@ -232,14 +137,14 @@ populate_allowed_seccomp( fd_topo_t const * topo,
struct sock_filter * out ) {
void * scratch = fd_topo_obj_laddr( topo, tile->tile_obj_id );
FD_SCRATCH_ALLOC_INIT( l, scratch );
- fd_vinyl_tile_t * ctx = FD_SCRATCH_ALLOC_APPEND( l, alignof(fd_vinyl_tile_t), sizeof(fd_vinyl_tile_t) );
+ fd_accdb_tile_t * ctx = FD_SCRATCH_ALLOC_APPEND( l, alignof(fd_accdb_tile_t), sizeof(fd_accdb_tile_t) );
populate_sock_filter_policy_fd_accdb_tile( out_cnt, out, (uint)fd_log_private_logfile_fd(), (uint)ctx->bstream_fd, (uint)ctx->ring->ioring_fd );
return sock_filter_policy_fd_accdb_tile_instr_cnt;
}
static void
-vinyl_io_uring_init( fd_vinyl_tile_t * ctx,
+vinyl_io_uring_init( fd_accdb_tile_t * ctx,
uint uring_depth,
int dev_fd ) {
fd_io_uring_params_t params[1];
@@ -286,10 +191,10 @@ privileged_init( fd_topo_t * topo,
FD_LOG_ERR(( "invalid vinyl_line_max %lu", tile->accdb.line_max ));
}
- fd_vinyl_tile_t * ctx = fd_topo_obj_laddr( topo, tile->tile_obj_id );
+ fd_accdb_tile_t * ctx = fd_topo_obj_laddr( topo, tile->tile_obj_id );
ulong ctx_laddr = (ulong)ctx;
- memset( ctx, 0, sizeof(fd_vinyl_tile_t) );
+ memset( ctx, 0, sizeof(fd_accdb_tile_t) );
ctx->bstream_fd = -1;
ctx->ring->ioring_fd = -1;
@@ -303,11 +208,11 @@ privileged_init( fd_topo_t * topo,
ctx->ioring_shmem = (void *)( ctx_laddr + layout->io_uring_shmem_off );
}
- fd_vinyl_line_t * _line = (void *)( ctx_laddr + layout->vinyl_line_off );
+ fd_vinyl_line_t * _line = fd_topo_obj_laddr( topo, tile->accdb.line_obj_id );
vinyl->cnc = NULL;
vinyl->io = NULL;
- vinyl->line = (fd_vinyl_line_t *)_line;
+ vinyl->line = _line;
vinyl->line_footprint = line_footprint;
/* FIXME use O_DIRECT? */
@@ -338,7 +243,7 @@ static void
unprivileged_init( fd_topo_t * topo,
fd_topo_tile_t * tile ) {
- fd_vinyl_tile_t * ctx = fd_topo_obj_laddr( topo, tile->tile_obj_id );
+ fd_accdb_tile_t * ctx = fd_topo_obj_laddr( topo, tile->tile_obj_id );
fd_vinyl_t * vinyl = ctx->vinyl;
ctx->sync_next_ns = fd_log_wallclock();
@@ -381,7 +286,6 @@ unprivileged_init( fd_topo_t * topo,
vinyl->gc_thresh = gc_thresh;
vinyl->gc_eager = gc_eager;
vinyl->style = FD_VINYL_BSTREAM_CTL_STYLE_RAW;
- vinyl->line_idx_lru = 0U;
vinyl->pair_cnt = 0UL;
vinyl->garbage_sz = 0UL;
@@ -392,13 +296,13 @@ unprivileged_init( fd_topo_t * topo,
fd_vinyl_line_t * line = vinyl->line;
for( ulong line_idx=0UL; line_idxclock_hand = 0U;
+
# undef TEST
ulong snapwm_tile_idx = fd_topo_find_tile( topo, "snapwm", 0UL );
@@ -522,12 +426,46 @@ unprivileged_init( fd_topo_t * topo,
} /* client join loop */
+ /* Initialize rooting state */
+
+ ctx->root_txn = NULL;
+ ctx->root_rec = NULL;
+ ctx->root_txn_idx = 0UL;
+ ctx->root_target_xid = (fd_funk_txn_xid_t){ .ul = { ULONG_MAX, ULONG_MAX } };
+ ctx->write_delay_slots = tile->accdb.write_delay_slots;
+
+ /* Join funk for rooting operations */
+
+ ulong funk_obj_id = fd_pod_query_ulong( topo->props, "funk", ULONG_MAX );
+ ulong funk_locks_obj_id = fd_pod_query_ulong( topo->props, "funk_locks", ULONG_MAX );
+ if( funk_obj_id!=ULONG_MAX && funk_locks_obj_id!=ULONG_MAX ) {
+ FD_TEST( fd_funk_join( ctx->funk,
+ fd_topo_obj_laddr( topo, funk_obj_id ),
+ fd_topo_obj_laddr( topo, funk_locks_obj_id ) ) );
+ }
+
+ /* Discover the replay_accdb input link for root messages */
+
+ ctx->root_in_mem = NULL;
+ ctx->root_in_chunk0 = 0UL;
+ ctx->root_in_wmark = 0UL;
+
+  /* Scan this tile's input links for the one named "replay_accdb" and
+     cache its dcache workspace, chunk0 and wmark; stop at first match. */
+  for( ulong i=0UL; i<tile->in_cnt; i++ ) {
+    fd_topo_link_t const * in_link = &topo->links[ tile->in_link_id[ i ] ];
+    if( !strcmp( in_link->name, "replay_accdb" ) ) {
+      ctx->root_in_mem    = topo->workspaces[ topo->objs[ in_link->dcache_obj_id ].wksp_id ].wksp;
+      ctx->root_in_chunk0 = fd_dcache_compact_chunk0( ctx->root_in_mem, in_link->dcache );
+      ctx->root_in_wmark  = fd_dcache_compact_wmark ( ctx->root_in_mem, in_link->dcache, in_link->mtu );
+      break;
+    }
+  }
+
}
/* during_housekeeping is called periodically (approx every STEM_LAZY ns) */
static void
-during_housekeeping( fd_vinyl_tile_t * ctx ) {
+during_housekeeping( fd_accdb_tile_t * ctx ) {
fd_vinyl_t * vinyl = ctx->vinyl;
@@ -558,6 +496,61 @@ during_housekeeping( fd_vinyl_tile_t * ctx ) {
ctx->booted = 1;
}
+ /* --- Root processing ---
+ Continue any in-progress batch, or start rooting the oldest
+ unrooted funk txn if write_delay_slots allows. */
+
+ if( FD_UNLIKELY( ctx->root_rec ) ) {
+ /* Batch in progress — do one batch */
+ ctx->root_rec = fd_accdb_v2_root_batch( ctx, ctx->root_rec );
+ if( !ctx->root_rec ) {
+ fd_accdb_txn_root_fini( ctx, ctx->root_txn, ctx->root_txn_idx );
+ ctx->root_txn = NULL;
+ }
+
+ } else if( FD_UNLIKELY( ctx->root_txn ) ) {
+ /* Previous root had no records left — clean up */
+ fd_accdb_txn_root_fini( ctx, ctx->root_txn, ctx->root_txn_idx );
+ ctx->root_txn = NULL;
+
+ } else if( FD_LIKELY( ctx->root_target_xid.ul[0]!=ULONG_MAX ) ) {
+ /* Check if there's a child txn to root */
+ fd_funk_t * funk = ctx->funk;
+ ulong child_idx = fd_funk_txn_idx( funk->shmem->child_head_cidx );
+
+ if( !fd_funk_txn_idx_is_null( child_idx ) ) {
+ fd_funk_txn_t * child = &funk->txn_pool->ele[ child_idx ];
+ fd_funk_txn_xid_t const * child_xid = fd_funk_txn_xid( child );
+
+ ulong target_slot = ctx->root_target_xid.ul[0];
+ ulong child_slot = child_xid->ul[0];
+
+ if( child_slot <= target_slot ) {
+ int genesis_override = !child_slot;
+ int delay_ok = genesis_override
+ || !ctx->write_delay_slots
+ || (target_slot - child_slot >= ctx->write_delay_slots);
+
+ if( delay_ok ) {
+ ctx->root_txn = child;
+ ctx->root_txn_idx = child_idx;
+ ctx->root_rec = fd_accdb_txn_root_start( ctx, child );
+
+ if( ctx->root_rec ) {
+ ctx->root_rec = fd_accdb_v2_root_batch( ctx, ctx->root_rec );
+ if( !ctx->root_rec ) {
+ fd_accdb_txn_root_fini( ctx, ctx->root_txn, ctx->root_txn_idx );
+ ctx->root_txn = NULL;
+ }
+ } else {
+ fd_accdb_txn_root_fini( ctx, ctx->root_txn, ctx->root_txn_idx );
+ ctx->root_txn = NULL;
+ }
+ }
+ }
+ }
+ }
+
/* If we've written enough to justify appending a parallel
recovery partition, append one. */
@@ -607,21 +600,21 @@ during_housekeeping( fd_vinyl_tile_t * ctx ) {
times for highest performance, etc) and unaccounted zero
padding garbage to be absorbed when nothing else is going on. */
- int gc_eager = vinyl->gc_eager;
- if( FD_LIKELY( gc_eager>=0 ) ) {
+ // int gc_eager = vinyl->gc_eager;
+ // if( FD_LIKELY( gc_eager>=0 ) ) {
- /* Saturating wide left shift */
- ulong overflow = (ctx->accum_garbage_cnt >> (63-gc_eager) >> 1); /* sigh ... avoid wide shift UB */
- ulong compact_max = fd_ulong_max( fd_ulong_if( !overflow, ctx->accum_garbage_cnt << gc_eager, ULONG_MAX ), 1UL );
+ // /* Saturating wide left shift */
+ // ulong overflow = (ctx->accum_garbage_cnt >> (63-gc_eager) >> 1); /* sigh ... avoid wide shift UB */
+ // ulong compact_max = fd_ulong_max( fd_ulong_if( !overflow, ctx->accum_garbage_cnt << gc_eager, ULONG_MAX ), 1UL );
- /**/ ctx->accum_garbage_cnt = 0UL;
- vinyl->garbage_sz += ctx->accum_garbage_sz; ctx->accum_garbage_sz = 0UL;
+ // /**/ ctx->accum_garbage_cnt = 0UL;
+ // vinyl->garbage_sz += ctx->accum_garbage_sz; ctx->accum_garbage_sz = 0UL;
- ulong garbage_pre = vinyl->garbage_sz;
- fd_vinyl_compact( vinyl, compact_max );
- FD_MCNT_INC( ACCDB, CUM_GC_BYTES, garbage_pre - vinyl->garbage_sz );
+ // ulong garbage_pre = vinyl->garbage_sz;
+ // fd_accdb_compact( vinyl, compact_max );
+ // FD_MCNT_INC( ACCDB, CUM_GC_BYTES, garbage_pre - vinyl->garbage_sz );
- }
+ // }
/* Update vinyl sync block
(Required to reclaim bstream space freed by compaction) */
@@ -655,7 +648,7 @@ during_housekeeping( fd_vinyl_tile_t * ctx ) {
/* If should_shutdown returns non-zero, the vinyl tile is shut down */
static int
-should_shutdown( fd_vinyl_tile_t * ctx ) {
+should_shutdown( fd_accdb_tile_t * ctx ) {
if( FD_UNLIKELY( !ctx->booted ) ) return 0;
if( FD_LIKELY( !ctx->shutdown ) ) return 0;
@@ -699,7 +692,7 @@ should_shutdown( fd_vinyl_tile_t * ctx ) {
}
static void
-metrics_write( fd_vinyl_tile_t * ctx ) {
+metrics_write( fd_accdb_tile_t * ctx ) {
if( FD_UNLIKELY( !ctx->booted ) ) return;
fd_vinyl_t * vinyl = ctx->vinyl;
fd_vinyl_io_t * io = vinyl->io;
@@ -731,7 +724,7 @@ metrics_write( fd_vinyl_tile_t * ctx ) {
/* before_credit runs every main loop iteration */
static void
-before_credit( fd_vinyl_tile_t * ctx,
+before_credit( fd_accdb_tile_t * ctx,
fd_stem_context_t * stem,
int * charge_busy ) {
(void)stem;
@@ -744,17 +737,11 @@ before_credit( fd_vinyl_tile_t * ctx,
fd_vinyl_line_t * line = vinyl->line;
fd_vinyl_data_t * data = vinyl->data;
- ulong pair_max = vinyl->pair_max;
-
fd_vinyl_meta_ele_t * ele0 = meta->ele;
ulong ele_max = meta->ele_max;
ulong meta_seed = meta->seed;
- ulong * lock = meta->lock;
- int lock_shift = meta->lock_shift;
- ulong data_laddr0 = (ulong)data->laddr0;
- fd_vinyl_data_vol_t const * vol = data->vol;
- ulong vol_cnt = data->vol_cnt;
+ void * data_laddr0 = data->laddr0;
ulong line_cnt = vinyl->line_cnt;
@@ -865,40 +852,15 @@ before_credit( fd_vinyl_tile_t * ctx,
ulong fail_cnt = 0UL;
ulong read_cnt = 0UL;
- ulong append_cnt = 0UL;
- ulong accum_cache_hit = 0UL;
switch( req->type ) {
-
-# include "../../vinyl/fd_vinyl_case_acquire.c"
-# include "../../vinyl/fd_vinyl_case_release.c"
-# include "../../vinyl/fd_vinyl_case_erase.c"
- /* FIXME support more request types */
-
+# include "fd_accdb_case_acquire.c"
default:
FD_LOG_CRIT(( "unsupported request type %u", (uint)req->type ));
comp_err = FD_VINYL_ERR_INVAL;
break;
}
- FD_MCNT_INC( ACCDB, REQUEST_BATCHES, 1UL );
- switch( req->type ) {
- case FD_VINYL_REQ_TYPE_ACQUIRE:
- FD_MCNT_INC( ACCDB, REQUESTS_ACQUIRE, batch_cnt );
- FD_MCNT_INC( ACCDB, READ_OPS_SHARED_CACHE, accum_cache_hit );
- break;
- case FD_VINYL_REQ_TYPE_RELEASE:
- /* FIXME missing metrics:
- - ReadBytes(SharedCache)
- - WriteOps(SharedCache)
- - WriteBytes(SharedCache) */
- FD_MCNT_INC( ACCDB, REQUESTS_RELEASE, batch_cnt );
- break;
- case FD_VINYL_REQ_TYPE_ERASE:
- FD_MCNT_INC( ACCDB, REQUESTS_ERASE, batch_cnt );
- break;
- }
-
for( ; read_cnt; read_cnt-- ) {
fd_vinyl_io_rd_t * _rd; /* avoid pointer escape */
fd_vinyl_io_poll( io, &_rd, FD_VINYL_IO_FLAG_BLOCKING );
@@ -929,8 +891,8 @@ before_credit( fd_vinyl_tile_t * ctx,
ulong line_idx = obj->line_idx;
- FD_CRIT( line_idxctl = fd_vinyl_bstream_ctl( FD_VINYL_BSTREAM_CTL_TYPE_PAIR, FD_VINYL_BSTREAM_CTL_STYLE_RAW, val_sz );
- phdr->key = cphdr->key;
- phdr->info = cphdr->info;
-
} else {
FD_LOG_CRIT(( "corrupt bstream record (seq=%lu cpair_style=%d)", seq, cpair_style ));
}
@@ -987,28 +933,10 @@ before_credit( fd_vinyl_tile_t * ctx,
}
- if( FD_UNLIKELY( append_cnt ) ) fd_vinyl_io_commit( io, FD_VINYL_IO_FLAG_BLOCKING );
-
if( FD_LIKELY( comp_err<=0 ) ) fd_vinyl_cq_send( cq, comp, req_id, link_id, comp_err, batch_cnt, fail_cnt, quota_rem );
client->quota_rem = quota_rem;
- /* Update metrics. Derive counters from vinyl locals
-
- append_cnt is incremented in these places:
- - fd_vinyl_case_erase.c (fd_vinyl_io_append_dead, with accum_dead_cnt)
- - fd_vinyl_case_move.c (fd_vinyl_io_append_move, with accum_move_cnt)
- - fd_vinyl_case_move.c (fd_vinyl_io_append(pair))
- - fd_vinyl_case_release.c (fd_vinyl_io_append_pair_inplace)
- - fd_vinyl_case_release.c (fd_vinyl_io_append_dead, with accum_dead_cnt)
-
- We can thus infer the number of pair blocks appended by
- subtracting accum_* */
-
- ulong const dead_cnt = accum_dead_cnt - ctx->accum_dead_cnt;
- FD_MCNT_INC( ACCDB, BLOCKS_PAIR, append_cnt - dead_cnt );
- FD_MCNT_INC( ACCDB, BLOCKS_DEAD, dead_cnt );
-
}
ctx->accum_dead_cnt = accum_dead_cnt;
@@ -1016,11 +944,49 @@ before_credit( fd_vinyl_tile_t * ctx,
ctx->accum_garbage_sz = accum_garbage_sz;
}
+/* during_frag copies the root xid from the replay_accdb link dcache
+ into pending_xid (scratch). Committed in after_frag. */
+
+static inline void
+during_frag( fd_accdb_tile_t * ctx,
+             ulong             in_idx FD_PARAM_UNUSED,
+             ulong             seq    FD_PARAM_UNUSED,
+             ulong             sig    FD_PARAM_UNUSED,
+             ulong             chunk,
+             ulong             sz,
+             ulong             ctl    FD_PARAM_UNUSED ) {
+
+  /* Frags whose payload is not exactly one fd_funk_txn_xid_t are
+     ignored without touching pending_xid. */
+
+  if( FD_UNLIKELY( sz!=sizeof(fd_funk_txn_xid_t) ) ) return;
+
+  /* The chunk must lie in [root_in_chunk0,root_in_wmark] before it is
+     translated to a local address; anything else is logged via
+     FD_LOG_ERR. */
+
+  if( FD_UNLIKELY( chunk<ctx->root_in_chunk0 || chunk>ctx->root_in_wmark ) ) {
+    FD_LOG_ERR(( "chunk %lu out of range [%lu,%lu]", chunk, ctx->root_in_chunk0, ctx->root_in_wmark ));
+  }
+
+  /* Copy the xid out of the dcache into the tile context. */
+
+  fd_funk_txn_xid_t const * xid = fd_chunk_to_laddr_const( ctx->root_in_mem, chunk );
+  ctx->pending_xid = *xid;
+}
+
+/* after_frag updates root_target_xid with the confirmed frag.
+ Always consumes immediately — root processing happens in
+ during_housekeeping using funk's child list. */
+
+static inline void
+after_frag( fd_accdb_tile_t *   ctx,
+            ulong               in_idx FD_PARAM_UNUSED,
+            ulong               seq    FD_PARAM_UNUSED,
+            ulong               sig    FD_PARAM_UNUSED,
+            ulong               sz,
+            ulong               tsorig FD_PARAM_UNUSED,
+            ulong               tspub  FD_PARAM_UNUSED,
+            fd_stem_context_t * stem   FD_PARAM_UNUSED ) {
+  /* Only a frag whose size is exactly one xid updates the root target;
+     for those, pending_xid becomes the new root_target_xid. */
+  if( FD_LIKELY( sz==sizeof(fd_funk_txn_xid_t) ) ) ctx->root_target_xid = ctx->pending_xid;
+}
+
#define STEM_BURST (1UL)
#define STEM_LAZY (10000) /* housekeep every 10 us */
-#define STEM_CALLBACK_CONTEXT_TYPE fd_vinyl_tile_t
+#define STEM_CALLBACK_CONTEXT_TYPE fd_accdb_tile_t
#define STEM_CALLBACK_CONTEXT_ALIGN fd_vinyl_align()
#define STEM_CALLBACK_BEFORE_CREDIT before_credit
+#define STEM_CALLBACK_DURING_FRAG during_frag
+#define STEM_CALLBACK_AFTER_FRAG after_frag
#define STEM_CALLBACK_DURING_HOUSEKEEPING during_housekeeping
#define STEM_CALLBACK_METRICS_WRITE metrics_write
#define STEM_CALLBACK_SHOULD_SHUTDOWN should_shutdown
diff --git a/src/discof/accdb/fd_accdb_tile_cache.c b/src/discof/accdb/fd_accdb_tile_cache.c
new file mode 100644
index 00000000000..756e7413dc0
--- /dev/null
+++ b/src/discof/accdb/fd_accdb_tile_cache.c
@@ -0,0 +1,231 @@
+/* fd_accdb_compact is the accdb tile's version of fd_vinyl_compact.
+
+ It is functionally identical except:
+ - Lines use obj_gaddr (ulong) instead of obj (pointer).
+ Resolved via fd_vinyl_data_laddr( gaddr, data->laddr0 ).
+ - Lines use fd_accdb_line_ctl_ref (24-bit ref with CHANCE/EVICTING
+ bits) instead of fd_vinyl_line_ctl_ref (32-bit ref). */
+
+FD_FN_UNUSED static void
+fd_accdb_compact( fd_vinyl_t * vinyl,
+                  ulong        compact_max ) {
+
+  /* Walks the bstream forward from seq_past for at most compact_max
+     iterations.  A pair that the meta map still maps to this exact seq
+     is re-appended (from its cached object when the line ref is >=0,
+     otherwise from the bstream) and the meta element is repointed at
+     the new seq.  Any other pair, and every dead / move / part block,
+     is subtracted from garbage_sz.  On exit the appends are committed,
+     fd_vinyl_io_forget is called with the final seq and
+     vinyl->garbage_sz is written back. */
+
+  fd_vinyl_io_t * io        = vinyl->io;
+  ulong           gc_thresh = vinyl->gc_thresh;
+  int             gc_eager  = vinyl->gc_eager;
+  int             style     = vinyl->style;
+
+  ulong io_seed     = fd_vinyl_io_seed       ( io ); (void)io_seed;
+  ulong seq_past    = fd_vinyl_io_seq_past   ( io );
+  ulong seq_present = fd_vinyl_io_seq_present( io );
+
+  /* Bail if no work was requested, the past region is not larger than
+     gc_thresh, or gc_eager is negative. */
+
+  if( FD_UNLIKELY( (!compact_max) | ((seq_present-seq_past)<=gc_thresh) | (gc_eager<0) ) ) return;
+
+  fd_vinyl_meta_t * meta       = vinyl->meta;
+  fd_vinyl_line_t * line       = vinyl->line;
+  ulong             line_cnt   = vinyl->line_cnt;
+  ulong             garbage_sz = vinyl->garbage_sz;
+
+  fd_vinyl_meta_ele_t * ele0      = meta->ele;
+  ulong                 ele_max   = meta->ele_max;
+  ulong                 meta_seed = meta->seed;
+
+  fd_vinyl_data_t * data = vinyl->data;
+
+  fd_vinyl_data_vol_t * vol     = data->vol;     (void)vol;
+  ulong                 vol_cnt = data->vol_cnt; (void)vol_cnt;
+
+  void * data_laddr0 = data->laddr0;
+
+  ulong seq = seq_past;
+
+  for( ulong rem=compact_max; rem; rem-- ) {
+
+    /* Stop once the remaining past is within gc_thresh, the garbage
+       fraction is low enough, or seq has reached seq_present. */
+
+    ulong past_sz_new = fd_vinyl_io_seq_future( io ) - seq;
+    if( FD_UNLIKELY( (past_sz_new <= gc_thresh                  ) |
+                     (garbage_sz  <= (past_sz_new >> gc_eager)  ) |
+                     (fd_vinyl_seq_ge( seq, seq_present )       ) ) ) {
+      FD_CRIT( fd_vinyl_seq_le( seq, seq_present ), "corruption detected" );
+      if( FD_UNLIKELY( fd_vinyl_seq_eq( seq, seq_present ) ) ) FD_CRIT( !garbage_sz, "corruption detected" );
+      break;
+    }
+
+    fd_vinyl_bstream_block_t block[1];
+
+    fd_vinyl_io_read_imm( io, seq, block, FD_VINYL_BSTREAM_BLOCK_SZ );
+
+    ulong ctl = block->ctl;
+
+    int type = fd_vinyl_bstream_ctl_type( ctl );
+
+    switch( type ) {
+
+    case FD_VINYL_BSTREAM_CTL_TYPE_PAIR: {
+
+      int                    pair_style   = fd_vinyl_bstream_ctl_style( ctl );
+      ulong                  pair_val_esz = fd_vinyl_bstream_ctl_sz   ( ctl );
+      fd_vinyl_key_t const * pair_key     = &block->phdr.key;
+      ulong                  pair_val_sz  = (ulong)block->phdr.info.val_sz;
+
+      ulong pair_sz = fd_vinyl_bstream_pair_sz( pair_val_esz );
+
+      int truncated = (pair_sz > (seq_present - seq)); /* Wrapping safe */
+      int bad_esz   = (pair_val_esz > FD_VINYL_VAL_MAX);
+      int bad_sz    = (pair_val_sz  > FD_VINYL_VAL_MAX);
+
+      FD_CRIT( !(truncated | bad_esz | bad_sz), truncated ? "truncated pair" :
+                                                bad_esz   ? "unexpected pair value encoded size" :
+                                                            "pair value size too large" );
+
+#     if FD_PARANOID
+      fd_vinyl_bstream_block_t   _ftr[1];
+      fd_vinyl_bstream_block_t * ftr = _ftr;
+
+      if( FD_UNLIKELY( pair_sz <= FD_VINYL_BSTREAM_BLOCK_SZ ) ) ftr = block;
+      else fd_vinyl_io_read_imm( io, seq + pair_sz - FD_VINYL_BSTREAM_BLOCK_SZ, ftr, FD_VINYL_BSTREAM_BLOCK_SZ );
+
+      FD_ALERT( !fd_vinyl_bstream_pair_test_fast( io_seed, seq, block, ftr ), "corruption detected" );
+#     endif
+
+      ulong pair_memo = fd_vinyl_key_memo( meta_seed, pair_key );
+
+      ulong _ele_idx; /* avoid pointer escape */
+      int   err = fd_vinyl_meta_query_fast( ele0, ele_max, pair_key, pair_memo, &_ele_idx );
+      ulong ele_idx = _ele_idx;
+
+      if( FD_LIKELY( !err ) ) {
+
+        if( FD_LIKELY( fd_vinyl_meta_ele_in_bstream( &ele0[ ele_idx ] ) ) ) {
+
+          ulong pair_seq = ele0[ ele_idx ].seq;
+
+          if( FD_LIKELY( fd_vinyl_seq_eq( pair_seq, seq ) ) ) {
+
+            FD_CRIT( !memcmp( &ele0[ ele_idx ].phdr, &block->phdr, sizeof(fd_vinyl_bstream_phdr_t) ), "corruption detected" );
+
+            int   pair_style_new;
+            ulong pair_val_esz_new;
+            ulong pair_seq_new;
+
+            int do_copy = 1;
+
+            ulong line_idx = ele0[ ele_idx ].line_idx;
+
+            if( FD_LIKELY( line_idx!=ULONG_MAX ) ) { /* Pair is in cache */
+
+              /* NOTE(review): the next seven lines were reconstructed
+                 from a single line that lost a "<...>" span during
+                 extraction.  line_cnt, data_laddr0, vol and vol_cnt are
+                 otherwise unused in this function, which is what the
+                 reconstruction is based on; the ele_idx check and the
+                 fd_vinyl_data_is_valid_obj call are presumed -- confirm
+                 against the original commit. */
+
+              FD_CRIT( line_idx<line_cnt,                 "corruption detected" );
+              FD_CRIT( line[ line_idx ].ele_idx==ele_idx, "corruption detected" );
+
+              fd_vinyl_data_obj_t * obj = fd_vinyl_data_laddr( line[ line_idx ].obj_gaddr, data_laddr0 );
+
+              FD_ALERT( fd_vinyl_data_is_valid_obj( obj, vol, vol_cnt ), "corruption detected" );
+              FD_CRIT ( obj->line_idx==line_idx, "corruption detected" );
+              FD_CRIT ( !obj->rd_active, "corruption detected" );
+
+              ulong line_ctl = line[ line_idx ].ctl;
+
+              if( FD_LIKELY( fd_accdb_line_ctl_ref( line_ctl )>=0L ) ) { /* Pair cached and not acquired for modify */
+
+                fd_vinyl_bstream_phdr_t * phdr = fd_vinyl_data_obj_phdr( obj );
+
+                FD_ALERT( !memcmp( phdr, &block->phdr, sizeof(fd_vinyl_bstream_phdr_t) ), "corruption detected" );
+
+                pair_seq_new = fd_vinyl_io_append_pair_inplace( io, style, phdr, &pair_style_new, &pair_val_esz_new );
+
+                do_copy = 0;
+
+              }
+
+            }
+
+            if( do_copy ) { /* Pair is either not in cache or acquired for modify, append from the bstream */
+
+              if( FD_LIKELY( (pair_style!=FD_VINYL_BSTREAM_CTL_STYLE_RAW) |
+                             (style     ==FD_VINYL_BSTREAM_CTL_STYLE_RAW) |
+                             (pair_sz   ==FD_VINYL_BSTREAM_BLOCK_SZ     ) ) ) {
+
+                /* No re-encode: copy the pair bytes as they are. */
+
+                pair_style_new   = pair_style;
+                pair_val_esz_new = fd_vinyl_bstream_ctl_sz( ele0[ ele_idx ].phdr.ctl );
+                pair_seq_new     = fd_vinyl_io_copy( io, pair_seq, pair_sz );
+
+              } else {
+
+                /* Raw pair and a non-raw target style: read the pair
+                   into io scratch and append it in place with the
+                   target style. */
+
+                ulong cpair_max   = fd_vinyl_bstream_pair_sz( (ulong)LZ4_COMPRESSBOUND( (int)pair_val_sz ) );
+                ulong scratch_max = cpair_max + pair_sz;
+
+                fd_vinyl_bstream_phdr_t * cphdr = (fd_vinyl_bstream_phdr_t *)
+                  fd_vinyl_io_alloc( io, scratch_max, FD_VINYL_IO_FLAG_BLOCKING );
+
+                fd_vinyl_bstream_phdr_t * phdr = (fd_vinyl_bstream_phdr_t *)((ulong)cphdr + cpair_max);
+
+                fd_vinyl_io_read_imm( io, seq, phdr, pair_sz );
+
+                fd_vinyl_io_trim( io, scratch_max );
+
+                pair_seq_new = fd_vinyl_io_append_pair_inplace( io, style, phdr, &pair_style_new, &pair_val_esz_new );
+
+                if( FD_UNLIKELY( pair_style_new==FD_VINYL_BSTREAM_CTL_STYLE_RAW ) ) io->spad_used += scratch_max;
+
+              }
+            }
+
+            /* Repoint the meta element at the re-appended pair. */
+
+            ele0[ ele_idx ].phdr.ctl = fd_vinyl_bstream_ctl( FD_VINYL_BSTREAM_CTL_TYPE_PAIR, pair_style_new, pair_val_esz_new );
+            ele0[ ele_idx ].seq      = pair_seq_new;
+
+          } else {
+
+            /* Meta points at a different (later) seq for this key, so
+               the pair at seq is garbage. */
+
+            FD_CRIT( fd_vinyl_seq_gt( pair_seq, seq ), "corruption detected" );
+
+            garbage_sz -= pair_sz;
+
+          }
+
+        } else {
+
+          garbage_sz -= pair_sz;
+
+        }
+
+      } else {
+
+        garbage_sz -= pair_sz;
+
+      }
+
+      seq += pair_sz;
+      break;
+
+    }
+
+    case FD_VINYL_BSTREAM_CTL_TYPE_DEAD:
+    case FD_VINYL_BSTREAM_CTL_TYPE_MOVE:
+    case FD_VINYL_BSTREAM_CTL_TYPE_PART: {
+
+      FD_ALERT( !fd_vinyl_bstream_block_test( io_seed, block ), "corruption detected" );
+
+      garbage_sz -= FD_VINYL_BSTREAM_BLOCK_SZ;
+      seq        += FD_VINYL_BSTREAM_BLOCK_SZ;
+      break;
+
+    }
+
+    case FD_VINYL_BSTREAM_CTL_TYPE_ZPAD: {
+
+      /* Zero padding advances seq but is not subtracted from
+         garbage_sz. */
+
+      FD_ALERT( !fd_vinyl_bstream_zpad_test( io_seed, seq, block ), "corruption detected" );
+
+      seq += FD_VINYL_BSTREAM_BLOCK_SZ;
+      break;
+
+    }
+
+    default: FD_LOG_CRIT(( "%016lx: unknown type (%x)", seq, (uint)type ));
+
+    }
+
+  }
+
+  fd_vinyl_io_commit( io, FD_VINYL_IO_FLAG_BLOCKING );
+  fd_vinyl_io_forget( io, seq );
+
+  vinyl->garbage_sz = garbage_sz;
+}
diff --git a/src/discof/accdb/fd_accdb_tile_private.h b/src/discof/accdb/fd_accdb_tile_private.h
new file mode 100644
index 00000000000..623ae0ac95d
--- /dev/null
+++ b/src/discof/accdb/fd_accdb_tile_private.h
@@ -0,0 +1,236 @@
+#ifndef HEADER_fd_src_discof_accdb_fd_accdb_tile_private_h
+#define HEADER_fd_src_discof_accdb_fd_accdb_tile_private_h
+
+#include "../../vinyl/fd_vinyl.h"
+#include "../../funk/fd_funk.h"
+#include "../../util/io_uring/fd_io_uring.h"
+#include "fd_accdb_line_ctl.h"
+
+/* fd_accdb_line_ctl_clear swaps the control word of line[line_idx]
+   for fd_accdb_line_ctl( ver+1, new_ref ), where ver is the version
+   found in the word being replaced.  In one atomic step this bumps
+   the version, clears EVICTING and CHANCE, and sets ref to new_ref.
+
+   The swap is a compare-and-swap that is retried until it lands, so
+   a specreader FETCH_AND_ADD / FETCH_AND_SUB that modifies the word
+   between the load and the swap only costs another attempt (such a
+   reader bails on EVICTING and SUBs back immediately). */
+
+static inline void
+fd_accdb_line_ctl_clear( fd_vinyl_line_t * line,
+                         ulong             line_idx,
+                         long              new_ref ) {
+  fd_vinyl_line_t * slot = line + line_idx;
+  ulong             seen = FD_VOLATILE_CONST( slot->ctl );
+  for(;;) {
+    ulong want = fd_accdb_line_ctl( fd_accdb_line_ctl_ver( seen )+1UL, new_ref );
+    if( FD_LIKELY( FD_ATOMIC_CAS( &slot->ctl, seen, want )==seen ) ) break;
+    FD_SPIN_PAUSE(); /* word changed under us: back off, reload, retry */
+    seen = FD_VOLATILE_CONST( slot->ctl );
+  }
+}
+
+#define FD_VINYL_CLIENT_MAX (1024UL)
+#define FD_VINYL_REQ_MAX (1024UL)
+
+/* fd_vinyl_client is the state a vinyl instance keeps for one of its
+   clients: the request / completion channels, the receive cursor,
+   the address translation window and the acquisition quota. */
+
+struct fd_vinyl_client {
+
+  /* Channel for requests from this client (could be shared by
+     multiple vinyl instances) */
+  fd_vinyl_rq_t * rq;
+
+  /* Channel for completions from this client to this vinyl instance
+     (could be shared by multiple receivers of completions from this
+     vinyl instance). */
+  fd_vinyl_cq_t * cq;
+
+  /* Max number of requests to receive from this client at a time */
+  ulong burst_max;
+
+  /* Sequence number of the next request to receive in the rq */
+  ulong seq;
+
+  /* Identifies requests from this client to this vinyl instance in
+     the rq */
+  ulong link_id;
+
+  /* A valid non-zero gaddr from this client maps to the vinyl
+     instance's laddr laddr0 + gaddr and thus is in (laddr0,laddr1).
+     A zero gaddr maps to laddr NULL. */
+  ulong laddr0;
+  ulong laddr1;
+
+  /* Num of remaining acquisitions this client is allowed on this
+     vinyl instance, and the max quota */
+  ulong quota_rem;
+  ulong quota_max;
+};
+
+typedef struct fd_vinyl_client fd_vinyl_client_t;
+
+struct fd_accdb_tile { /* Private state of the accdb tile: funk and
+ vinyl handles, bstream / io_uring I/O state, client and
+ request tables, garbage accounting, the cache CLOCK hand
+ and the state used to root funk transactions into vinyl. */
+
+ fd_funk_t funk[1]; /* Funk handle; rooting reads its txns and records */
+
+ /* Vinyl objects */
+
+ fd_vinyl_t vinyl[1];
+ void * io_mem;
+
+ /* Tile architecture */
+
+ uint booted : 1;
+ uint shutdown : 1;
+ struct {
+ ulong state_expected;
+ ulong volatile const * state;
+ ulong volatile const * pair_cnt;
+ /* When booting from genesis only */
+ struct {
+ ulong io_seed;
+ } from_genesis;
+ } boot;
+
+ /* I/O */
+
+ int bstream_fd;
+ ulong bstream_file_sz;
+
+ /* io_uring */
+
+ fd_io_uring_t ring[1];
+ void * ioring_shmem; /* shared between kernel and user */
+
+ /* Clients */
+
+ fd_vinyl_client_t _client[ FD_VINYL_CLIENT_MAX ];
+ ulong client_cnt;
+ ulong client_idx;
+
+ /* Received requests */
+
+ fd_vinyl_req_t _req[ FD_VINYL_REQ_MAX ];
+ ulong req_head; /* Requests [0,req_head) have been processed */
+ ulong req_tail; /* Requests [req_head,req_tail) are pending */
+ /* Requests [req_tail,ULONG_MAX) have not been received */
+ ulong exec_max;
+
+ /* accum_dead_cnt is the number of dead blocks that have been
+ written since the last partition block.
+
+ NOTE(review): this comment used to describe an accum_move_cnt
+ counter (move blocks written since the last partition block) as
+ well, but no such field is declared in this struct.
+
+ accum_garbage_cnt / sz is the number of items / bytes of garbage in
+ the bstream that have accumulated since the last time we compacted
+ the bstream. We use this to estimate the number of rounds of
+ compaction to do in async handling. */
+
+ ulong accum_dead_cnt;
+ ulong accum_garbage_cnt;
+ ulong accum_garbage_sz;
+
+ /* Run loop state */
+
+ ulong seq_part;
+
+ /* Periodic syncing */
+
+ long sync_next_ns;
+
+ /* Vinyl limit on the number of pairs the meta map will accept.
+ Exceeding this limit will trigger a LOG_ERR. */
+ ulong pair_cnt_limit;
+
+ uint clock_hand; /* CLOCK sweep position, in [0,line_cnt) */
+ int root_populate_cache; /* If non-zero, root_batch copies rooted pairs into cache with least priority */
+
+ /* Rooting — the replay tile sends root target xids via stem link.
+ The accdb tile consumes them immediately (after_frag) and walks
+ funk's child list in during_housekeeping to find the oldest unrooted
+ txn, publishing it to vinyl subject to write_delay_slots. */
+
+ fd_funk_txn_t * root_txn; /* txn being rooted, NULL if idle */
+ fd_funk_rec_t * root_rec; /* next rec head for root_batch, NULL if done */
+ ulong root_txn_idx; /* index of root_txn in txn_pool */
+
+ fd_funk_txn_xid_t root_target_xid; /* newest root xid from replay; ul[0]==ULONG_MAX means none received yet */
+ ulong write_delay_slots;
+
+ /* Stem input link for root messages from replay */
+
+ fd_wksp_t * root_in_mem;
+ ulong root_in_chunk0;
+ ulong root_in_wmark;
+
+ /* Scratch for during_frag → after_frag handoff */
+
+ fd_funk_txn_xid_t pending_xid;
+};
+
+typedef struct fd_accdb_tile fd_accdb_tile_t;
+
+FD_PROTOTYPES_BEGIN
+
+/* fd_accdb_clock_evict uses a CLOCK sweep to select and evict a
+   cache line.  Scans from clock_hand (mod line_cnt), giving each
+   unreferenced line with chance==1 a "second chance" (clearing chance
+   to 0) and spinning until it finds an unreferenced line with
+   chance==0 that it can claim via CAS.  Frees the data obj,
+   disconnects meta, and bumps the version inline.  Returns the
+   evicted line_idx.
+
+   The control word is sampled with FD_VOLATILE_CONST on every probe
+   so the sweep always observes fresh values while it spins.
+
+   Evictions are traced at DEBUG level only.  This path runs whenever
+   a populated line is reclaimed, and FD_LOG_ERR would terminate the
+   tile on the first such eviction. */
+
+static inline ulong
+fd_accdb_clock_evict( fd_accdb_tile_t *     ctx,
+                      fd_vinyl_line_t *     line,
+                      ulong                 line_cnt,
+                      fd_vinyl_meta_ele_t * ele0,
+                      ulong                 ele_max,
+                      fd_vinyl_data_t *     data ) {
+  uint hand = ctx->clock_hand;
+
+  for(;;) {
+
+    ulong hand_ctl = FD_VOLATILE_CONST( line[ hand ].ctl );
+
+    if( FD_LIKELY( !fd_accdb_line_ctl_ref( hand_ctl ) ) ) {
+
+      if( FD_UNLIKELY( hand_ctl & FD_ACCDB_LINE_CTL_CHANCE ) ) {
+        /* Second chance: strip CHANCE and move on */
+        FD_ATOMIC_FETCH_AND_AND( &line[ hand ].ctl,
+                                 ~FD_ACCDB_LINE_CTL_CHANCE );
+        hand = (uint)((hand+1U<(uint)line_cnt) ? hand+1U : 0U);
+        continue;
+      }
+
+      /* Try to claim for eviction via CAS.  CAS proves ref==0 at
+         this instant. */
+      if( FD_LIKELY( FD_ATOMIC_CAS( &line[ hand ].ctl,
+                                    hand_ctl,
+                                    hand_ctl | FD_ACCDB_LINE_CTL_EVICTING )==hand_ctl ) ) {
+
+        /* Drain any specread pins that raced with the EVICTING CAS.
+           A specread that did FETCH_AND_ADD after our CAS will see
+           EVICTING in old_ctl and immediately FETCH_AND_SUB back.
+           We must wait for that SUB to land before ctl_clear, which
+           would otherwise capture the transient +1 ref in its own
+           CAS and leave ref at -1 after the specread's SUB. */
+        while( FD_UNLIKELY( fd_accdb_line_ctl_ref(
+                 FD_VOLATILE_CONST( line[ hand ].ctl ) ) > 0L ) ) {
+          FD_SPIN_PAUSE();
+        }
+
+        break;
+      }
+    }
+
+    hand = (uint)((hand+1U<(uint)line_cnt) ? hand+1U : 0U);
+  }
+
+  /* Next sweep starts one past the line just claimed */
+  ctx->clock_hand = (uint)((hand+1U<(uint)line_cnt) ? hand+1U : 0U);
+
+  /* Evict: free data obj, disconnect meta */
+
+  void * data_laddr0 = data->laddr0;
+  ulong  obj_gaddr   = line[ hand ].obj_gaddr;
+  ulong  ele_idx     = line[ hand ].ele_idx;
+
+  if( FD_LIKELY( obj_gaddr ) ) {
+    FD_LOG_DEBUG(( "evicting obj_gaddr=%lu", obj_gaddr ));
+    fd_vinyl_data_obj_t * obj = fd_vinyl_data_laddr( obj_gaddr, data_laddr0 );
+    FD_CRIT( obj->line_idx==(ulong)hand, "corruption detected" );
+    FD_CRIT( !obj->rd_active, "corruption detected" );
+    fd_vinyl_data_free( data, obj );
+    line[ hand ].obj_gaddr = 0UL;
+  }
+
+  if( FD_LIKELY( ele_idxrec_pool->ele );
+  ulong volatile * vl = &funk->rec_lock[ rec_idx ];
+  ulong const ver_lock = FD_VOLATILE_CONST( *vl );
+  ulong const ver = fd_funk_rec_ver_bits ( ver_lock );
+  ulong const lock = fd_funk_rec_lock_bits( ver_lock );
+  if( FD_UNLIKELY( lock ) ) {
+    /* Active readers — yield to caller */
+    return ULONG_MAX;
+  }
+  ulong const new_ver = fd_funk_rec_ver_inc( ver );
+  ulong const new_vl = fd_funk_rec_ver_lock( new_ver, FD_FUNK_REC_LOCK_MASK );
+  if( FD_UNLIKELY( FD_ATOMIC_CAS( vl, ver_lock, new_vl )!=ver_lock ) ) {
+    /* CAS failed (race with another lock operation) — yield to caller */
+    return ULONG_MAX;
+  }
+  return new_vl;
+}
+
+/* fd_funk_rec_admin_unlock releases the admin lock on rec.  It
+   stores into rec's slot of funk->rec_lock a word that keeps the
+   version bits of ver_lock and has zero lock bits.  ver_lock is the
+   value obtained from fd_funk_rec_admin_lock. */
+
+static void
+fd_funk_rec_admin_unlock( fd_funk_t const * funk,
+                          fd_funk_rec_t *   rec,
+                          ulong             ver_lock ) {
+  ulong unlocked = fd_funk_rec_ver_lock( fd_funk_rec_ver_bits( ver_lock ), 0UL );
+  ulong slot     = (ulong)( rec - funk->rec_pool->ele );
+  FD_VOLATILE( funk->rec_lock[ slot ] ) = unlocked;
+}
+
+/* funk_free_rec_locked frees a funk record that already has the admin
+ lock held (ver_lock from fd_funk_rec_admin_lock). In order it
+ zeroes rec->pair, resets rec->map_next to the null index, calls
+ fd_funk_val_flush on rec (presumably releasing its value buffer --
+ confirm against fd_funk_val.h), drops the admin lock and finally
+ releases rec back to funk->rec_pool. The caller is expected to
+ have unlinked rec from the record map beforehand (see the callers
+ in this file). */
+
+static void
+funk_free_rec_locked( fd_funk_t * funk,
+ fd_funk_rec_t * rec,
+ ulong ver_lock ) {
+ memset( &rec->pair, 0, sizeof(fd_funk_xid_key_pair_t) ); /* wipe the xid/key identity first */
+ FD_COMPILER_MFENCE(); /* compiler barrier: keep the wipe ahead of the stores below */
+ rec->map_next = FD_FUNK_REC_IDX_NULL;
+ fd_funk_val_flush( rec, funk->alloc, funk->wksp );
+ fd_funk_rec_admin_unlock( funk, rec, ver_lock ); /* lock is dropped before the record goes back to the pool */
+ fd_funk_rec_pool_release( funk->rec_pool, rec, 1 );
+}
+
+/* funk_free_rec tries to take the admin lock on rec and, if that
+   works, frees it via funk_free_rec_locked.  Returns 0 when the
+   record was freed and 1 when the lock was not available (active
+   readers, or a lost race with another lock operation); in the
+   latter case nothing was changed and the caller should retry
+   later. */
+
+static int
+funk_free_rec( fd_funk_t *     funk,
+               fd_funk_rec_t * rec ) {
+  FD_COMPILER_MFENCE();
+  ulong held      = fd_funk_rec_admin_lock( funk, rec );
+  int   contended = (held==ULONG_MAX);
+  if( FD_LIKELY( !contended ) ) funk_free_rec_locked( funk, rec, held );
+  return contended;
+}
+
+/* funk_gc_chain optimistically deletes all but the newest rooted
+   revision of the account keyed by rec.  A revision counts as rooted
+   when the slot in its xid (xid->ul[0]) is <= root_slot.  The first
+   such revision met on the hash chain is treated as the newest and
+   kept; every later one is freed if its admin lock can be taken.
+   This possibly deletes 'rec' itself.  Returns rec if rec is the
+   revision that was kept, otherwise returns NULL.  Note that due to
+   edge cases, revisions that are not in the oldest tracked slot may
+   not reliably get cleaned up.  (The oldest tracked slot always gets
+   cleaned up, though.)
+
+   rec's key and slot are copied before the walk.  Freeing a node
+   zeroes its pair and hands it back to the pool, so if the freed node
+   is rec, the remaining iterations must not read the released record
+   (its key would read as zero, or as whatever the next owner wrote).
+
+   The rec_map chain holding rec's key is locked for the duration of
+   the walk and unlocked with an updated element count at the end. */
+
+static fd_funk_rec_t *
+funk_gc_chain( ulong                 root_slot,
+               fd_funk_t *           funk,
+               fd_funk_rec_t * const rec ) {
+
+  fd_funk_rec_t * rec_pool  = funk->rec_pool->ele;
+  ulong           rec_max   = funk->rec_pool->ele_max;
+  ulong           seed      = funk->rec_map->map->seed;
+  ulong           chain_cnt = funk->rec_map->map->chain_cnt;
+
+  /* Private copy of rec's identity; stays valid even if rec is freed
+     below */
+
+  fd_funk_xid_key_pair_t pair     = rec->pair;
+  ulong                  rec_slot = pair.xid->ul[0];
+
+  ulong hash      = fd_funk_rec_map_key_hash( &pair, seed );
+  ulong chain_idx = (hash & (chain_cnt-1UL) );
+
+  /* Lock rec_map chain */
+
+  int lock_err = fd_funk_rec_map_iter_lock( funk->rec_map, &chain_idx, 1UL, FD_MAP_FLAG_BLOCKING );
+  if( FD_UNLIKELY( lock_err!=FD_MAP_SUCCESS ) ) {
+    FD_LOG_CRIT(( "fd_funk_rec_map_iter_lock failed (%i-%s)", lock_err, fd_map_strerror( lock_err ) ));
+  }
+
+  fd_funk_rec_map_shmem_private_chain_t * chain =
+    fd_funk_rec_map_shmem_private_chain( funk->rec_map->map, 0UL ) + chain_idx;
+  ulong ver =
+    fd_funk_rec_map_private_vcnt_ver( FD_VOLATILE_CONST( chain->ver_cnt ) );
+  FD_CRIT( ver&1UL, "chain is not locked" );
+
+  /* Walk map chain.  pnext always points at the link that leads to
+     the node under inspection, so a freed node is unlinked by
+     storing its successor there. */
+
+  fd_funk_rec_t * found_rec = NULL;
+  uint *          pnext     = &chain->head_cidx;
+  uint            cur       = *pnext;
+  ulong           chain_len = 0UL;
+  ulong           iter      = 0UL;
+  while( cur!=FD_FUNK_REC_IDX_NULL ) {
+    if( FD_UNLIKELY( iter++ > rec_max ) ) FD_LOG_CRIT(( "cycle detected in rec_map chain %lu", chain_idx ));
+
+    /* Is this node garbage? */
+
+    fd_funk_rec_t * node = &rec_pool[ cur ];
+    if( FD_UNLIKELY( cur==node->map_next ) ) FD_LOG_CRIT(( "accdb corruption detected: cycle in rec_map chain %lu", chain_idx ));
+    cur = node->map_next; /* advance now; node may be freed below */
+    if( !fd_funk_rec_key_eq( pair.key, node->pair.key ) ) goto retain;
+    if( node->pair.xid->ul[0]>root_slot ) goto retain;
+    if( !found_rec ) {
+      found_rec = node;
+      goto retain;
+    }
+
+    /* No longer need this node */
+
+    if( node->pair.xid->ul[0] > rec_slot ) {
+      /* If this node is newer than the to-be-deleted slot, need to
+         remove it from the transaction's record list. */
+      uint neigh_prev = node->prev_idx;
+      uint neigh_next = node->next_idx;
+      if( neigh_prev==FD_FUNK_REC_IDX_NULL ||
+          neigh_next==FD_FUNK_REC_IDX_NULL ) {
+        /* Node is first or last of transaction -- too bothersome to
+           remove it from the transaction's record list */
+        goto retain;
+      }
+      rec_pool[ neigh_next ].prev_idx = neigh_prev;
+      rec_pool[ neigh_prev ].next_idx = neigh_next;
+    }
+
+    /* Destroy this node (skip if lock is contended — will retry
+       on the next root batch) */
+
+    if( FD_UNLIKELY( funk_free_rec( funk, node ) ) ) goto retain;
+    *pnext = cur;
+    continue;
+
+  retain:
+    pnext = &node->map_next;
+    chain_len++;
+  }
+
+  /* Unlock rec_map chain */
+
+  FD_COMPILER_MFENCE();
+  FD_VOLATILE( chain->ver_cnt ) =
+    fd_funk_rec_map_private_vcnt( ver+1UL, chain_len );
+  FD_COMPILER_MFENCE();
+  return found_rec==rec ? found_rec : NULL;
+}
+
+/* accdb_invalidate_line drops cache line line_idx, which currently
+   caches meta element ele_idx.  It raises EVICTING on the line, and
+   if no specread pin is outstanding it frees the line's data obj,
+   severs the line <-> meta link in both directions and bumps the
+   line version (clearing EVICTING) with ref 0.
+
+   Returns 0 on success.  Returns 1 if specread refs are still
+   active; EVICTING is taken back down in that case and nothing else
+   is touched, so the caller should retry later.  Caller must be the
+   vinyl tile (single writer). */
+
+static int
+accdb_invalidate_line( fd_vinyl_line_t *     line,
+                       fd_vinyl_meta_ele_t * ele0,
+                       fd_vinyl_data_t *     data,
+                       ulong                 line_idx,
+                       ulong                 ele_idx ) {
+
+  fd_vinyl_line_t * victim = line + line_idx;
+
+  /* A negative ref means a vinyl client holds the line for modify,
+     which must never be invalidated.  Positive refs are transient
+     specread pins and are dealt with below. */
+  FD_CRIT( fd_accdb_line_ctl_ref( line[ line_idx ].ctl ) >= 0L,
+           "cannot invalidate line acquired for modify" );
+
+  /* From here on new specreaders observe EVICTING and bail */
+  FD_ATOMIC_FETCH_AND_OR( &victim->ctl, FD_ACCDB_LINE_CTL_EVICTING );
+
+  /* Pins taken before EVICTING went up may still be live.  Do not
+     wait for them: undo the flag and let the caller get on with
+     servicing requests. */
+  int pinned = fd_accdb_line_ctl_ref( FD_VOLATILE_CONST( victim->ctl ) ) > 0L;
+  if( FD_UNLIKELY( pinned ) ) {
+    FD_ATOMIC_FETCH_AND_AND( &victim->ctl, ~FD_ACCDB_LINE_CTL_EVICTING );
+    return 1;
+  }
+
+  /* Release the cached object, if there is one */
+  ulong cached_gaddr = victim->obj_gaddr;
+  if( FD_LIKELY( cached_gaddr ) ) {
+    fd_vinyl_data_free( data, fd_vinyl_data_laddr( cached_gaddr, data->laddr0 ) );
+    victim->obj_gaddr = 0UL;
+  }
+
+  /* Sever the meta -> line and line -> meta links */
+  ele0[ ele_idx ].line_idx = ULONG_MAX;
+  victim->ele_idx          = ULONG_MAX;
+
+  /* New version, EVICTING cleared, ref 0 */
+  fd_accdb_line_ctl_clear( line, line_idx, 0L );
+  return 0;
+}
+
+/* accdb_populate_line places a copy of a pair (key, info, val of
+   val_sz bytes) into the cache.  It reclaims a line through
+   fd_accdb_clock_evict, allocates a data object sized for val_sz,
+   attaches the object to the reclaimed line on behalf of meta element
+   ele_idx and writes a RAW-style pair header followed by the value.
+
+   The CHANCE bit is not set, so the line has the least eviction
+   priority and the CLOCK sweep will reclaim it first.
+
+   Returns the index of the line now holding the pair; the caller
+   must store it in ele0[ele_idx].line_idx.  Returns ULONG_MAX if the
+   data allocation fails, in which case the evicted line is left
+   disconnected. */
+
+static ulong
+accdb_populate_line( fd_accdb_tile_t *       ctx,
+                     fd_vinyl_line_t *       line,
+                     ulong                   line_cnt,
+                     fd_vinyl_meta_ele_t *   ele0,
+                     ulong                   ele_max,
+                     fd_vinyl_data_t *       data,
+                     ulong                   ele_idx,
+                     fd_vinyl_key_t const *  key,
+                     fd_vinyl_info_t const * info,
+                     void const *            val,
+                     ulong                   val_sz ) {
+
+  void * base = data->laddr0;
+
+  /* Make room */
+  ulong slot = fd_accdb_clock_evict( ctx, line, line_cnt, ele0, ele_max, data );
+
+  /* Get storage for the pair */
+  fd_vinyl_data_obj_t * obj = fd_vinyl_data_alloc( data, fd_vinyl_data_szc( val_sz ) );
+  if( FD_UNLIKELY( !obj ) ) return ULONG_MAX;
+
+  /* Attach object <-> line */
+  fd_vinyl_line_t * dst = line + slot;
+  dst->obj_gaddr = fd_vinyl_data_gaddr( obj, base );
+  dst->ele_idx   = ele_idx;
+  obj->line_idx  = slot;
+  obj->rd_active = (short)0;
+
+  /* Header first, then the payload */
+  fd_vinyl_bstream_phdr_t * hdr = fd_vinyl_data_obj_phdr( obj );
+  hdr->ctl  = fd_vinyl_bstream_ctl( FD_VINYL_BSTREAM_CTL_TYPE_PAIR,
+                                    FD_VINYL_BSTREAM_CTL_STYLE_RAW,
+                                    val_sz );
+  hdr->key  = *key;
+  hdr->info = *info;
+  fd_memcpy( fd_vinyl_data_obj_val( obj ), val, val_sz );
+
+  return slot;
+}
+
+fd_funk_rec_t *
+fd_accdb_v2_root_batch( fd_accdb_tile_t * accdb,
+ fd_funk_rec_t * rec0 ) { /* fd_accdb_v2_root_batch migrates one batch of
+ funk records, starting at rec0, into vinyl. Records are taken off the
+ list headed by rec0 (at most FD_ACCDB_ROOT_BATCH_MAX fit in recs[]),
+ run through funk_gc_chain, and each survivor is written to the bstream
+ as a pair (non-zero lamports) or as a dead block (zero lamports) with
+ the vinyl meta map and, optionally, the cache updated to match. The
+ appended blocks are committed and the funk records are then removed
+ and freed. Records that hit contention (specread pins on the cache
+ line, or a busy funk admin lock) are chained back onto the remainder.
+ Returns the head of the records still to be processed by a later call
+ (NULL once none are left). */
+ fd_funk_t * funk = accdb->funk;
+ fd_wksp_t * funk_wksp = funk->wksp; /* shm workspace containing unrooted accounts */
+ fd_funk_rec_t * rec_pool = funk->rec_pool->ele; /* funk rec arena */
+
+ fd_vinyl_t * vinyl = accdb->vinyl;
+ fd_vinyl_io_t * io = vinyl->io;
+ fd_vinyl_meta_t * meta = vinyl->meta;
+ fd_vinyl_line_t * line = vinyl->line;
+ fd_vinyl_data_t * data = vinyl->data;
+
+ fd_vinyl_meta_ele_t * ele0 = meta->ele;
+ ulong ele_max = meta->ele_max;
+ ulong meta_seed = meta->seed;
+ ulong * lock = meta->lock;
+ int lock_shift = meta->lock_shift;
+ ulong line_cnt = vinyl->line_cnt;
+
+ ulong append_cnt = 0UL; /* number of bstream blocks appended by this call */
+ ulong root_slot = funk->shmem->last_publish->ul[0]; /* slot of the last published xid */
+
+ /* Collect funk request batch */
+
+ fd_funk_rec_t * recs[ FD_ACCDB_ROOT_BATCH_MAX ];
+ ulong rec_cnt;
+
+ fd_funk_rec_t * next = rec0;
+ for( rec_cnt=0UL; next && rec_cntnext_idx ) ) {
+ next = NULL;
+ } else {
+ next = &rec_pool[ cur->next_idx ];
+ }
+ cur->prev_idx = FD_FUNK_REC_IDX_NULL;
+ cur->next_idx = FD_FUNK_REC_IDX_NULL;
+
+ if( funk_gc_chain( root_slot, funk, cur ) ) { /* non-NULL: cur is the revision to keep and migrate */
+ recs[ rec_cnt++ ] = cur;
+ }
+ }
+
+ for( ulong i=0UL; ival_sz>=sizeof(fd_account_meta_t), "corrupt funk_rec" );
+
+ fd_vinyl_key_t const * key =
+ (fd_vinyl_key_t const *)fd_funk_rec_key( recs[ i ] );
+ ulong memo = fd_vinyl_key_memo( meta_seed, key );
+
+ ulong ele_idx;
+ int found = fd_vinyl_meta_query_fast( ele0, ele_max, key, memo,
+ &ele_idx ); /* NOTE(review): despite the name, the branches below treat a
+ zero result as "key present" (an error-code style return) --
+ confirm against fd_vinyl_meta_query_fast */
+
+ if( acct->lamports ) {
+ /* --- Append pair block --- */
+
+ ulong val_sz = (ulong)recs[ i ]->val_sz;
+
+ fd_vinyl_info_t info;
+ memset( &info, 0, sizeof(fd_vinyl_info_t) );
+ info.val_sz = (uint)val_sz;
+
+ if( FD_LIKELY( !found ) ) {
+ /* Existing key — overwrite */
+
+ /* Invalidate cache if cached */
+ ulong cur_line_idx = ele0[ ele_idx ].line_idx;
+ if( FD_LIKELY( cur_line_idx!=ULONG_MAX ) ) {
+ if( FD_UNLIKELY( accdb_invalidate_line( line, ele0, data, cur_line_idx, ele_idx ) ) )
+ goto skip_rec;
+ }
+
+ /* Garbage accounting for old pair */
+ ulong val_esz_before =
+ fd_vinyl_bstream_ctl_sz( ele0[ ele_idx ].phdr.ctl );
+ accdb->accum_garbage_cnt++;
+ accdb->accum_garbage_sz +=
+ fd_vinyl_bstream_pair_sz( val_esz_before );
+
+ /* Append new pair to bstream */
+ ulong seq = fd_vinyl_io_append_pair_raw( io, key, &info,
+ (void const *)acct );
+ append_cnt++;
+
+ /* Optionally copy into cache with least eviction priority */
+ ulong new_line_idx = ULONG_MAX;
+ if( FD_LIKELY( accdb->root_populate_cache ) ) {
+ new_line_idx = accdb_populate_line( accdb, line, line_cnt,
+ ele0, ele_max, data,
+ ele_idx, key, &info,
+ (void const *)acct, val_sz );
+ }
+
+ /* Update meta (prepare/publish for existing element) */
+ fd_vinyl_meta_prepare_fast( lock, lock_shift, ele_idx );
+
+ ele0[ ele_idx ].phdr.ctl =
+ fd_vinyl_bstream_ctl( FD_VINYL_BSTREAM_CTL_TYPE_PAIR,
+ FD_VINYL_BSTREAM_CTL_STYLE_RAW,
+ val_sz );
+ ele0[ ele_idx ].phdr.info = info;
+ ele0[ ele_idx ].seq = seq;
+ ele0[ ele_idx ].line_idx = new_line_idx;
+
+ fd_vinyl_meta_publish_fast( lock, lock_shift, ele_idx );
+
+ } else {
+ /* New key — insert */
+
+ /* Append to bstream first (need seq for meta) */
+ ulong seq = fd_vinyl_io_append_pair_raw( io, key, &info,
+ (void const *)acct );
+ append_cnt++;
+
+ /* Optionally copy into cache with least eviction priority */
+ ulong new_line_idx = ULONG_MAX;
+ if( FD_LIKELY( accdb->root_populate_cache ) ) {
+ new_line_idx = accdb_populate_line( accdb, line, line_cnt,
+ ele0, ele_max, data,
+ ele_idx, key, &info,
+ (void const *)acct, val_sz );
+ }
+
+ /* Insert into meta at the empty slot. Per meta.h safety tip:
+ "Inserting without doing a prepare is fine so long as
+ phdr.ctl becomes visible last." */
+
+ ele0[ ele_idx ].memo = memo;
+ ele0[ ele_idx ].phdr.key = *key;
+ ele0[ ele_idx ].phdr.info = info;
+ ele0[ ele_idx ].seq = seq;
+ ele0[ ele_idx ].line_idx = new_line_idx;
+ FD_COMPILER_MFENCE();
+ ele0[ ele_idx ].phdr.ctl =
+ fd_vinyl_bstream_ctl( FD_VINYL_BSTREAM_CTL_TYPE_PAIR,
+ FD_VINYL_BSTREAM_CTL_STYLE_RAW,
+ val_sz );
+ FD_COMPILER_MFENCE();
+
+ vinyl->pair_cnt++;
+ }
+
+ } else {
+ /* --- Append erase block --- */
+
+ if( FD_LIKELY( !found ) ) {
+ /* Key exists in meta — erase it */
+
+ FD_CRIT( ele0[ ele_idx ].phdr.ctl!=ULONG_MAX,
+ "cannot erase key being created" );
+
+ /* Invalidate cache if cached */
+ ulong cur_line_idx = ele0[ ele_idx ].line_idx;
+ if( FD_LIKELY( cur_line_idx!=ULONG_MAX ) ) {
+ if( FD_UNLIKELY( accdb_invalidate_line( line, ele0, data, cur_line_idx, ele_idx ) ) )
+ goto skip_rec;
+ }
+
+ /* Garbage: old pair + dead block itself */
+ ulong val_esz_before =
+ fd_vinyl_bstream_ctl_sz( ele0[ ele_idx ].phdr.ctl );
+ accdb->accum_garbage_cnt += 2UL;
+ accdb->accum_garbage_sz +=
+ fd_vinyl_bstream_pair_sz( val_esz_before )
+ + FD_VINYL_BSTREAM_BLOCK_SZ;
+
+ fd_vinyl_io_append_dead( io, &ele0[ ele_idx ].phdr, NULL, 0UL );
+ append_cnt++;
+ accdb->accum_dead_cnt++;
+
+ /* Remove from meta (handles its own locking) */
+ fd_vinyl_meta_remove_fast( ele0, ele_max, lock, lock_shift,
+ line, line_cnt, ele_idx );
+ vinyl->pair_cnt--;
+ }
+ /* else: erase of non-existent key — no-op */
+ }
+ continue;
+
+ skip_rec:
+ /* Cache line has active specread refs — re-chain record for
+ next batch. The vinyl write will be harmlessly re-done. */
+ FD_LOG_NOTICE(( "vinyl data contention" ));
+ recs[ i ]->next_idx = next ? (uint)(ulong)( next - rec_pool ) : FD_FUNK_REC_IDX_NULL;
+ next = recs[ i ];
+ recs[ i ] = NULL; /* marks this slot as already re-chained */
+ }
+
+ /* Commit result */
+ if( FD_LIKELY( append_cnt ) ) {
+ fd_vinyl_io_commit( io, FD_VINYL_IO_FLAG_BLOCKING );
+ }
+
+ /* Remove funk records. Try admin lock first — if contended
+ (active readers), skip the record and re-chain it onto next for
+ the next batch. The vinyl write will be harmlessly re-done. */
+
+ for( ulong i=0UL; inext_idx = next ? (uint)(ulong)( next - rec_pool ) : FD_FUNK_REC_IDX_NULL;
+ next = recs[ i ];
+ continue;
+ }
+ fd_funk_xid_key_pair_t pair = recs[ i ]->pair; /* copy: funk_free_rec_locked zeroes the record's pair */
+ fd_funk_rec_query_t query[1];
+ int rm_err = fd_funk_rec_map_remove( funk->rec_map, &pair, NULL, query, FD_MAP_FLAG_BLOCKING );
+ if( FD_UNLIKELY( rm_err!=FD_MAP_SUCCESS ) ) FD_LOG_CRIT(( "fd_funk_rec_map_remove failed (%i-%s)", rm_err, fd_map_strerror( rm_err ) ));
+ funk_free_rec_locked( funk, recs[ i ], ver_lock );
+ }
+
+ return next; /* head of the records left for a later batch, NULL if none */
+}
+
+/* fd_accdb_txn_root_start begins rooting the funk transaction txn on
+   the accdb tile.  In order it:
+
+     1. makes txn's children the root-level children: txn's child
+        head/tail are copied into funk->shmem and every child gets a
+        null parent
+     2. atomically stores txn's xid into funk->shmem->last_publish
+     3. drains users: takes the write side of txn's rwlock and sets
+        txn's state to PUBLISH
+     4. detaches the record list from txn
+
+   Returns the head of the detached record list, or NULL if txn has
+   no records.  The write lock taken in step 3 is still held on
+   return.  The caller keeps the returned head as root_rec and txn's
+   pool index as root_txn_idx for later use. */
+
+fd_funk_rec_t *
+fd_accdb_txn_root_start( fd_accdb_tile_t * ctx,
+                         fd_funk_txn_t *   txn ) {
+  fd_funk_t *     funk     = ctx->funk;
+  fd_funk_txn_t * txn_pool = funk->txn_pool->ele;
+
+  /* Step 1: children of txn become children of the root */
+
+  funk->shmem->child_head_cidx = txn->child_head_cidx;
+  funk->shmem->child_tail_cidx = txn->child_tail_cidx;
+  for( ulong kid = fd_funk_txn_idx( txn->child_head_cidx );
+       !fd_funk_txn_idx_is_null( kid );
+       kid = fd_funk_txn_idx( txn_pool[ kid ].sibling_next_cidx ) ) {
+    txn_pool[ kid ].parent_cidx = fd_funk_txn_cidx( FD_FUNK_TXN_IDX_NULL );
+  }
+
+  /* Step 2: txn is now the last published xid */
+
+  fd_funk_txn_xid_t rooted[1];
+  fd_funk_txn_xid_copy( rooted, fd_funk_txn_xid( txn ) );
+  fd_funk_txn_xid_st_atomic( funk->shmem->last_publish, rooted );
+  FD_LOG_INFO(( "accdb tile root_start xid %lu:%lu", rooted->ul[0], rooted->ul[1] ));
+
+  /* Step 3: wait out users of txn, then flag it as being published */
+
+  ulong self_idx = (ulong)( txn - funk->txn_pool->ele );
+  fd_rwlock_write( &funk->txn_lock[ self_idx ] );
+  FD_VOLATILE( txn->state ) = FD_FUNK_TXN_STATE_PUBLISH;
+
+  /* Step 4: take the record list away from txn */
+
+  fd_funk_rec_t * first = fd_funk_rec_idx_is_null( txn->rec_head_idx )
+                        ? NULL
+                        : &funk->rec_pool->ele[ txn->rec_head_idx ];
+  txn->rec_head_idx = FD_FUNK_REC_IDX_NULL;
+  txn->rec_tail_idx = FD_FUNK_REC_IDX_NULL;
+
+  return first;
+}
+
+/* fd_accdb_txn_root_fini finishes rooting the funk transaction txn
+   (pool index txn_idx).  It is called once all record batches have
+   been migrated.  It removes txn's xid from the txn_map (a failure
+   there is logged at CRIT level), releases the write lock on txn,
+   marks txn FREE with all of its tree links reset to UINT_MAX and
+   returns it to the txn pool. */
+
+void
+fd_accdb_txn_root_fini( fd_accdb_tile_t * ctx,
+                        fd_funk_txn_t *   txn,
+                        ulong             txn_idx ) {
+  fd_funk_t * funk = ctx->funk;
+
+  /* Step 5: drop txn from the txn_map.  The xid is copied out first
+     so it can still be logged once txn has been released. */
+
+  fd_funk_txn_xid_t rooted[1];
+  fd_funk_txn_xid_copy( rooted, fd_funk_txn_xid( txn ) );
+
+  fd_funk_txn_map_query_t scratch[1];
+  int err = fd_funk_txn_map_remove( funk->txn_map, rooted, NULL, scratch, 0 );
+  if( FD_UNLIKELY( err!=FD_MAP_SUCCESS ) ) {
+    FD_LOG_CRIT(( "txn_map_remove failed xid=%lu:%lu: %i-%s",
+                  rooted->ul[0], rooted->ul[1], err, fd_map_strerror( err ) ));
+  }
+
+  /* Step 6: unlock, scrub and release txn */
+
+  fd_rwlock_unwrite( &funk->txn_lock[ txn_idx ] );
+  FD_VOLATILE( txn->state ) = FD_FUNK_TXN_STATE_FREE;
+  txn->parent_cidx       = UINT_MAX;
+  txn->sibling_prev_cidx = UINT_MAX;
+  txn->sibling_next_cidx = UINT_MAX;
+  txn->child_head_cidx   = UINT_MAX;
+  txn->child_tail_cidx   = UINT_MAX;
+  fd_funk_txn_pool_release( funk->txn_pool, txn, 1 );
+
+  FD_LOG_INFO(( "accdb tile root_fini xid %lu:%lu", rooted->ul[0], rooted->ul[1] ));
+}
diff --git a/src/discof/execle/fd_execle_tile.c b/src/discof/execle/fd_execle_tile.c
index 10fbed15e21..55815aafa14 100644
--- a/src/discof/execle/fd_execle_tile.c
+++ b/src/discof/execle/fd_execle_tile.c
@@ -98,6 +98,14 @@ metrics_write( fd_execle_tile_t * ctx ) {
FD_MCNT_ENUM_COPY( EXECLE, TRANSACTION_LANDED, ctx->metrics.txn_landed );
FD_MCNT_SET( EXECLE, COMPUTE_UNITS_TOTAL, ctx->runtime->metrics.cu_cum );
+
+ fd_accdb_user_t * accdb = ctx->accdb;
+ FD_MCNT_SET( EXECLE, ACCDB_LOOKUP_FUNK, accdb->base.lookup_funk );
+ FD_MCNT_SET( EXECLE, ACCDB_LOOKUP_SPECRD, accdb->base.lookup_specrd );
+ FD_MCNT_SET( EXECLE, ACCDB_LOOKUP_ACCDB, accdb->base.lookup_accdb );
+ FD_MCNT_SET( EXECLE, ACCDB_DT_FUNK, (ulong)accdb->base.dt_funk );
+ FD_MCNT_SET( EXECLE, ACCDB_DT_SPECRD, (ulong)accdb->base.dt_specrd );
+ FD_MCNT_SET( EXECLE, ACCDB_DT_VINYL, (ulong)accdb->base.dt_vinyl );
}
static int
diff --git a/src/discof/execrp/fd_execrp_tile.c b/src/discof/execrp/fd_execrp_tile.c
index 67e03abc6af..96efb6bf18b 100644
--- a/src/discof/execrp/fd_execrp_tile.c
+++ b/src/discof/execrp/fd_execrp_tile.c
@@ -138,7 +138,13 @@ metrics_write( fd_execrp_tile_t * ctx ) {
FD_MCNT_SET( EXECRP, VM_REGIME_INTERPRETER, exec_ticks );
fd_accdb_user_t * accdb = ctx->accdb;
- FD_MCNT_SET( EXECRP, ACCDB_CREATED, accdb->base.created_cnt );
+ FD_MCNT_SET( EXECRP, ACCDB_CREATED, accdb->base.created_cnt );
+ FD_MCNT_SET( EXECRP, ACCDB_LOOKUP_FUNK, accdb->base.lookup_funk );
+ FD_MCNT_SET( EXECRP, ACCDB_LOOKUP_SPECRD, accdb->base.lookup_specrd );
+ FD_MCNT_SET( EXECRP, ACCDB_LOOKUP_ACCDB, accdb->base.lookup_accdb );
+ FD_MCNT_SET( EXECRP, ACCDB_DT_FUNK, (ulong)accdb->base.dt_funk );
+ FD_MCNT_SET( EXECRP, ACCDB_DT_SPECRD, (ulong)accdb->base.dt_specrd );
+ FD_MCNT_SET( EXECRP, ACCDB_DT_VINYL, (ulong)accdb->base.dt_vinyl );
FD_STATIC_ASSERT( sizeof(runtime->metrics.txn_account_save)/sizeof(ulong)==FD_METRICS_ENUM_ACCOUNT_CHANGE_CNT, enum );
FD_MCNT_ENUM_COPY( EXECRP, TXN_ACCOUNT_CHANGES, runtime->metrics.txn_account_save );
diff --git a/src/discof/fd_accdb_topo.c b/src/discof/fd_accdb_topo.c
index d1dad5ebf4b..72669742b1f 100644
--- a/src/discof/fd_accdb_topo.c
+++ b/src/discof/fd_accdb_topo.c
@@ -3,6 +3,7 @@
#include "../flamenco/accdb/fd_accdb_impl_v2.h"
#include "../flamenco/progcache/fd_progcache_user.h"
#include "../util/pod/fd_pod.h"
+#include "../util/pod/fd_pod_format.h"
void
fd_accdb_init_from_topo( fd_accdb_user_t * accdb,
@@ -22,14 +23,29 @@ fd_accdb_init_from_topo( fd_accdb_user_t * accdb,
} else {
fd_topo_obj_t const * vinyl_rq = fd_topo_find_tile_obj( topo, tile, "vinyl_rq" );
fd_topo_obj_t const * vinyl_req_pool = fd_topo_find_tile_obj( topo, tile, "vinyl_rpool" );
+ fd_topo_obj_t const * vinyl_line = fd_topo_find_tile_obj( topo, tile, "vinyl_line" );
FD_TEST( fd_accdb_user_v2_init( accdb,
fd_topo_obj_laddr( topo, funk_obj_id ),
fd_topo_obj_laddr( topo, locks_obj_id ),
fd_topo_obj_laddr( topo, vinyl_rq->id ),
topo->workspaces[ vinyl_data->wksp_id ].wksp,
fd_topo_obj_laddr( topo, vinyl_req_pool->id ),
+ fd_topo_obj_laddr( topo, vinyl_line->id ),
vinyl_rq->id,
max_depth ) );
+
+ /* Enable speculative reads if the tile has access to meta/ele */
+ fd_topo_obj_t const * vinyl_meta = fd_topo_find_tile_obj( topo, tile, "vinyl_meta" );
+ fd_topo_obj_t const * vinyl_ele = fd_topo_find_tile_obj( topo, tile, "vinyl_meta_e" );
+ if( vinyl_meta && vinyl_ele && vinyl_line ) {
+ ulong line_cnt = fd_pod_queryf_ulong( topo->props, 0UL,
+ "obj.%lu.line_cnt", vinyl_line->id );
+ fd_accdb_user_v2_init_cache( accdb,
+ fd_topo_obj_laddr( topo, vinyl_meta->id ),
+ fd_topo_obj_laddr( topo, vinyl_ele->id ),
+ fd_topo_obj_laddr( topo, vinyl_line->id ),
+ line_cnt );
+ }
}
}
diff --git a/src/discof/genesis/fd_genesi_tile.c b/src/discof/genesis/fd_genesi_tile.c
index 2f301cd34b2..ba632a69660 100644
--- a/src/discof/genesis/fd_genesi_tile.c
+++ b/src/discof/genesis/fd_genesi_tile.c
@@ -476,18 +476,10 @@ unprivileged_init( fd_topo_t * topo,
if( !vinyl_data ) {
FD_TEST( fd_accdb_admin_v1_init( ctx->accdb_admin, fd_topo_obj_laddr( topo, funk_obj_id ), fd_topo_obj_laddr( topo, funk_locks_obj_id ) ) );
} else {
- fd_topo_obj_t const * vinyl_rq = fd_topo_find_tile_obj( topo, tile, "vinyl_rq" );
- fd_topo_obj_t const * vinyl_req_pool = fd_topo_find_tile_obj( topo, tile, "vinyl_rpool" );
- FD_TEST( vinyl_rq );
- FD_TEST( vinyl_req_pool );
FD_TEST( fd_accdb_admin_v2_init( ctx->accdb_admin,
fd_topo_obj_laddr( topo, funk_obj_id ),
- fd_topo_obj_laddr( topo, funk_locks_obj_id ),
- fd_topo_obj_laddr( topo, vinyl_rq->id ),
- topo->workspaces[ vinyl_data->wksp_id ].wksp,
- fd_topo_obj_laddr( topo, vinyl_req_pool->id ),
- vinyl_rq->id,
- tile->genesi.accdb_max_depth ) );
+ fd_topo_obj_laddr( topo, funk_locks_obj_id ) ) );
+ fd_accdb_admin_v2_max_depth_set( ctx->accdb_admin, tile->genesi.accdb_max_depth );
}
fd_accdb_init_from_topo( ctx->accdb, topo, tile, tile->genesi.accdb_max_depth );
diff --git a/src/discof/replay/fd_replay_tile.c b/src/discof/replay/fd_replay_tile.c
index d28507aeeaa..678854a1dbc 100644
--- a/src/discof/replay/fd_replay_tile.c
+++ b/src/discof/replay/fd_replay_tile.c
@@ -417,6 +417,8 @@ struct fd_replay_tile {
fd_replay_out_link_t epoch_out[1];
+ fd_replay_out_link_t accdb_out[1];
+
/* The gui tile needs to reliably own a reference to the most recent
completed active bank. Replay needs to know if the gui as a
consumer is enabled so it can increment the bank's refcnt before
@@ -548,17 +550,15 @@ metrics_write( fd_replay_tile_t * ctx ) {
FD_MCNT_SET( REPLAY, PROGCACHE_ROOTED, ctx->progcache_admin->metrics.root_cnt );
FD_MCNT_SET( REPLAY, PROGCACHE_GC_ROOT, ctx->progcache_admin->metrics.gc_root_cnt );
- FD_MCNT_SET( REPLAY, ACCDB_CREATED, ctx->accdb->base.created_cnt );
- FD_MCNT_SET( REPLAY, ACCDB_REVERTED, ctx->accdb_admin->base.revert_cnt );
- FD_MCNT_SET( REPLAY, ACCDB_ROOTED, ctx->accdb_admin->base.root_cnt );
- FD_MCNT_SET( REPLAY, ACCDB_ROOTED_BYTES, ctx->accdb_admin->base.root_tot_sz );
- FD_MCNT_SET( REPLAY, ACCDB_GC_ROOT, ctx->accdb_admin->base.gc_root_cnt );
- FD_MCNT_SET( REPLAY, ACCDB_RECLAIMED, ctx->accdb_admin->base.reclaim_cnt );
- FD_MHIST_COPY( REPLAY, ROOT_SLOT_DURATION_SECONDS, ctx->metrics.root_slot_dur );
- FD_MHIST_COPY( REPLAY, ROOT_ACCOUNT_DURATION_SECONDS, ctx->metrics.root_account_dur );
- FD_MCNT_SET( REPLAY, ROOT_ELAPSED_SECONDS_DB, (ulong)ctx->accdb_admin->base.dt_vinyl );
- FD_MCNT_SET( REPLAY, ROOT_ELAPSED_SECONDS_COPY, (ulong)ctx->accdb_admin->base.dt_copy );
- FD_MCNT_SET( REPLAY, ROOT_ELAPSED_SECONDS_GC, (ulong)ctx->accdb_admin->base.dt_gc );
+ FD_MCNT_SET( REPLAY, ACCDB_CREATED, ctx->accdb->base.created_cnt );
+ FD_MCNT_SET( REPLAY, ACCDB_REVERTED, ctx->accdb_admin->base.revert_cnt );
+
+ FD_MCNT_SET( REPLAY, ACCDB_LOOKUP_FUNK, ctx->accdb->base.lookup_funk );
+ FD_MCNT_SET( REPLAY, ACCDB_LOOKUP_SPECRD, ctx->accdb->base.lookup_specrd );
+ FD_MCNT_SET( REPLAY, ACCDB_LOOKUP_ACCDB, ctx->accdb->base.lookup_accdb );
+ FD_MCNT_SET( REPLAY, ACCDB_DT_FUNK, (ulong)ctx->accdb->base.dt_funk );
+ FD_MCNT_SET( REPLAY, ACCDB_DT_SPECRD, (ulong)ctx->accdb->base.dt_specrd );
+ FD_MCNT_SET( REPLAY, ACCDB_DT_VINYL, (ulong)ctx->accdb->base.dt_vinyl );
}
static inline ulong
@@ -1414,6 +1414,8 @@ store_xinsert( fd_store_t * store,
} FD_STORE_XLOCK_END;
}
+static void accdb_advance_root( fd_replay_tile_t * ctx, fd_stem_context_t * stem, ulong slot, ulong bank_idx );
+
static void
boot_genesis( fd_replay_tile_t * ctx,
fd_stem_context_t * stem,
@@ -1435,7 +1437,7 @@ boot_genesis( fd_replay_tile_t * ctx,
fd_funk_txn_xid_t target_xid = { .ul = { 0UL, 0UL } };
fd_accdb_attach_child( ctx->accdb_admin, &root_xid, &target_xid );
fd_runtime_read_genesis( ctx->banks, bank, ctx->accdb, &xid, NULL, &meta->genesis_hash, &meta->lthash, ctx->genesis, genesis_blob, &ctx->runtime_stack );
- fd_accdb_advance_root( ctx->accdb_admin, &target_xid );
+ accdb_advance_root( ctx, stem, target_xid.ul[0], target_xid.ul[1] );
static const fd_txncache_fork_id_t txncache_root = { .val = USHORT_MAX };
bank->data->txncache_fork_id = fd_txncache_attach_child( ctx->txncache, txncache_root );
@@ -2071,9 +2073,10 @@ accdb_root_op_total( fd_replay_tile_t const * ctx ) {
}
static void
-accdb_advance_root( fd_replay_tile_t * ctx,
- ulong slot,
- ulong bank_idx ) {
+accdb_advance_root( fd_replay_tile_t * ctx,
+ fd_stem_context_t * stem,
+ ulong slot,
+ ulong bank_idx ) {
fd_funk_txn_xid_t xid = { .ul[0] = slot, .ul[1] = bank_idx };
FD_LOG_DEBUG(( "advancing root to slot=%lu", slot ));
@@ -2085,11 +2088,25 @@ accdb_advance_root( fd_replay_tile_t * ctx,
fd_histf_sample( ctx->metrics.root_slot_dur, (ulong)root_accounts_dt );
fd_histf_sample( ctx->metrics.root_account_dur, (ulong)root_accounts_dt / (ulong)fd_long_max( rooted_accounts, 1L ) );
+ /* Send root request to accdb tile via stem link.
+ sig carries the slot so the accdb tile can peek at the mcache
+ to determine write_delay_slots deferral without consuming. */
+ if( FD_LIKELY( ctx->accdb_out->idx!=ULONG_MAX ) ) {
+ fd_funk_txn_xid_t * msg = fd_chunk_to_laddr( ctx->accdb_out->mem, ctx->accdb_out->chunk );
+ *msg = xid;
+ fd_stem_publish( stem, ctx->accdb_out->idx, slot, ctx->accdb_out->chunk,
+ sizeof(fd_funk_txn_xid_t), 0UL, 0UL,
+ fd_frag_meta_ts_comp( fd_tickcount() ) );
+ ctx->accdb_out->chunk = fd_dcache_compact_next( ctx->accdb_out->chunk,
+ sizeof(fd_funk_txn_xid_t), ctx->accdb_out->chunk0, ctx->accdb_out->wmark );
+ }
+
fd_progcache_txn_advance_root( ctx->progcache_admin, &xid );
}
static int
-advance_published_root( fd_replay_tile_t * ctx ) {
+advance_published_root( fd_replay_tile_t * ctx,
+ fd_stem_context_t * stem ) {
fd_block_id_ele_t * block_id_ele = fd_block_id_map_ele_query( ctx->block_id_map, &ctx->consensus_root, NULL, ctx->block_id_arr );
if( FD_UNLIKELY( !block_id_ele ) ) {
@@ -2121,7 +2138,7 @@ advance_published_root( fd_replay_tile_t * ctx ) {
fd_block_id_ele_t * advanceable_root_ele = &ctx->block_id_arr[ advanceable_root_idx ];
ulong advanceable_root_slot = fd_bank_slot_get( bank );
- accdb_advance_root( ctx, advanceable_root_slot, bank->data->idx );
+ accdb_advance_root( ctx, stem, advanceable_root_slot, bank->data->idx );
fd_txncache_advance_root( ctx->txncache, bank->data->txncache_fork_id );
fd_sched_advance_root( ctx->sched, advanceable_root_idx );
@@ -2178,7 +2195,7 @@ after_credit( fd_replay_tile_t * ctx,
/* If the published_root is not caught up to the consensus root, then
we should try to advance the published root. */
- if( FD_UNLIKELY( ctx->consensus_root_bank_idx!=ctx->published_root_bank_idx && advance_published_root( ctx ) ) ) {
+ if( FD_UNLIKELY( ctx->consensus_root_bank_idx!=ctx->published_root_bank_idx && advance_published_root( ctx, stem ) ) ) {
*charge_busy = 1;
*opt_poll_in = 0;
return;
@@ -2819,17 +2836,11 @@ unprivileged_init( fd_topo_t * topo,
fd_topo_obj_laddr( topo, funk_obj_id ),
fd_topo_obj_laddr( topo, funk_locks_obj_id ) ) );
} else {
- fd_topo_obj_t const * vinyl_rq = fd_topo_find_tile_obj( topo, tile, "vinyl_rq" );
- fd_topo_obj_t const * vinyl_req_pool = fd_topo_find_tile_obj( topo, tile, "vinyl_rpool" );
FD_TEST( fd_accdb_admin_v2_init( ctx->accdb_admin,
fd_topo_obj_laddr( topo, funk_obj_id ),
- fd_topo_obj_laddr( topo, funk_locks_obj_id ),
- fd_topo_obj_laddr( topo, vinyl_rq->id ),
- topo->workspaces[ vinyl_data->wksp_id ].wksp,
- fd_topo_obj_laddr( topo, vinyl_req_pool->id ),
- vinyl_rq->id,
- max_depth ) );
+ fd_topo_obj_laddr( topo, funk_locks_obj_id ) ) );
fd_accdb_admin_v2_delay_set( ctx->accdb_admin, tile->replay.write_delay_slots );
+ fd_accdb_admin_v2_max_depth_set( ctx->accdb_admin, max_depth );
}
fd_accdb_init_from_topo( ctx->accdb, topo, tile, max_depth );
@@ -2941,9 +2952,10 @@ unprivileged_init( fd_topo_t * topo,
else FD_LOG_ERR(( "unexpected input link name %s", link->name ));
}
- *ctx->epoch_out = out1( topo, tile, "replay_epoch" ); FD_TEST( ctx->epoch_out->idx!=ULONG_MAX );
- *ctx->replay_out = out1( topo, tile, "replay_out" ); FD_TEST( ctx->replay_out->idx!=ULONG_MAX );
- *ctx->exec_out = out1( topo, tile, "replay_execrp" ); FD_TEST( ctx->exec_out->idx!=ULONG_MAX );
+ *ctx->epoch_out = out1( topo, tile, "replay_epoch" ); FD_TEST( ctx->epoch_out->idx!=ULONG_MAX );
+ *ctx->replay_out = out1( topo, tile, "replay_out" ); FD_TEST( ctx->replay_out->idx!=ULONG_MAX );
+ *ctx->exec_out = out1( topo, tile, "replay_execrp" ); FD_TEST( ctx->exec_out->idx!=ULONG_MAX );
+ *ctx->accdb_out = out1( topo, tile, "replay_accdb" ); /* idx==ULONG_MAX when vinyl disabled */
ctx->gui_enabled = fd_topo_find_tile( topo, "gui", 0UL )!=ULONG_MAX;
ctx->rpc_enabled = fd_topo_find_tile( topo, "rpc", 0UL )!=ULONG_MAX;
@@ -2989,11 +3001,6 @@ unprivileged_init( fd_topo_t * topo,
fd_histf_join( fd_histf_new( ctx->metrics.store_query_work, FD_MHIST_SECONDS_MIN( REPLAY, STORE_QUERY_WORK ),
FD_MHIST_SECONDS_MAX( REPLAY, STORE_QUERY_WORK ) ) );
- fd_histf_join( fd_histf_new( ctx->metrics.root_slot_dur, FD_MHIST_SECONDS_MIN( REPLAY, ROOT_SLOT_DURATION_SECONDS ),
- FD_MHIST_SECONDS_MAX( REPLAY, ROOT_SLOT_DURATION_SECONDS ) ) );
- fd_histf_join( fd_histf_new( ctx->metrics.root_account_dur, FD_MHIST_SECONDS_MIN( REPLAY, ROOT_ACCOUNT_DURATION_SECONDS ),
- FD_MHIST_SECONDS_MAX( REPLAY, ROOT_ACCOUNT_DURATION_SECONDS ) ) );
-
/* Ensure precompiles are available, crash fast otherwise */
fd_precompiles();
diff --git a/src/flamenco/accdb/Local.mk b/src/flamenco/accdb/Local.mk
index 026aa13971e..6a968332d8d 100644
--- a/src/flamenco/accdb/Local.mk
+++ b/src/flamenco/accdb/Local.mk
@@ -29,7 +29,6 @@ $(call add-objs,fd_accdb_admin_v1 fd_accdb_impl_v1,fd_flamenco)
ifdef FD_HAS_ATOMIC
$(call add-hdrs,fd_accdb_admin_v2.h fd_accdb_impl_v2.h)
$(call add-objs,fd_accdb_admin_v2,fd_flamenco)
-$(call add-objs,fd_accdb_admin_v2_root,fd_flamenco)
$(call add-objs,fd_accdb_impl_v2,fd_flamenco)
$(call add-hdrs,fd_vinyl_req_pool.h)
$(call add-objs,fd_vinyl_req_pool,fd_flamenco)
@@ -38,14 +37,8 @@ endif
# Debug APIs
$(call add-hdrs,fd_accdb_fsck.h)
$(call add-objs,fd_accdb_fsck_funk fd_accdb_fsck_vinyl,fd_flamenco)
-ifdef FD_HAS_LZ4
-$(call make-bin,fd_accdb_ctl,fd_accdb_ctl,fd_vinyl fd_tango fd_ballet fd_util)
-endif
ifdef FD_HAS_ATOMIC
$(call make-unit-test,test_accdb_v1,test_accdb_v1,fd_flamenco fd_funk fd_ballet fd_util)
$(call run-unit-test,test_accdb_v1)
-ifdef FD_HAS_LZ4
-$(call make-unit-test,test_accdb_v2,test_accdb_v2,fd_flamenco fd_vinyl fd_funk fd_tango fd_ballet fd_util)
-endif
endif
diff --git a/src/flamenco/accdb/fd_accdb_admin_v2.c b/src/flamenco/accdb/fd_accdb_admin_v2.c
index 30c20cb07f1..538354cc2e9 100644
--- a/src/flamenco/accdb/fd_accdb_admin_v2.c
+++ b/src/flamenco/accdb/fd_accdb_admin_v2.c
@@ -1,4 +1,4 @@
-#include "fd_accdb_admin_v2_private.h"
+#include "fd_accdb_admin_v2.h"
FD_STATIC_ASSERT( alignof(fd_accdb_admin_v2_t)<=alignof(fd_accdb_admin_t), layout );
FD_STATIC_ASSERT( sizeof (fd_accdb_admin_v2_t)<=sizeof(fd_accdb_admin_t), layout );
@@ -6,37 +6,13 @@ FD_STATIC_ASSERT( sizeof (fd_accdb_admin_v2_t)<=sizeof(fd_accdb_admin_t), layou
fd_accdb_admin_t *
fd_accdb_admin_v2_init( fd_accdb_admin_t * accdb_,
void * shfunk,
- void * shlocks,
- void * vinyl_rq,
- void * vinyl_data,
- void * vinyl_req_pool,
- ulong vinyl_link_id,
- ulong max_depth ) {
+ void * shlocks ) {
/* Call superclass constructor */
if( FD_UNLIKELY( !fd_accdb_admin_v1_init( accdb_, shfunk, shlocks ) ) ) {
return NULL;
}
- if( FD_UNLIKELY( !vinyl_data ) ) {
- FD_LOG_WARNING(( "NULL vinyl_data" ));
- return NULL;
- }
-
- fd_vinyl_rq_t * rq = fd_vinyl_rq_join( vinyl_rq );
- fd_vinyl_req_pool_t * req_pool = fd_vinyl_req_pool_join( vinyl_req_pool );
- if( FD_UNLIKELY( !rq || !req_pool ) ) {
- /* component joins log warning if this is reached */
- FD_LOG_WARNING(( "Failed to initialize database client" ));
- return NULL;
- }
fd_accdb_admin_v2_t * accdb = fd_type_pun( accdb_ );
- accdb->root_lineage->max_depth = max_depth;
- accdb->vinyl_req_id = 0UL;
- accdb->vinyl_rq = rq;
- accdb->vinyl_link_id = vinyl_link_id;
- accdb->vinyl_data_wksp = vinyl_data;
- accdb->vinyl_req_wksp = fd_wksp_containing( req_pool );
- accdb->vinyl_req_pool = req_pool;
accdb->base.accdb_type = FD_ACCDB_TYPE_V2;
accdb->base.vt = &fd_accdb_admin_v2_vt;
return accdb_;
@@ -57,8 +33,6 @@ void
fd_accdb_admin_v2_fini( fd_accdb_admin_t * admin_ ) {
fd_accdb_admin_v2_t * admin = downcast( admin_ );
- fd_vinyl_rq_leave( admin->vinyl_rq );
-
/* superclass destructor */
admin->base.accdb_type = FD_ACCDB_TYPE_V1;
fd_accdb_admin_v1_fini( admin_ );
@@ -73,11 +47,58 @@ void
fd_accdb_v2_attach_child( fd_accdb_admin_t * admin_,
fd_funk_txn_xid_t const * xid_parent,
fd_funk_txn_xid_t const * xid_new ) {
- fd_accdb_admin_v1_t * db = downcast( admin_ )->v1;
+ fd_accdb_admin_v2_t * accdb = downcast( admin_ );
+ fd_accdb_admin_v1_t * db = accdb->v1;
+ fd_funk_t * funk = db->funk;
+
+ /* Ensure fork depth stays within limits. This thread is the only
+ one that appends to the fork graph. Other threads may concurrently
+ remove from the graph (by advancing root), which can only decrease
+ the depth. Therefore we can safely spin until there is room. */
+
+ ulong max_depth = accdb->max_depth;
+ if( FD_LIKELY( max_depth ) ) {
+ for(;;) {
+ /* Compute depth of the new child = 1 (for the child itself)
+ + number of ancestors from parent to root. */
+
+ ulong depth = 1UL;
+
+ if( !fd_funk_txn_xid_eq( xid_parent, funk->shmem->last_publish ) ) {
+ /* Parent is not root -- walk the parent chain */
+
+ fd_funk_txn_map_query_t query[1];
+ int err;
+ for(;;) {
+ err = fd_funk_txn_map_query_try( funk->txn_map, xid_parent, NULL, query, 0 );
+ if( FD_LIKELY( err!=FD_MAP_ERR_AGAIN ) ) break;
+ FD_SPIN_PAUSE();
+ }
+
+ if( FD_LIKELY( err==FD_MAP_SUCCESS ) ) {
+ fd_funk_txn_t const * txn = fd_funk_txn_map_query_ele( query );
+ depth++; /* count parent */
+
+ ulong parent_idx = fd_funk_txn_idx( txn->parent_cidx );
+ while( !fd_funk_txn_idx_is_null( parent_idx ) ) {
+ txn = &funk->txn_pool->ele[ parent_idx ];
+ depth++;
+ parent_idx = fd_funk_txn_idx( txn->parent_cidx );
+ }
+ }
+ /* If err==FD_MAP_ERR_KEY, parent was concurrently rooted.
+ depth stays at 1, which is always within limits. */
+ }
+
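+ if( FD_LIKELY( depth<=max_depth ) ) break;
+ FD_SPIN_PAUSE();
+ }
+ }
+
FD_LOG_INFO(( "accdb txn xid %lu:%lu: created with parent %lu:%lu",
xid_new ->ul[0], xid_new ->ul[1],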
xid_parent->ul[0], xid_parent->ul[1] ));
- fd_funk_txn_prepare( db->funk, xid_parent, xid_new );
+ fd_funk_txn_prepare( funk, xid_parent, xid_new );
}
void
@@ -86,94 +107,12 @@ fd_accdb_v2_cancel( fd_accdb_admin_t * admin,
fd_accdb_v1_cancel( admin, xid );
}
-static void
-publish_recs( fd_accdb_admin_v2_t * admin,
- fd_funk_txn_t * txn ) {
- fd_funk_rec_t * rec_pool = admin->v1->funk->rec_pool->ele;
- fd_funk_rec_t * head = !fd_funk_rec_idx_is_null( txn->rec_head_idx ) ?
- &rec_pool[ txn->rec_head_idx ] : NULL;
- txn->rec_head_idx = FD_FUNK_REC_IDX_NULL;
- txn->rec_tail_idx = FD_FUNK_REC_IDX_NULL;
- while( head ) {
- head = fd_accdb_v2_root_batch( admin, head );
- }
-}
-
-static void
-txn_unregister( fd_funk_t * funk,
- fd_funk_txn_t * txn ) {
- ulong child_idx = fd_funk_txn_idx( txn->child_head_cidx );
- while( FD_UNLIKELY( !fd_funk_txn_idx_is_null( child_idx ) ) ) {
- funk->txn_pool->ele[ child_idx ].parent_cidx = fd_funk_txn_cidx( FD_FUNK_TXN_IDX_NULL );
- child_idx = fd_funk_txn_idx( funk->txn_pool->ele[ child_idx ].sibling_next_cidx );
- }
-
- fd_funk_txn_xid_t xid[1]; fd_funk_txn_xid_copy( xid, fd_funk_txn_xid( txn ) );
- fd_funk_txn_map_query_t query[1];
- int remove_err = fd_funk_txn_map_remove( funk->txn_map, xid, NULL, query, 0 );
- if( FD_UNLIKELY( remove_err!=FD_MAP_SUCCESS ) ) {
- FD_LOG_CRIT(( "fd_accdb_publish failed: fd_funk_txn_map_remove failed: %i-%s", remove_err, fd_map_strerror( remove_err ) ));
- }
-}
-
-static void
-txn_free( fd_funk_t * funk,
- fd_funk_txn_t * txn ) {
- FD_VOLATILE( txn->state ) = FD_FUNK_TXN_STATE_FREE;
- txn->parent_cidx = UINT_MAX;
- txn->sibling_prev_cidx = UINT_MAX;
- txn->sibling_next_cidx = UINT_MAX;
- txn->child_head_cidx = UINT_MAX;
- txn->child_tail_cidx = UINT_MAX;
- fd_funk_txn_pool_release( funk->txn_pool, txn, 1 );
-}
-
-static void
-fd_accdb_txn_publish_one( fd_accdb_admin_v2_t * accdb,
- fd_funk_txn_t * txn ) {
- fd_funk_t * funk = accdb->v1->funk;
-
- /* Children of transaction are now children of root */
- funk->shmem->child_head_cidx = txn->child_head_cidx;
- funk->shmem->child_tail_cidx = txn->child_tail_cidx;
-
- /* Phase 1: Mark transaction as "last published" */
-
- fd_funk_txn_xid_t xid[1]; fd_funk_txn_xid_copy( xid, fd_funk_txn_xid( txn ) );
- if( FD_UNLIKELY( !fd_funk_txn_idx_is_null( fd_funk_txn_idx( txn->parent_cidx ) ) ) ) {
- FD_LOG_CRIT(( "fd_accdb_txn_advance_root: parent of txn %lu:%lu is not root", xid->ul[0], xid->ul[1] ));
- }
- fd_funk_txn_xid_st_atomic( funk->shmem->last_publish, xid );
- FD_LOG_INFO(( "accdb txn laddr=%p xid %lu:%lu: publish", (void *)txn, txn->xid.ul[0], txn->xid.ul[1] ));
-
- /* Phase 2: Drain users from transaction */
-
- ulong txn_idx = (ulong)( txn - funk->txn_pool->ele );
- fd_rwlock_write( &funk->txn_lock[ txn_idx ] );
- FD_VOLATILE( txn->state ) = FD_FUNK_TXN_STATE_PUBLISH;
-
- /* Phase 3: Move records from funk to vinyl */
-
- publish_recs( accdb, txn );
-
- /* Phase 4: Unregister transaction */
-
- txn_unregister( funk, txn );
-
- /* Phase 5: Free transaction object */
-
- fd_rwlock_unwrite( &funk->txn_lock[ txn_idx ] );
- txn_free( funk, txn );
-}
-
void
fd_accdb_v2_advance_root( fd_accdb_admin_t * accdb_,
fd_funk_txn_xid_t const * xid ) {
fd_accdb_admin_v2_t * accdb = downcast( accdb_ );
fd_funk_t * funk = accdb->v1->funk;
- fd_accdb_lineage_set_fork( accdb->root_lineage, funk, xid );
-
/* Assume no concurrent access to txn_map */
fd_funk_txn_map_query_t query[1];
@@ -190,23 +129,8 @@ fd_accdb_v2_advance_root( fd_accdb_admin_t * accdb_,
fd_accdb_txn_cancel_siblings( accdb->v1, txn );
- fd_accdb_lineage_t * lineage = accdb->root_lineage;
- fd_funk_txn_xid_t oldest_xid = lineage->fork[ lineage->fork_depth-1UL ];
- if( fd_funk_txn_xid_eq_root( &oldest_xid ) && lineage->fork_depth>1UL ) {
- oldest_xid = lineage->fork[ lineage->fork_depth-2UL ];
- }
-
- ulong delay = xid->ul[0] - oldest_xid.ul[0];
- /* genesis_override is necessary when bootstrapping from genesis,
- without requiring fd_accdb_admin_v2_delay_set to accept 0. */
- int genesis_override = !xid->ul[0];
- if( delay >= accdb->slot_delay || genesis_override ) {
- FD_LOG_INFO(( "accdb xid %lu:%lu: pruning",
- oldest_xid.ul[0], oldest_xid.ul[1] ));
- fd_funk_txn_t * oldest = &funk->txn_pool->ele[ funk->shmem->child_head_cidx ];
- FD_TEST( fd_funk_txn_xid_eq( &oldest_xid, &oldest->xid ) );
- fd_accdb_txn_publish_one( accdb, oldest );
- }
+ /* Root message is sent to the accdb tile by the replay tile via
+ the replay_accdb stem link (see fd_replay_tile.c). */
}
void
@@ -217,6 +141,13 @@ fd_accdb_admin_v2_delay_set( fd_accdb_admin_t * accdb_,
accdb->slot_delay = slot_delay;
}
+void
+fd_accdb_admin_v2_max_depth_set( fd_accdb_admin_t * accdb_,
+ ulong max_depth ) {
+ fd_accdb_admin_v2_t * accdb = downcast( accdb_ );
+ accdb->max_depth = max_depth;
+}
+
fd_accdb_admin_vt_t const fd_accdb_admin_v2_vt = {
.fini = fd_accdb_admin_v2_fini,
.root_get = fd_accdb_v2_root_get,
diff --git a/src/flamenco/accdb/fd_accdb_admin_v2.h b/src/flamenco/accdb/fd_accdb_admin_v2.h
index f211afe020c..5da2e3deec6 100644
--- a/src/flamenco/accdb/fd_accdb_admin_v2.h
+++ b/src/flamenco/accdb/fd_accdb_admin_v2.h
@@ -5,6 +5,25 @@
account database. */
#include "fd_accdb_admin.h"
+#include "fd_accdb_admin_v1.h"
+#include "../../tango/fd_tango_base.h"
+
+struct fd_accdb_admin_v2 {
+ union {
+ fd_accdb_admin_base_t base;
+ fd_accdb_admin_v1_t v1[1];
+ };
+
+ ulong slot_delay;
+ ulong max_depth; /* Max fork depth (distance from any tip to root).
+ attach_child spins if adding a child would exceed this. */
+
+ fd_frag_meta_t * mcache;
+ ulong depth;
+ ulong seq;
+};
+
+typedef struct fd_accdb_admin_v2 fd_accdb_admin_v2_t;
FD_PROTOTYPES_BEGIN
@@ -13,17 +32,16 @@ extern fd_accdb_admin_vt_t const fd_accdb_admin_v2_vt;
fd_accdb_admin_t *
fd_accdb_admin_v2_init( fd_accdb_admin_t * admin_,
void * shfunk,
- void * shlocks,
- void * vinyl_rq,
- void * vinyl_data,
- void * vinyl_req_pool,
- ulong vinyl_link_id,
- ulong max_depth );
+ void * shlocks );
void
fd_accdb_admin_v2_delay_set( fd_accdb_admin_t * admin,
ulong slot_delay );
+void
+fd_accdb_admin_v2_max_depth_set( fd_accdb_admin_t * admin,
+ ulong max_depth );
+
void
fd_accdb_admin_v2_fini( fd_accdb_admin_t * ljoin );
diff --git a/src/flamenco/accdb/fd_accdb_admin_v2_private.h b/src/flamenco/accdb/fd_accdb_admin_v2_private.h
deleted file mode 100644
index 6151000589d..00000000000
--- a/src/flamenco/accdb/fd_accdb_admin_v2_private.h
+++ /dev/null
@@ -1,62 +0,0 @@
-#ifndef HEADER_fd_src_flamenco_accdb_fd_accdb_admin_v2_private_h
-#define HEADER_fd_src_flamenco_accdb_fd_accdb_admin_v2_private_h
-
-#include "fd_accdb_admin_v2.h"
-#include "fd_accdb_admin_v1.h"
-#include "fd_vinyl_req_pool.h"
-
-/* FD_ACCDB_ROOT_BATCH_MAX controls how many accounts to write in
- batches to the vinyl DB server. */
-
-#define FD_ACCDB_ROOT_BATCH_MAX (128UL)
-
-struct fd_accdb_admin_v2 {
- union {
- fd_accdb_admin_base_t base;
- fd_accdb_admin_v1_t v1[1];
- };
-
- fd_accdb_lineage_t root_lineage[1];
- ulong slot_delay;
-
- /* Vinyl client */
- ulong vinyl_req_id;
- fd_vinyl_rq_t * vinyl_rq;
- ulong vinyl_link_id;
- fd_wksp_t * vinyl_data_wksp;
- fd_wksp_t * vinyl_req_wksp;
- fd_vinyl_req_pool_t * vinyl_req_pool;
-};
-
-typedef struct fd_accdb_admin_v2 fd_accdb_admin_v2_t;
-
-FD_PROTOTYPES_BEGIN
-
-/* fd_accdb_v2_root_batch "roots" a batch of funk accounts.
-
- rec0 is the head of the batch linked list to root (NULL is fine).
- Up to FD_ACCDB_ROOT_BATCH_MAX records starting at rec0 are migrated
- to vinyl. This frees rec0 and subsequent items. Returns the next
- record in the linked list that is not yet rooted.
-
- It is assumed that the rec0 linked list is not owned by a funk_txn at
- this point. (The funk_txn that used to own rec0 has child_head and
- child_tail set to sentinel.)
-
- Each record is considered as follows:
- - If another newer revision of this record exists that was already
- marked as rooted, this record is thrown away.
- - Otherwise, the record is moved to vinyl.
-
- The move to vinyl is done in a thread-safe manner (writes to vinyl
- first, then once the write is globally visible, removes from funk).
-
- Updates the following metrics: root_cnt, reclaim_cnt. */
-
-fd_funk_rec_t *
-fd_accdb_v2_root_batch( fd_accdb_admin_v2_t * admin,
- fd_funk_rec_t * rec0 );
-
-FD_PROTOTYPES_END
-
-#endif /* HEADER_fd_src_flamenco_accdb_fd_accdb_admin_v2_private_h */
diff --git a/src/flamenco/accdb/fd_accdb_admin_v2_root.c b/src/flamenco/accdb/fd_accdb_admin_v2_root.c
deleted file mode 100644
index 0430d025446..00000000000
--- a/src/flamenco/accdb/fd_accdb_admin_v2_root.c
+++ /dev/null
@@ -1,379 +0,0 @@
-#include "fd_accdb_admin_v2_private.h"
-#include "../fd_flamenco_base.h"
-#include "../runtime/fd_runtime_const.h" /* FD_RUNTIME_ACC_SZ_MAX */
-#include "../../vinyl/data/fd_vinyl_data.h"
-
-/***********************************************************************
-
- fd_accdb_admin_v2_root.c contains the account rooting algorithm.
-
- This algorithm is designed to amortize vinyl I/O latency by
- processing accounts in batches.
-
- For each batch of accounts, it does the following logic:
-
- - ACQUIRE batch request for account updates
- - ERASE batch request for account deletions
- - Spin wait for ACQUIRE completion
- - Copy back modified accounts
- - RELEASE batch request for account updates
- - Spin wait for ACQUIRE, ERASE completions
- - Free records from funk
-
-***********************************************************************/
-
-/* vinyl_spin_wait waits for completion of a vinyl request and asserts
- that all requests completed successfully. */
-
-static void
-vinyl_spin_wait( fd_vinyl_comp_t const * comp,
- fd_vinyl_key_t const * key0,
- schar const * err0,
- ulong cnt,
- char const * req_type_cstr ) {
-
- /* FIXME use a load-acquire here, such that later loads are ordered
- past this load */
- while( FD_VOLATILE_CONST( comp->seq )!=1UL ) FD_SPIN_PAUSE();
- FD_COMPILER_MFENCE();
- int comp_err = FD_VOLATILE_CONST( comp->err );
- if( FD_UNLIKELY( comp_err!=FD_VINYL_SUCCESS ) ) {
- FD_LOG_CRIT(( "vinyl tile rejected my %s request (%i-%s)",
- req_type_cstr, comp_err, fd_vinyl_strerror( comp_err ) ));
- }
-
- for( ulong i=0UL; irec_pool->ele );
- ulong volatile * vl = &funk->rec_lock[ rec_idx ];
- for(;;) {
- ulong const ver_lock = FD_VOLATILE_CONST( *vl );
- ulong const ver = fd_funk_rec_ver_bits ( ver_lock );
- ulong const lock = fd_funk_rec_lock_bits( ver_lock );
- if( FD_UNLIKELY( lock ) ) {
- /* Spin while there are active readers */
- /* FIXME kill client after spinning for 30 seconds to prevent silent deadlock */
- FD_SPIN_PAUSE();
- continue;
- }
- ulong const new_ver = fd_funk_rec_ver_inc( ver );
- ulong const new_vl = fd_funk_rec_ver_lock( new_ver, FD_FUNK_REC_LOCK_MASK );
- if( FD_UNLIKELY( FD_ATOMIC_CAS( vl, ver_lock, new_vl )!=ver_lock ) ) {
- FD_SPIN_PAUSE();
- continue;
- }
- return new_vl;
- }
-}
-
-static void
-fd_funk_rec_admin_unlock( fd_funk_t const * funk,
- fd_funk_rec_t * rec,
- ulong ver_lock ) {
- ulong rec_idx = (ulong)( rec - funk->rec_pool->ele );
- ulong volatile * vl = &funk->rec_lock[ rec_idx ];
- FD_VOLATILE( *vl ) = fd_funk_rec_ver_lock( fd_funk_rec_ver_bits( ver_lock ), 0UL );
-}
-
-static void
-funk_free_rec( fd_funk_t * funk,
- fd_funk_rec_t * rec ) {
- /* Acquire admin lock (kick out readers)
-
- Note: At this point, well-behaving external readers will abandon a
- read-lock attempt if they observe this active write lock. (An
- admin lock always implies the record is about to die) */
-
- FD_COMPILER_MFENCE();
- ulong ver_lock = fd_funk_rec_admin_lock( funk, rec );
-
- /* Free record */
-
- memset( &rec->pair, 0, sizeof(fd_funk_xid_key_pair_t) );
- FD_COMPILER_MFENCE();
- rec->map_next = FD_FUNK_REC_IDX_NULL;
- fd_funk_val_flush( rec, funk->alloc, funk->wksp );
- fd_funk_rec_admin_unlock( funk, rec, ver_lock );
- fd_funk_rec_pool_release( funk->rec_pool, rec, 1 );
-}
-
-/* funk_gc_chain optimistically deletes all but the newest rooted
- revisions of rec. This possibly deletes 'rec'. Returns rec if rec
- is the only known rooted revision, otherwise returns NULL (if rec was
- deleted). Note that due to edge cases, revisions that are not in the
- oldest tracked slot, may not reliably get cleaned up. (The oldest
- tracked slot always gets cleaned up, though.) */
-
-static fd_funk_rec_t *
-funk_gc_chain( fd_accdb_admin_v2_t * const admin,
- fd_funk_rec_t * const rec ) {
-
- fd_accdb_lineage_t * lineage = admin->root_lineage;
- fd_funk_t * funk = admin->v1->funk;
- fd_funk_rec_t * rec_pool = funk->rec_pool->ele;
- ulong rec_max = funk->rec_pool->ele_max;
- ulong seed = funk->rec_map->map->seed;
- ulong chain_cnt = funk->rec_map->map->chain_cnt;
- ulong root_slot = lineage->fork[0].ul[0];
-
- ulong hash = fd_funk_rec_map_key_hash( &rec->pair, seed );
- ulong chain_idx = (hash & (chain_cnt-1UL) );
-
- /* Lock rec_map chain */
-
- int lock_err = fd_funk_rec_map_iter_lock( funk->rec_map, &chain_idx, 1UL, FD_MAP_FLAG_BLOCKING );
- if( FD_UNLIKELY( lock_err!=FD_MAP_SUCCESS ) ) {
- FD_LOG_CRIT(( "fd_funk_rec_map_iter_lock failed (%i-%s)", lock_err, fd_map_strerror( lock_err ) ));
- }
-
- fd_funk_rec_map_shmem_private_chain_t * chain =
- fd_funk_rec_map_shmem_private_chain( funk->rec_map->map, 0UL ) + chain_idx;
- ulong ver =
- fd_funk_rec_map_private_vcnt_ver( FD_VOLATILE_CONST( chain->ver_cnt ) );
- FD_CRIT( ver&1UL, "chain is not locked" );
-
- /* Walk map chain */
-
- fd_funk_rec_t * found_rec = NULL;
- uint * pnext = &chain->head_cidx;
- uint cur = *pnext;
- ulong chain_len = 0UL;
- ulong iter = 0UL;
- while( cur!=FD_FUNK_REC_IDX_NULL ) {
- if( FD_UNLIKELY( iter++ > rec_max ) ) FD_LOG_CRIT(( "cycle detected in rec_map chain %lu", chain_idx ));
-
- /* Is this node garbage? */
-
- fd_funk_rec_t * node = &funk->rec_pool->ele[ cur ];
- if( FD_UNLIKELY( cur==node->map_next ) ) FD_LOG_CRIT(( "accdb corruption detected: cycle in rec_map chain %lu", chain_idx ));
- cur = node->map_next;
- if( !fd_funk_rec_key_eq( rec->pair.key, node->pair.key ) ) goto retain;
- if( node->pair.xid->ul[0]>root_slot ) goto retain;
- if( !found_rec ) {
- found_rec = node;
- goto retain;
- }
-
- /* No longer need this node */
-
- if( node->pair.xid->ul[0] > rec->pair.xid->ul[0] ) {
- /* If this node is newer than the to-be-deleted slot, need to
- remove it from the transaction's record list. */
- uint neigh_prev = node->prev_idx;
- uint neigh_next = node->next_idx;
- if( neigh_prev==FD_FUNK_REC_IDX_NULL ||
- neigh_next==FD_FUNK_REC_IDX_NULL ) {
- /* Node is first or last of transaction -- too bothersome to
- remove it from the transaction's record list */
- goto retain;
- }
- rec_pool[ neigh_next ].prev_idx = neigh_prev;
- rec_pool[ neigh_prev ].next_idx = neigh_next;
- }
-
- /* Destroy this node */
-
- funk_free_rec( funk, node );
- *pnext = cur;
- continue;
-
- retain:
- pnext = &node->map_next;
- chain_len++;
- }
-
- /* Unlock rec_map chain */
-
- FD_COMPILER_MFENCE();
- FD_VOLATILE( chain->ver_cnt ) =
- fd_funk_rec_map_private_vcnt( ver+1UL, chain_len );
- FD_COMPILER_MFENCE();
- return found_rec==rec ? found_rec : NULL;
-}
-
-/* Main algorithm */
-
-fd_funk_rec_t *
-fd_accdb_v2_root_batch( fd_accdb_admin_v2_t * admin,
- fd_funk_rec_t * rec0 ) {
- long t_start = fd_tickcount();
-
- fd_funk_t * funk = admin->v1->funk; /* unrooted DB */
- fd_wksp_t * funk_wksp = funk->wksp; /* shm workspace containing unrooted accounts */
- fd_funk_rec_t * rec_pool = funk->rec_pool->ele; /* funk rec arena */
- fd_vinyl_rq_t * rq = admin->vinyl_rq; /* "request queue "*/
- fd_vinyl_req_pool_t * req_pool = admin->vinyl_req_pool; /* "request pool" */
- fd_wksp_t * req_wksp = admin->vinyl_req_wksp; /* shm workspace containing request buffer */
- fd_wksp_t * data_wksp = admin->vinyl_data_wksp; /* shm workspace containing vinyl data cache */
- ulong link_id = admin->vinyl_link_id; /* vinyl client ID */
-
- /* Collect funk request batch */
-
- fd_funk_rec_t * recs[ FD_ACCDB_ROOT_BATCH_MAX ];
- ulong rec_cnt;
-
- fd_funk_rec_t * next = rec0;
- for( rec_cnt=0UL; next && rec_cntnext_idx ) ) {
- next = NULL;
- } else {
- next = &rec_pool[ cur->next_idx ];
- }
- cur->prev_idx = FD_FUNK_REC_IDX_NULL;
- cur->next_idx = FD_FUNK_REC_IDX_NULL;
-
- if( funk_gc_chain( admin, cur ) ) {
- recs[ rec_cnt++ ] = cur;
- }
- }
-
- /* Partition batch into ACQUIRE (updates) and ERASE (deletions) */
-
- ulong acq_cnt = 0UL;
- ulong del_cnt;
- for( ulong i=0UL; ival_sz>=sizeof(fd_account_meta_t), "corrupt funk_rec" );
- if( meta->lamports ) {
- fd_funk_rec_t * tmp = recs[ i ];
- recs[ i ] = recs[ acq_cnt ];
- recs[ acq_cnt ] = tmp;
- acq_cnt++;
- }
- }
- del_cnt = rec_cnt - acq_cnt;
-
- /* Create ACQUIRE and ERASE batch requests */
-
- ulong del_batch = fd_vinyl_req_pool_acquire( req_pool ); /* ERASE */
- ulong acq_batch = fd_vinyl_req_pool_acquire( req_pool ); /* ACQUIRE */
- fd_vinyl_key_t * acq_key0 = fd_vinyl_req_batch_key( req_pool, acq_batch );
- fd_vinyl_key_t * del_key0 = fd_vinyl_req_batch_key( req_pool, del_batch );
-
- for( ulong i=0UL; ipair.key, 32UL );
- }
- for( ulong i=0UL; ipair.key, 32UL );
- }
-
- /* Send off ACQUIRE and ERASE requests */
-
- fd_vinyl_comp_t * acq_comp = fd_vinyl_req_batch_comp ( req_pool, acq_batch );
- fd_vinyl_comp_t * del_comp = fd_vinyl_req_batch_comp ( req_pool, del_batch );
- schar * acq_err0 = fd_vinyl_req_batch_err ( req_pool, acq_batch );
- schar * del_err0 = fd_vinyl_req_batch_err ( req_pool, del_batch );
- ulong * acq_val_gaddr0 = fd_vinyl_req_batch_val_gaddr( req_pool, acq_batch );
-
- memset( acq_comp, 0, sizeof(fd_vinyl_comp_t) );
- memset( del_comp, 0, sizeof(fd_vinyl_comp_t) );
- for( ulong i=0UL; idlen;
- FD_CRIT( data_sz<=FD_RUNTIME_ACC_SZ_MAX, "oversize account record" );
-
- ulong val_sz = sizeof(fd_account_meta_t) + data_sz;
- acq_val_gaddr0[ i ] = val_sz;
- admin->base.root_tot_sz += val_sz;
- }
-
- fd_vinyl_req_send_batch(
- rq, req_pool, req_wksp,
- admin->vinyl_req_id++, link_id,
- FD_VINYL_REQ_TYPE_ACQUIRE,
- FD_VINYL_REQ_FLAG_MODIFY |
- FD_VINYL_REQ_FLAG_IGNORE |
- FD_VINYL_REQ_FLAG_CREATE,
- acq_batch, acq_cnt
- );
- fd_vinyl_req_send_batch(
- rq, req_pool, req_wksp,
- admin->vinyl_req_id++, link_id,
- FD_VINYL_REQ_TYPE_ERASE,
- 0UL,
- del_batch, del_cnt
- );
-
- /* Spin for ACQUIRE completion */
-
- vinyl_spin_wait( acq_comp, acq_key0, acq_err0, acq_cnt, "ACQUIRE" );
- long t_acquire = fd_tickcount();
-
- /* Copy back modified accounts */
-
- for( ulong i=0UL; idlen;
- ulong val_sz = sizeof(fd_account_meta_t) + data_sz;
- FD_CRIT( data_sz<=FD_RUNTIME_ACC_SZ_MAX, "oversize account record" );
-
- fd_account_meta_t * dst_meta = fd_wksp_laddr_fast( data_wksp, acq_val_gaddr0[ i ] );
- fd_vinyl_info_t * val_info = fd_vinyl_data_info( dst_meta );
-
- fd_memcpy( dst_meta, src_meta, val_sz );
- val_info->val_sz = (uint)val_sz;
- }
-
- /* Send off RELEASE batch request (reuse acq_batch) */
-
- memset( acq_comp, 0, sizeof(fd_vinyl_comp_t) );
- for( ulong i=0UL; ivinyl_req_id++, link_id,
- FD_VINYL_REQ_TYPE_RELEASE,
- FD_VINYL_REQ_FLAG_MODIFY,
- acq_batch, acq_cnt
- );
- long t_copy = fd_tickcount();
-
- /* Spin for ERASE, RELEASE completions */
-
- vinyl_spin_wait( del_comp, del_key0, del_err0, del_cnt, "ERASE" );
- fd_vinyl_req_pool_release( req_pool, del_batch );
-
- vinyl_spin_wait( acq_comp, acq_key0, acq_err0, acq_cnt, "RELEASE" );
- fd_vinyl_req_pool_release( req_pool, acq_batch );
- long t_release = fd_tickcount();
-
- /* Remove funk records */
-
- for( ulong i=0UL; ipair;
- fd_funk_rec_query_t query[1];
- int rm_err = fd_funk_rec_map_remove( funk->rec_map, &pair, NULL, query, FD_MAP_FLAG_BLOCKING );
- if( FD_UNLIKELY( rm_err!=FD_MAP_SUCCESS ) ) FD_LOG_CRIT(( "fd_funk_rec_map_remove failed (%i-%s)", rm_err, fd_map_strerror( rm_err ) ));
- funk_free_rec( funk, recs[ i ] );
- }
- long t_gc = fd_tickcount();
-
- /* Update metrics */
-
- admin->base.root_cnt += (uint)acq_cnt;
- admin->base.reclaim_cnt += (uint)del_cnt;
- admin->base.dt_vinyl += ( t_acquire - t_start ) + ( t_release - t_copy );
- admin->base.dt_copy += ( t_copy - t_acquire );
- admin->base.dt_gc += ( t_gc - t_release );
-
- return next;
-}
diff --git a/src/flamenco/accdb/fd_accdb_base.h b/src/flamenco/accdb/fd_accdb_base.h
index a4c44a81deb..6bbebcf645f 100644
--- a/src/flamenco/accdb/fd_accdb_base.h
+++ b/src/flamenco/accdb/fd_accdb_base.h
@@ -13,6 +13,7 @@ typedef struct fd_accdb_user fd_accdb_user_t;
#define FD_ACCDB_TYPE_V0 (80U) /* minimal single chain */
#define FD_ACCDB_TYPE_V1 (1U) /* funk */
#define FD_ACCDB_TYPE_V2 (2U) /* read-only vinyl + read-write funk */
+#define FD_ACCDB_TYPE_V2S (3U) /* vinyl speculative read (pinned) */
#define FD_ACCDB_REF_INVAL 0 /* not a valid reference */
#define FD_ACCDB_REF_RO 1 /* read only */
diff --git a/src/flamenco/accdb/fd_accdb_ctl.c b/src/flamenco/accdb/fd_accdb_ctl.c
deleted file mode 100644
index dc5deb35477..00000000000
--- a/src/flamenco/accdb/fd_accdb_ctl.c
+++ /dev/null
@@ -1,771 +0,0 @@
-/* fd_accdb_ctl.c is a command-line debugging tool for interacting with
- a Firedancer account database. */
-
-#include "../../vinyl/fd_vinyl.h"
-#include "../../flamenco/fd_flamenco_base.h"
-#include "../../ballet/base58/fd_base58.h"
-#include "../../util/cstr/fd_cstr.h"
-#include "../../util/pod/fd_pod.h"
-#include
-#include /* offsetof */
-#include
-
-/* req_info contains various request metadata R/W mapped into the vinyl
- tile. */
-
-struct req_info {
- fd_vinyl_key_t key[1];
- ulong val_gaddr[1];
- schar err[1];
- fd_vinyl_comp_t comp[1];
-};
-
-typedef struct req_info req_info_t;
-
-/* The client class contains local handles to client-related vinyl
- objects. */
-
-struct client {
- fd_vinyl_rq_t * rq;
- fd_vinyl_cq_t * cq;
- ulong req_id;
- ulong link_id;
-
- fd_vinyl_meta_t * meta;
-
- req_info_t * req_info;
- ulong req_info_gaddr;
- fd_wksp_t * val_wksp;
- fd_wksp_t * client_wksp;
-
- /* Vinyl client status */
- ulong quota_rem;
- ulong cq_seq;
-};
-
-typedef struct client client_t;
-
-static char const bin2hex[ 16 ] = { '0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f' };
-
-static void
-hexdump( uchar const * data,
- uint sz ) {
- ulong sz_align = fd_ulong_align_dn( sz, 16UL );
- uint i;
- for( i=0U; i>4 ] );
- p = fd_cstr_append_char( p, bin2hex[ data[ i+j ]&15 ] );
- p = fd_cstr_append_char( p, ' ' );
- }
- p = fd_cstr_append_char( p, ' ' );
- for( ulong j=0UL; j<16UL; j++ ) {
- int c = data[ i+j ];
- p = fd_cstr_append_char( p, fd_char_if( fd_isalnum( c ) | fd_ispunct( c ) | (c==' '), (char)c, '.' ) );
- }
- p = fd_cstr_append_char( p, '\n' );
- ulong len = (ulong)( p-line );
- fd_cstr_fini( p );
- fwrite( line, 1UL, len, stdout );
- }
- if( sz ) {
- char line[ 80 ];
- char * p = fd_cstr_init( line );
- p = fd_cstr_append_uint_as_hex( p, '0', i, 7UL );
- p = fd_cstr_append_text( p, ": ", 3UL );
- for( ; i>4 ] );
- p = fd_cstr_append_char( p, bin2hex[ data[ i ]&15 ] );
- p = fd_cstr_append_char( p, ' ' );
- }
- p = fd_cstr_append_char( p, '\n' );
- ulong len = (ulong)( p-line );
- fd_cstr_fini( p );
- fwrite( line, 1UL, len, stdout );
- }
- fflush( stdout );
-}
-
-static void
-client_query( client_t * client,
- char ** arg,
- ulong arg_cnt ) {
- req_info_t * req_info = client->req_info;
- if( FD_UNLIKELY( arg_cnt!=1UL ) ) {
- puts( "ERR(query): invalid query command, usage is \"query \"" );
- return;
- }
- char const * acc_addr_b58 = arg[0];
- fd_vinyl_key_t * acc_key = req_info->key;
- if( FD_UNLIKELY( !fd_base58_decode_32( acc_addr_b58, acc_key->uc ) ) ) {
- puts( "ERR(query): invalid account address" );
- return;
- }
-
- /* Send an acquire request */
-
- req_info->comp->seq = 0UL;
- req_info->val_gaddr[0] = FD_VINYL_VAL_MAX;
- fd_vinyl_rq_send(
- client->rq,
- client->req_id++,
- client->link_id,
- FD_VINYL_REQ_TYPE_ACQUIRE, /* type */
- 0UL, /* flags */
- 1UL,
- /* key_gaddr */ client->req_info_gaddr + offsetof( req_info_t, key ),
- /* val_gaddr_gaddr */ client->req_info_gaddr + offsetof( req_info_t, val_gaddr ),
- /* err_gaddr */ client->req_info_gaddr + offsetof( req_info_t, err ),
- /* comp_gaddr */ client->req_info_gaddr + offsetof( req_info_t, comp )
- );
-
- /* Poll direct completion for acquire (not via CQ) */
-
- fd_vinyl_comp_t * comp = req_info->comp;
- while( FD_VOLATILE_CONST( comp->seq )!=1UL ) FD_SPIN_PAUSE();
- int acquire_err = req_info->err[0];
- if( acquire_err==FD_VINYL_SUCCESS ) {
- fd_account_meta_t const * val = fd_wksp_laddr_fast( client->val_wksp, req_info->val_gaddr[0] );
- void const * data = (void const *)( val+1 );
-
- FD_BASE58_ENCODE_32_BYTES( val->owner, owner_b58 );
- printf(
- "\n"
- "Public Key: %s\n"
- "Balance: %lu.%lu SOL\n"
- "Owner: %s\n"
- "Executable: %s\n"
- "Length: %u (0x%x) bytes\n",
- acc_addr_b58,
- val->lamports / 1000000000UL,
- val->lamports % 1000000000UL,
- owner_b58,
- val->executable ? "true" : "false",
- val->dlen,
- val->dlen
- );
- hexdump( data, val->dlen );
-
- /* Send a release request */
-
- req_info->comp->seq = 0UL;
- req_info->val_gaddr[0] = FD_VINYL_VAL_MAX;
- fd_vinyl_rq_send(
- client->rq,
- client->req_id++,
- client->link_id,
- FD_VINYL_REQ_TYPE_RELEASE, /* type */
- 0UL, /* flags */
- 1UL,
- 0UL,
- /* val_gaddr_gaddr */ client->req_info_gaddr + offsetof( req_info_t, val_gaddr ),
- /* err_gaddr */ client->req_info_gaddr + offsetof( req_info_t, err ),
- /* comp_gaddr */ client->req_info_gaddr + offsetof( req_info_t, comp )
- );
-
- /* Poll direct completion for release (not via CQ) */
-
- while( FD_VOLATILE_CONST( comp->seq )!=1UL ) FD_SPIN_PAUSE();
- FD_TEST( req_info->err[0]==FD_VINYL_SUCCESS );
-
- puts( "" );
- } else if( acquire_err==FD_VINYL_ERR_KEY ) {
- printf(
- "\n"
- "Public Key: %s\n"
- "Account does not exist\n"
- "\n",
- acc_addr_b58
- );
- } else {
- FD_LOG_ERR(( "Vinyl acquire request failed (err %i-%s)", acquire_err, fd_vinyl_strerror( acquire_err ) ));
- }
-}
-
-typedef struct batch_req batch_req_t;
-struct batch_req {
- batch_req_t * prev;
- batch_req_t * next;
-
- ulong key_off;
- ulong err_off;
- ulong val_gaddr_off;
-
- ulong req_id;
-};
-
-static ulong
-batch_req_align( void ) {
- return fd_ulong_max( alignof(batch_req_t), alignof(fd_vinyl_key_t) );
-}
-
-static ulong
-batch_req_footprint( ulong depth ) {
- ulong l = FD_LAYOUT_INIT;
- l = FD_LAYOUT_APPEND( l, alignof(batch_req_t), sizeof(batch_req_t) );
- l = FD_LAYOUT_APPEND( l, alignof(fd_vinyl_key_t), depth*sizeof(fd_vinyl_key_t) );
- l = FD_LAYOUT_APPEND( l, alignof(schar), depth*sizeof(schar) );
- l = FD_LAYOUT_APPEND( l, alignof(ulong), depth*sizeof(ulong) );
- return FD_LAYOUT_FINI( l, batch_req_align() );
-}
-
-static batch_req_t *
-batch_req_new( void * mem,
- ulong depth ) {
- FD_SCRATCH_ALLOC_INIT( l, mem );
- batch_req_t * req = FD_SCRATCH_ALLOC_APPEND( l, alignof(batch_req_t), sizeof(batch_req_t) );
- fd_vinyl_key_t * key = FD_SCRATCH_ALLOC_APPEND( l, alignof(fd_vinyl_key_t), depth*sizeof(fd_vinyl_key_t) );
- schar * err = FD_SCRATCH_ALLOC_APPEND( l, alignof(schar), depth*sizeof(schar) );
- ulong * val_gaddr = FD_SCRATCH_ALLOC_APPEND( l, alignof(ulong), depth*sizeof(ulong) );
- FD_SCRATCH_ALLOC_FINI( l, batch_req_align() );
-
- *req = (batch_req_t) {
- .prev = NULL,
- .next = NULL,
-
- .key_off = (ulong)key - (ulong)mem,
- .err_off = (ulong)err - (ulong)mem,
- .val_gaddr_off = (ulong)val_gaddr - (ulong)mem
- };
- return req;
-}
-
-static inline fd_vinyl_key_t *
-batch_req_key( batch_req_t * req ) {
- return (fd_vinyl_key_t *)( (ulong)req + req->key_off );
-}
-
-static inline schar *
-batch_req_err( batch_req_t * req ) {
- return (schar *)( (ulong)req + req->err_off );
-}
-
-static inline ulong *
-batch_req_val_gaddr( batch_req_t * req ) {
- return (ulong *)( (ulong)req + req->val_gaddr_off );
-}
-
-struct bench_query_rand {
- batch_req_t * req_free; /* free entries */
- batch_req_t * req_wait_lo; /* list of entries awaiting completion */
- batch_req_t * req_wait_hi;
- ulong batch_depth;
-
- ulong iter_rem;
- fd_vinyl_key_t * sample;
- ulong sample_idx;
- ulong sample_max;
-
- ulong found_cnt;
- ulong miss_cnt;
-};
-typedef struct bench_query_rand bench_query_rand_t;
-
-/* bqr_free_push adds a wait queue entry to the free stack. */
-
-static void
-bqr_free_push( bench_query_rand_t * bqr,
- batch_req_t * req ) {
- req->prev = NULL;
- req->next = bqr->req_free;
- if( bqr->req_free ) bqr->req_free->prev = req;
- bqr->req_free = req;
-}
-
-/* bqr_free_pop removes a wait queue entry from the free stack (alloc). */
-
-static batch_req_t *
-bqr_free_pop( bench_query_rand_t * bqr ) {
- batch_req_t * req = bqr->req_free;
- bqr->req_free = req->next;
- if( bqr->req_free ) bqr->req_free->prev = NULL;
- req->prev = req->next = NULL;
- return req;
-}
-
-/* bqr_wait_push adds a new wait queue entry. */
-
-static void
-bqr_wait_push( bench_query_rand_t * bqr,
- batch_req_t * req ) {
- req->prev = bqr->req_wait_hi;
- req->next = NULL;
- if( bqr->req_wait_hi ) bqr->req_wait_hi->next = req;
- bqr->req_wait_hi = req;
- if( !bqr->req_wait_lo ) bqr->req_wait_lo = req;
-}
-
-/* bqr_wait_pop removes the oldest wait queue entry. */
-
-static batch_req_t *
-bqr_wait_pop( bench_query_rand_t * bqr ) {
- batch_req_t * req = bqr->req_wait_lo;
- bqr->req_wait_lo = req->next;
- req->prev = req->next = NULL;
- if( bqr->req_wait_lo ) bqr->req_wait_lo->prev = NULL;
- else bqr->req_wait_hi = NULL;
- return req;
-}
-
-/* bqr_req_release sends a batch RELEASE request for a batch of values.
- Completions arriving for RELEASE will replenish quota. */
-
-static void
-bqr_req_release( client_t * client,
- bench_query_rand_t * bqr,
- batch_req_t * req,
- uint cnt ) {
- FD_CRIT( !req->prev && !req->next, "attempt to release a request that is already free or still pending" );
-
- schar * err = batch_req_err( req );
- for( uint i=0U; ireq_id++, 63 );
- ulong link_id = client->link_id;
- int type = FD_VINYL_REQ_TYPE_RELEASE;
- ulong flags = 0UL;
- ulong batch_cnt = (ulong)cnt;
- ulong val_gaddr_gaddr = fd_wksp_gaddr_fast( client->client_wksp, batch_req_val_gaddr( req ) );
- ulong err_gaddr = fd_wksp_gaddr_fast( client->client_wksp, err );
- fd_vinyl_rq_send( client->rq, req_id, link_id, type, flags, batch_cnt, 0UL, val_gaddr_gaddr, err_gaddr, 0UL );
-
- req->req_id = req_id;
- bqr_wait_push( bqr, req );
-}
-
-/* bqr_handle_cq handles an ACQUIRE or RELEASE completion. */
-
-static void
-bqr_handle_cq( client_t * client,
- bench_query_rand_t * bqr,
- fd_vinyl_comp_t * comp ) {
- FD_CRIT( bqr->req_wait_lo, "received completion even though no request is pending" );
- batch_req_t * req = bqr_wait_pop( bqr );
- FD_CRIT( req->req_id==comp->req_id, "received completion for unexpected req_id" );
- FD_CRIT( comp->batch_cnt<=bqr->batch_depth, "corrupt comp->batch_cnt" );
-
- /* The high bit of the request ID indicates whether this was an
- ACQUIRE or RELEASE request. */
- int const is_release = fd_ulong_extract_bit( comp->req_id, 63 );
-
- fd_vinyl_key_t * key = batch_req_key( req );
- ulong * val_gaddr = batch_req_val_gaddr( req );
- schar * err = batch_req_err( req );
-
- if( !is_release ) {
-
- uint j=0U;
- for( uint i=0U; ibatch_cnt; i++ ) {
- int e = err[ i ];
- if( FD_UNLIKELY( e!=FD_VINYL_SUCCESS && e!=FD_VINYL_ERR_KEY ) ) {
- FD_LOG_CRIT(( "Unexpected vinyl error %i-%s", e, fd_vinyl_strerror( e ) ));
- }
- if( e==FD_VINYL_SUCCESS ) {
- bqr->found_cnt++;
- key [ j ] = key[ i ];
- val_gaddr[ j ] = val_gaddr[ i ];
- j++;
- } else {
- bqr->miss_cnt++;
- client->quota_rem++;
- }
- }
-
- if( j ) bqr_req_release( client, bqr, req, j );
- else bqr_free_push( bqr, req );
-
- } else {
-
- schar * err = batch_req_err( req );
- uint cnt = comp->batch_cnt;
- for( uint i=0U; iquota_rem += comp->batch_cnt;
- bqr_free_push( bqr, req );
-
- }
-
-}
-
-/* bqr_drain_cq drains all completion queue entries. */
-
-static void
-bqr_drain_cq( client_t * client,
- bench_query_rand_t * bqr ) {
- for(;;) {
- fd_vinyl_comp_t comp[1];
- long cq_err = fd_vinyl_cq_recv( client->cq, client->cq_seq, comp );
- if( FD_UNLIKELY( cq_err<0 ) ) {
- FD_LOG_CRIT(( "Vinyl completion queue overrun detected" ));
- }
- if( cq_err>0 ) break;
- bqr_handle_cq( client, bqr, comp );
- client->cq_seq++;
- }
-}
-
-/* bqr_req_acquire sends a batch of ACQUIRE requests. */
-
-static void
-bqr_req_acquire( client_t * client,
- bench_query_rand_t * bqr ) {
- FD_CRIT( bqr->req_free, "attempt to acquire a request when none are free" );
- batch_req_t * req = bqr_free_pop( bqr );
- ulong cnt = bqr->batch_depth;
- if( FD_UNLIKELY( cnt>bqr->iter_rem ) ) cnt = bqr->iter_rem;
-
- /* Prepare request descriptor */
- fd_vinyl_key_t * key = batch_req_key ( req );
- schar * err = batch_req_err ( req );
- ulong * val_gaddr = batch_req_val_gaddr( req );
- for( ulong i=0UL; isample_idx;
- key [ i ] = bqr->sample[ idx ];
- err [ i ] = 0;
- val_gaddr[ i ] = 0UL;
- bqr->sample_idx++;
- if( bqr->sample_idx>=bqr->sample_max ) bqr->sample_idx = 0UL;
- }
-
- /* Send request */
- ulong req_id = fd_ulong_clear_bit( client->req_id++, 63 );
- ulong link_id = client->link_id;
- int type = FD_VINYL_REQ_TYPE_ACQUIRE;
- ulong flags = 0UL;
- ulong key_gaddr = fd_wksp_gaddr_fast( client->client_wksp, batch_req_key ( req ) );
- ulong val_gaddr_gaddr = fd_wksp_gaddr_fast( client->client_wksp, batch_req_val_gaddr( req ) );
- ulong err_gaddr = fd_wksp_gaddr_fast( client->client_wksp, batch_req_err ( req ) );
- fd_vinyl_rq_send( client->rq, req_id, link_id, type, flags, cnt, key_gaddr, val_gaddr_gaddr, err_gaddr, 0UL );
-
- /* Update quotas */
- bqr->iter_rem -= cnt;
- client->quota_rem -= cnt;
-
- req->req_id = req_id;
- bqr_wait_push( bqr, req );
-}
-
-/* bench_query_rand_poll sends as many random read requests to vinyl as
- possible. Returns 1 if there is more work to do, 0 if the benchmark
- is done. */
-
-static int
-bench_query_rand_poll( client_t * client,
- bench_query_rand_t * bqr ) {
- if( bqr->req_wait_lo ) {
- bqr_drain_cq( client, bqr );
- }
- while( bqr->req_free && bqr->iter_rem ) {
- bqr_req_acquire( client, bqr );
- }
- return (!!bqr->req_wait_lo) | (!!bqr->iter_rem);
-}
-
-/* client_bench_query_rand runs a random read benchmark against vinyl.
- Assumes that RQ and CQ are clean and quota_rem==quota_max. */
-
-static void
-client_bench_query_rand( client_t * client,
- int * pargc,
- char *** pargv ) {
-
- /* Prepare a random query benchmark
-
- 1. Randomly sample keys into an array (--keys)
- 2. Inject random keys at a configurable rate (--miss) to exercise
- index query misses
- 3. Loop through the sampled keys array until (--iter) queries have
- been submitted, while doing batches of (--batch) keys at a time
-
- The benchmark loop is pipelined/asynchronous. The client will
- submit request batches until it is blocked by quota, RQ, or CQ. */
-
- ulong batch_depth = fd_env_strip_cmdline_ulong( pargc, pargv, "--batch", NULL, 1UL );
- ulong key_cnt = fd_env_strip_cmdline_ulong( pargc, pargv, "--keys", NULL, 262144UL );
- ulong const iter_cnt = fd_env_strip_cmdline_ulong( pargc, pargv, "--iter", NULL, 1048576UL );
- ulong const seed = fd_env_strip_cmdline_ulong( pargc, pargv, "--seed", NULL, (ulong)fd_tickcount() );
- float const miss_r = fd_env_strip_cmdline_float( pargc, pargv, "--miss", NULL, 0.1f );
-
- batch_depth = fd_ulong_max( batch_depth, 1UL );
- key_cnt = fd_ulong_min( key_cnt, UINT_MAX );
-
- fd_rng_t _rng[1]; fd_rng_t * rng = fd_rng_join( fd_rng_new( _rng, (uint)fd_ulong_hash( seed ), 0UL ) );
-
- fd_vinyl_meta_t * meta = client->meta;
- ulong const ele_max = fd_vinyl_meta_ele_max ( meta );
- ulong const probe_max = fd_vinyl_meta_probe_max( meta );
-
- /* Allocate a huge page backed scratch memory region to back keys */
-
- ulong sample_fp = fd_ulong_align_up( key_cnt*sizeof(fd_vinyl_key_t), FD_SHMEM_HUGE_PAGE_SZ );
- ulong sample_page_sz = FD_SHMEM_NORMAL_PAGE_SZ;
- ulong sample_page_cnt = sample_fp>>FD_SHMEM_NORMAL_LG_PAGE_SZ;
- fd_vinyl_key_t * sample = fd_shmem_acquire( sample_page_sz, sample_page_cnt, fd_log_cpu_id() );
- ulong sample_cnt = 0UL;
- if( FD_UNLIKELY( !sample ) ) {
- FD_LOG_WARNING(( "Cannot acquire scratch memory to hold %lu vinyl keys (out of memory). Aborting benchmark", key_cnt ));
- return;
- }
-
- /* Determine pipeline depth */
-
- ulong const rq_ele_depth = fd_vinyl_rq_req_cnt ( client->rq )*batch_depth;
- ulong const cq_ele_depth = fd_vinyl_cq_comp_cnt( client->cq )*batch_depth;
- ulong const quota_max = fd_ulong_min( client->quota_rem, fd_ulong_min( rq_ele_depth, cq_ele_depth ) );
- ulong const batch_max = ( quota_max + batch_depth - 1UL ) / batch_depth;
-
- /* Allocate request queue entries */
-
- ulong req_footprint = batch_req_footprint( batch_depth );
- ulong req_batch_footprint = batch_max*req_footprint;
- ulong req_laddr = (ulong)fd_wksp_alloc_laddr( client->client_wksp, batch_req_align(), req_batch_footprint, 1UL );
- if( FD_UNLIKELY( !req_laddr ) ) {
- FD_LOG_WARNING(( "Vinyl client wksp is too small to hold requests. Aborting benchmark" ));
- fd_shmem_release( sample, sample_page_sz, sample_page_cnt );
- return;
- }
- for( ulong batch_idx=0UL,
- batch_cur=req_laddr;
- batch_idxprev = batch_idx>0UL ? (batch_req_t *)( batch_cur - req_footprint ) : NULL;
- req->next = batch_idx+1ULele + meta_idx;
- if( FD_LIKELY( fd_vinyl_meta_ele_in_use( ele ) ) ) {
- sample[ i ] = ele->phdr.key;
- sample_cnt++;
- break;
- }
- meta_idx = (meta_idx+1UL) % ele_max;
- }
- if( !probe_rem ) { /* no key found (low hashmap utilization) ... */
- for( uint j=0U; j<32U; j+=4U ) FD_STORE( uint, sample[ i ].uc+j, fd_rng_uint( rng ) );
- }
-
- }
- dt += fd_log_wallclock();
-
-# if FD_HAS_DOUBLE
- FD_LOG_NOTICE(( "Sampled %lu keys in %gs (miss ratio %g)",
- key_cnt, (double)dt/1e9, (double)( key_cnt-sample_cnt )/(double)key_cnt ));
-# else
- FD_LOG_NOTICE(( "Sampled %lu keys in %ldns (%lu missed)",
- key_cnt, dt, key_cnt-sample_cnt ));
-# endif
-
- /* Run benchmark */
-
- bench_query_rand_t bqr = {
- .req_free = req_free,
- .req_wait_lo = NULL,
- .req_wait_hi = NULL,
- .batch_depth = batch_depth,
- .iter_rem = iter_cnt,
- .sample = sample,
- .sample_idx = 0UL,
- .sample_max = key_cnt
- };
- dt = -fd_log_wallclock();
- while( bench_query_rand_poll( client, &bqr ) );
- dt += fd_log_wallclock();
-
-# if FD_HAS_DOUBLE
- FD_LOG_NOTICE(( "Completed %lu queries (%lu found, %lu missed) in %gs (%g q/s)",
- iter_cnt, bqr.found_cnt, bqr.miss_cnt,
- (double)dt/1e9,
- (double)iter_cnt / ( (double)dt/1e9 ) ));
-# else
- FD_LOG_NOTICE(( "Completed %lu queries (%lu found, %lu missed) in %ldns",
- iter_cnt, bqr.found_cnt, bqr.miss_cnt, dt ));
-# endif
-
- /* Clean up */
-
- fd_rng_delete( fd_rng_leave( rng ) );
-
- fd_wksp_free_laddr( (void *)req_laddr );
-
- fd_shmem_release( sample, sample_page_sz, sample_page_cnt );
-}
-
-static int
-client_cmd( client_t * client,
- char ** tok,
- ulong tok_cnt ) {
- if( FD_UNLIKELY( !tok_cnt ) ) return 1;
- char const * cmd = tok[0];
- if( !strcmp( cmd, "query" ) ) {
- client_query( client, tok+1, tok_cnt-1 );
- } else if( !strcmp( cmd, "bench-query-rand" ) ) {
- int argc = (int)tok_cnt;
- client_bench_query_rand( client, &argc, &tok );
- } else if( !strcmp( cmd, "quit" ) || !strcmp( cmd, "exit" ) ) {
- return 0;
- } else {
- printf( "ERR: unknown command `%s`\n", cmd );
- }
- return 1;
-}
-
-static void
-repl( client_t * client ) {
- char line[ 4096 ] = {0};
-# define TOK_MAX 16
- char * tokens[ 16 ] = {0};
- for(;;) {
- fputs( "accdb> ", stdout );
- fflush( stdout );
-
- /* Read command */
- if( fgets( line, sizeof(line), stdin )==NULL ) {
- putc( '\n', stdout );
- break;
- }
- line[ strcspn( line, "\n" ) ] = '\0';
- line[ sizeof(line)-1 ] = '\0';
-
- /* Interpret command */
- ulong tok_cnt = fd_cstr_tokenize( tokens, TOK_MAX, line, ' ' );
- if( !client_cmd( client, tokens, tok_cnt ) ) break;
- }
-# undef TOK_MAX
-}
-
-int
-main( int argc,
- char ** argv ) {
- fd_boot( &argc, &argv );
-
- char const * cfg_gaddr = fd_env_strip_cmdline_cstr ( &argc, &argv, "--cfg", NULL, NULL );
- char const * wksp_name = fd_env_strip_cmdline_cstr ( &argc, &argv, "--wksp", NULL, NULL );
- ulong const burst_max = fd_env_strip_cmdline_ulong( &argc, &argv, "--burst-max", NULL, 1UL );
- ulong const quota_max = fd_env_strip_cmdline_ulong( &argc, &argv, "--quota-max", NULL, 2UL );
- if( FD_UNLIKELY( !cfg_gaddr ) ) FD_LOG_ERR(( "Missing required argument --cfg" ));
- if( FD_UNLIKELY( !wksp_name ) ) FD_LOG_ERR(( "Missing required argument --wksp" ));
-
- argc--; argv++;
-
- /* Join server shared memory structures */
-
- uchar * pod = fd_pod_join( fd_wksp_map( cfg_gaddr ) );
- if( FD_UNLIKELY( !pod ) ) FD_LOG_ERR(( "Invalid --cfg pod" ));
-
- void * _cnc = fd_wksp_pod_map( pod, "cnc" );
- void * _meta = fd_wksp_pod_map( pod, "meta" );
- void * _ele = fd_wksp_pod_map( pod, "ele" );
- void * _obj = fd_wksp_pod_map( pod, "obj" );
-
- fd_cnc_t * cnc = fd_cnc_join( _cnc ); FD_TEST( cnc );
- fd_vinyl_meta_t meta[1];
- FD_TEST( fd_vinyl_meta_join( meta, _meta, _ele ) );
-
- ulong vinyl_status = fd_cnc_signal_query( cnc );
- if( FD_UNLIKELY( vinyl_status!=FD_CNC_SIGNAL_RUN ) ) {
- char status_cstr[ FD_CNC_SIGNAL_CSTR_BUF_MAX ];
- FD_LOG_ERR(( "Vinyl tile not running (status %lu-%s)", vinyl_status, fd_cnc_signal_cstr( vinyl_status, status_cstr ) ));
- }
-
- /* Allocate client structures */
-
- fd_wksp_t * wksp = fd_wksp_attach( wksp_name );
- FD_TEST( wksp );
-
- ulong const rq_max = 32UL;
- ulong const cq_max = 32UL;
- void * _rq = fd_wksp_alloc_laddr( wksp, fd_vinyl_rq_align(), fd_vinyl_rq_footprint( rq_max ), 1UL );
- void * _cq = fd_wksp_alloc_laddr( wksp, fd_vinyl_cq_align(), fd_vinyl_cq_footprint( cq_max ), 1UL );
- fd_vinyl_rq_t * rq = fd_vinyl_rq_join( fd_vinyl_rq_new( _rq, rq_max ) );
- fd_vinyl_cq_t * cq = fd_vinyl_cq_join( fd_vinyl_cq_new( _cq, cq_max ) );
- if( FD_UNLIKELY( !rq || !cq ) ) {
- FD_LOG_WARNING(( "Failed to allocate request/completion queues" ));
- goto dealloc2;
- }
-
- ulong req_info_gaddr = fd_wksp_alloc( wksp, alignof(req_info_t), sizeof(req_info_t), 1UL );
- if( FD_UNLIKELY( !req_info_gaddr ) ) {
- FD_LOG_WARNING(( "Failed to pre-allocate request metadata" ));
- goto dealloc1;
- }
- req_info_t * req_info = fd_wksp_laddr_fast( wksp, req_info_gaddr );
-
- /* Run client */
-
- ulong const link_id = 0UL;
- int join_err = fd_vinyl_client_join( cnc, rq, cq, wksp, link_id, burst_max, quota_max );
- if( FD_UNLIKELY( join_err ) ) FD_LOG_ERR(( "Failed to join vinyl client to server (err %i-%s)", join_err, fd_vinyl_strerror( join_err ) ));
-
- FD_LOG_NOTICE(( "Attached client" ));
-
- client_t client = {
- .rq = rq,
- .cq = cq,
- .req_id = 0UL,
- .link_id = link_id,
-
- .meta = meta,
-
- .req_info = req_info,
- .req_info_gaddr = req_info_gaddr,
- .val_wksp = fd_wksp_containing( _obj ),
- .client_wksp = wksp,
-
- .quota_rem = quota_max,
- .cq_seq = fd_vinyl_cq_seq( cq )
- };
-
- if( argc>0 ) {
- client_cmd( &client, argv, (ulong)argc );
- } else {
- repl( &client );
- }
-
- FD_LOG_NOTICE(( "Detaching client" ));
-
- int leave_err = fd_vinyl_client_leave( cnc, link_id );
- if( FD_UNLIKELY( leave_err ) ) FD_LOG_ERR(( "Failed to leave vinyl client from server (err %i-%s)", leave_err, fd_vinyl_strerror( leave_err ) ));
-
-dealloc1:
- fd_wksp_free( wksp, req_info_gaddr );
-
-dealloc2:
- fd_wksp_free_laddr( fd_vinyl_rq_delete( fd_vinyl_rq_leave( rq ) ) );
- fd_wksp_free_laddr( fd_vinyl_cq_delete( fd_vinyl_cq_leave( cq ) ) );
-
- fd_wksp_unmap( fd_cnc_leave( cnc ) );
- fd_vinyl_meta_leave( meta );
- fd_wksp_unmap( _meta );
- fd_wksp_unmap( _ele );
- fd_wksp_unmap( _obj );
- fd_wksp_detach( wksp );
-
- fd_halt();
- return 0;
-}
diff --git a/src/flamenco/accdb/fd_accdb_impl_v2.c b/src/flamenco/accdb/fd_accdb_impl_v2.c
index 3c49d56f8de..daa8e1f42cf 100644
--- a/src/flamenco/accdb/fd_accdb_impl_v2.c
+++ b/src/flamenco/accdb/fd_accdb_impl_v2.c
@@ -1,6 +1,9 @@
#include "fd_accdb_impl_v2.h"
#include "fd_accdb_funk.h"
+#include "fd_accdb_specread.h"
#include "fd_vinyl_req_pool.h"
+#include "../../vinyl/data/fd_vinyl_data.h"
+#include "../../ballet/base58/fd_base58.h"
#include
FD_STATIC_ASSERT( alignof(fd_accdb_user_v2_t)<=alignof(fd_accdb_user_t), layout );
@@ -244,11 +247,11 @@ funk_open_ref( fd_accdb_user_v2_t * accdb,
/* Traverse chain for candidate */
fd_funk_rec_t * rec = NULL;
int err;
- for(;;) {
+ for( ulong backoff=1UL; ; ) {
err = funk_rec_acquire( accdb, chain_idx, key, &rec, is_write );
if( FD_LIKELY( err!=ACQUIRE_FAILED ) ) break;
- FD_SPIN_PAUSE();
- /* FIXME backoff */
+ for( ulong i=0UL; iaddress, address, 32UL );
@@ -292,17 +295,84 @@ fd_accdb_user_v2_open_ro_multi( fd_accdb_user_t * accdb,
fd_accdb_lineage_set_fork( v2->lineage, v2->funk, xid );
ulong addr_laddr = (ulong)addr0;
+ long t0 = fd_tickcount();
for( ulong i=0UL; ibase.ro_active++;
+ v2->base.lookup_funk++;
} else {
fd_accdb_ro_init_empty( &ro0[i], addr_i );
}
}
+ v2->base.dt_funk += fd_tickcount() - t0;
+
+ /* Speculative cache reads — attempt pin-based direct reads for
+ accounts not found in funk. On success, the caller gets a
+ zero-copy pointer into the vinyl data cache.
+
+ Holding specread pins across the ACQUIRE spin-wait below can
+ deadlock (root invalidation spin-drains pins while blocking
+ request processing). Therefore specread is only useful when it
+ can resolve ALL remaining accounts, avoiding the ACQUIRE
+ entirely. If any specread misses, we unpin everything and fall
+ through to ACQUIRE for the whole batch. */
+
+ t0 = fd_tickcount();
+ if( v2->vinyl_line_cnt ) {
+
+ /* Pin pass — attempt specread for every non-funk account.
+ Track how many we still need so we can detect partial
+ coverage without a second scan. */
+
+ ulong spec_need = 0UL;
+ ulong spec_hit = 0UL;
+ for( ulong i=0UL; iaccdb_type!=FD_ACCDB_TYPE_NONE ) continue;
+ spec_need++;
+ void const * addr_i = (void const *)( (ulong)addr_laddr + i*32UL );
+ fd_vinyl_key_t vkey[1];
+ fd_vinyl_key_init( vkey, addr_i, 32UL );
+ fd_account_meta_t const * meta;
+ ulong spec_line_idx;
+ int specerr = fd_accdb_specread_pin( v2->vinyl_meta,
+ v2->vinyl_line, v2->vinyl_line_cnt, v2->vinyl_specrd_wksp,
+ vkey, &meta, &spec_line_idx );
+ if( specerr==FD_VINYL_SUCCESS ) {
+ spec_hit++;
+ ro0[i] = (fd_accdb_ro_t){0};
+ memcpy( ro0[i].ref->address, addr_i, 32UL );
+ ro0[i].ref->accdb_type = FD_ACCDB_TYPE_V2S;
+ ro0[i].ref->ref_type = FD_ACCDB_REF_RO;
+ ro0[i].ref->user_data = spec_line_idx;
+ ro0[i].meta = meta;
+ } else if( specerr==FD_VINYL_ERR_KEY ) {
+ fd_accdb_ro_init_empty( &ro0[i], addr_i );
+ ro0[i].ref->user_data2 = 1;
+ }
+ }
- /* For the accounts that were not found in funk, open vinyl records */
-
+ // if( spec_hitaccdb_type!=FD_ACCDB_TYPE_V2S ) continue;
+ // fd_accdb_specread_unpin( v2->vinyl_line, ro0[i].ref->user_data );
+ // ro0[i].ref->accdb_type = FD_ACCDB_TYPE_NONE;
+ // ro0[i].ref->user_data = 0UL;
+ // ro0[i].meta = NULL;
+ // }
+ // } else {
+ // /* Full coverage — commit all specread results */
+ v2->base.ro_active += spec_hit;
+ v2->base.lookup_specrd += spec_hit;
+ // }
+ }
+ v2->base.dt_specrd += fd_tickcount() - t0;
+
+ /* For the accounts that were not found in funk or specread,
+ open vinyl records via rq/cq */
+
+ t0 = fd_tickcount();
ulong batch_idx = fd_vinyl_req_pool_acquire( req_pool );
/* req_pool_release called before returning */
fd_vinyl_comp_t * comp = fd_vinyl_req_batch_comp ( req_pool, batch_idx );
@@ -314,10 +384,12 @@ fd_accdb_user_v2_open_ro_multi( fd_accdb_user_t * accdb,
ulong req_cnt = 0UL;
for( ulong i=0UL; iaccdb_type!=FD_ACCDB_TYPE_NONE ) continue;
- /* At this point, addr0[i] not found in funk, load from vinyl */
+ if( ro0[i].ref->accdb_type!=FD_ACCDB_TYPE_NONE ||
+ ro0[i].ref->user_data2!=0 ) {
+ continue;
+ }
void const * addr_i = (void const *)( (ulong)addr0 + i*32UL );
-
+ FD_BASE58_ENCODE_32_BYTES( addr_i, addr_b58 );
fd_vinyl_key_init( req_key0+req_cnt, addr_i, 32UL );
req_err0 [ req_cnt ] = 0;
req_val_gaddr0[ req_cnt ] = 0UL;
@@ -326,9 +398,12 @@ fd_accdb_user_v2_open_ro_multi( fd_accdb_user_t * accdb,
if( !req_cnt ) {
/* All records were found in funk, bail early */
fd_vinyl_req_pool_release( req_pool, batch_idx );
+ v2->base.dt_vinyl += fd_tickcount() - t0;
return;
}
+ v2->base.lookup_accdb += req_cnt;
+
/* Send read-only "ACQUIRE" batch to vinyl and wait for response */
ulong req_id = v2->vinyl_req_id++;
@@ -347,7 +422,10 @@ fd_accdb_user_v2_open_ro_multi( fd_accdb_user_t * accdb,
req_cnt = 0UL;
for( ulong i=0UL; iaccdb_type!=FD_ACCDB_TYPE_NONE ) continue;
+ if( ro0[i].ref->accdb_type!=FD_ACCDB_TYPE_NONE ||
+ ro0[i].ref->user_data2!=0 ) {
+ continue;
+ }
void const * addr_i = (void const *)( (ulong)addr0 + i*32UL );
int req_err = FD_VOLATILE_CONST( req_err0 [ req_cnt ] );
@@ -365,12 +443,15 @@ fd_accdb_user_v2_open_ro_multi( fd_accdb_user_t * accdb,
ro->ref->ref_type = FD_ACCDB_REF_RO;
ro->meta = meta;
} else if( FD_UNLIKELY( req_err!=FD_VINYL_ERR_KEY ) ) {
- FD_LOG_CRIT(( "vinyl tile ACQUIRE request failed: %i-%s", req_err, fd_vinyl_strerror( req_err ) ));
+ FD_LOG_CRIT(( "vinyl tile ACQUIRE request failed: %i-%s (idx=%lu cnt=%lu)",
+ req_err, fd_vinyl_strerror( req_err ),
+ i, cnt ));
}
req_cnt++;
}
fd_vinyl_req_pool_release( req_pool, batch_idx );
+ v2->base.dt_vinyl += fd_tickcount() - t0;
/* At this point, ownership of vinyl records transitions to caller.
(Released using close_ro_multi) */
@@ -416,14 +497,94 @@ fd_accdb_user_v2_open_rw_multi( fd_accdb_user_t * accdb,
finishes) */
ulong addr_laddr = (ulong)addr0;
+ long t0 = fd_tickcount();
for( ulong i=0UL; ibase.dt_funk += fd_tickcount() - t0;
+
+ /* Speculative cache reads — for accounts not found in funk, attempt
+ pin-based direct reads from the vinyl cache. On success, copy the
+ account into a new writable funk record and unpin immediately.
+ The pin is short-lived (held only for the memcpy) so there is no
+ deadlock risk with root invalidation. On miss/contention, fall
+ through to the rq/cq ACQUIRE path below. */
+
+ t0 = fd_tickcount();
+ if( v2->vinyl_line_cnt ) {
+ for( ulong i=0UL; iref_type!=FD_ACCDB_REF_INVAL ) continue;
+ void const * addr_i = (void const *)( (ulong)addr_laddr + i*32UL );
+ fd_vinyl_key_t vkey[1];
+ fd_vinyl_key_init( vkey, addr_i, 32UL );
+ fd_account_meta_t const * src_meta;
+ ulong spec_line_idx;
+ int specerr = fd_accdb_specread_pin( v2->vinyl_meta,
+ v2->vinyl_line, v2->vinyl_line_cnt, v2->vinyl_specrd_wksp,
+ vkey, &src_meta, &spec_line_idx );
+ if( specerr==FD_VINYL_ERR_KEY ) goto tombstone;
+ if( specerr!=FD_VINYL_SUCCESS ) continue; /* fall through to ACQUIRE */
+
+ uchar const * src_data = (uchar *)( src_meta+1 );
+
+ if( FD_UNLIKELY( src_meta->lamports==0UL ) ) {
+ /* Tombstone — unpin and handle via create-or-skip */
+ fd_accdb_specread_unpin( v2->vinyl_line, spec_line_idx );
+tombstone:
+ if( flag_create ) {
+ fd_accdb_funk_create( v2->funk, &rw0[i], txn, addr_i, data_max0[i] );
+ fd_funk_rec_write_lock_uncontended( v2->funk, (fd_funk_rec_t *)rw0[i].ref->user_data );
+ accdb->base.rw_active++;
+ } else {
+ memset( &rw0[i], 0, sizeof(fd_accdb_ref_t) );
+ /* Mark as handled so it doesn't leak into ACQUIRE batch
+ (ref_type!=INVAL skips batch builder) and promotion loop
+ (user_data2!=0 skips RW branch). accdb_type remains
+ NONE so the caller sees a not-found result. */
+ rw0[i].ref->ref_type = FD_ACCDB_REF_RW;
+ rw0[i].ref->user_data2 = ULONG_MAX;
+ }
+ continue;
+ }
+
+ ulong acc_orig_sz = src_meta->dlen;
+ ulong val_sz_min = sizeof(fd_account_meta_t)+fd_ulong_max( data_max0[i], acc_orig_sz );
+ ulong acc_sz = flag_truncate ? 0UL : acc_orig_sz;
+ ulong val_sz = sizeof(fd_account_meta_t)+acc_sz;
+ ulong val_max = 0UL;
+ void * val = fd_alloc_malloc_at_least( funk->alloc, 16UL, val_sz_min, &val_max );
+ if( FD_UNLIKELY( !val ) ) {
+ FD_LOG_CRIT(( "Failed to modify account: out of memory allocating %lu bytes", acc_orig_sz ));
+ }
+
+ fd_account_meta_t * dst_meta = val;
+ uchar * dst_data = (uchar *)( dst_meta+1 );
+ ulong data_max_actual = val_max - sizeof(fd_account_meta_t);
+ if( flag_truncate ) fd_accdb_funk_copy_truncated( dst_meta, src_meta );
+ else fd_accdb_funk_copy_account ( dst_meta, dst_data, src_meta, src_data );
+
+ /* Unpin immediately — data has been copied to funk */
+ fd_accdb_specread_unpin( v2->vinyl_line, spec_line_idx );
+
+ if( acc_orig_szuser_data );
+ accdb->base.rw_active++;
+ accdb->base.created_cnt++;
+ }
+ }
+ v2->base.dt_specrd += fd_tickcount() - t0;
- /* For the accounts that were not found in funk, create writable funk
- records from elements in vinyl. */
+ /* For the accounts that were not found in funk or specread, create
+ writable funk records from elements in vinyl. */
+ t0 = fd_tickcount();
ulong batch_idx = fd_vinyl_req_pool_acquire( req_pool );
/* req_pool_release called before returning */
fd_vinyl_comp_t * comp = fd_vinyl_req_batch_comp ( req_pool, batch_idx );
@@ -459,9 +620,11 @@ fd_accdb_user_v2_open_rw_multi( fd_accdb_user_t * accdb,
FD_LOG_CRIT(( "vinyl tile rejected my ACQUIRE request: %i-%s", comp_err, fd_vinyl_strerror( comp_err ) ));
}
}
+ v2->base.dt_vinyl += fd_tickcount() - t0;
/* Promote any found accounts to writable accounts */
+ ulong vinyl_cnt = req_cnt;
req_cnt = 0UL;
for( ulong i=0UL; iref->ref_type==FD_ACCDB_REF_RW ) {
+
+ /* Entries already created by specread have user_data2 set to
+ the txn pointer (by fd_accdb_funk_prep_create). Funk
+ write-locked entries from funk_open_ref have user_data2==0. */
+ if( rw->ref->user_data2 ) continue;
+
/* Mutable record found, modify in-place */
if( FD_UNLIKELY( !flag_create && fd_accdb_ref_lamports( rw->ro )==0UL ) ) {
@@ -553,7 +722,6 @@ fd_accdb_user_v2_open_rw_multi( fd_accdb_user_t * accdb,
} else {
memset( rw, 0, sizeof(fd_accdb_ref_t) );
}
- req_cnt++;
continue;
}
@@ -595,22 +763,16 @@ fd_accdb_user_v2_open_rw_multi( fd_accdb_user_t * accdb,
accdb->base.created_cnt++;
}
- /* Send "RELEASE" batch (reuse val_gaddr values),
- and wait for response */
-
- if( req_cnt ) {
- ulong req_id = v2->vinyl_req_id++;
- memset( fd_vinyl_req_batch_comp( req_pool, batch_idx ), 0, sizeof(fd_vinyl_comp_t) );
- fd_vinyl_req_send_batch( rq, req_pool, req_wksp, req_id, link_id, FD_VINYL_REQ_TYPE_RELEASE, 0UL, batch_idx, req_cnt );
-
- while( FD_VOLATILE_CONST( comp->seq )!=1UL ) FD_SPIN_PAUSE();
- FD_COMPILER_MFENCE();
- int comp_err = FD_VOLATILE_CONST( comp->err );
- if( FD_UNLIKELY( comp_err!=FD_VINYL_SUCCESS ) ) {
- FD_LOG_CRIT(( "vinyl tile rejected my RELEASE request: %i-%s", comp_err, fd_vinyl_strerror( comp_err ) ));
- }
+ /* Release vinyl records: decrement ref count directly in shared
+ memory. The data was copied to funk so we no longer need the
+ vinyl cache entries. */
+ for( ulong i=0UL; ivinyl_line[ obj->line_idx ].ctl, 1UL );
}
-
fd_vinyl_req_pool_release( req_pool, batch_idx );
}
@@ -641,56 +803,28 @@ void
fd_accdb_user_v2_close_ref_multi( fd_accdb_user_t * accdb,
fd_accdb_ref_t * ref0,
ulong cnt ) {
- fd_accdb_user_v2_t * v2 = (fd_accdb_user_v2_t *)accdb;
- fd_vinyl_rq_t * rq = v2->vinyl_rq; /* "request queue "*/
- fd_vinyl_req_pool_t * req_pool = v2->vinyl_req_pool; /* "request pool" */
- fd_wksp_t * req_wksp = v2->vinyl_req_wksp; /* shm workspace containing request buffer */
- fd_wksp_t * data_wksp = v2->vinyl_data_wksp; /* shm workspace containing vinyl data cache */
- ulong link_id = v2->vinyl_link_id; /* vinyl client ID */
+ fd_accdb_user_v2_t * v2 = (fd_accdb_user_v2_t *)accdb;
- if( FD_UNLIKELY( cnt>fd_vinyl_req_batch_key_max( req_pool ) ) ) {
- FD_LOG_CRIT(( "close_ref_multi cnt %lu exceeds vinyl request batch max %lu",
- cnt, fd_vinyl_req_batch_key_max( req_pool ) ));
+ /* Release specread pins (V2S) */
+ for( ulong i=0UL; iaccdb_type!=FD_ACCDB_TYPE_V2S ) continue;
+ fd_accdb_specread_unpin( v2->vinyl_line, ref->user_data );
+ accdb->base.ro_active--;
+ memset( ref, 0, sizeof(fd_accdb_ref_t) );
}
- /* First, release all references to vinyl records
- (This is a prefetch friendly / fast loop) */
-
- ulong batch_idx = fd_vinyl_req_pool_acquire( req_pool );
- /* req_pool_release called before returning */
- fd_vinyl_comp_t * comp = fd_vinyl_req_batch_comp ( req_pool, batch_idx );
- schar * req_err0 = fd_vinyl_req_batch_err ( req_pool, batch_idx );
- ulong * req_val_gaddr0 = fd_vinyl_req_batch_val_gaddr( req_pool, batch_idx );
-
- ulong ro_close_cnt = 0UL;
- ulong rw_close_cnt = 0UL;
- ulong req_cnt = 0UL;
+ /* Release vinyl records acquired via rq/cq: decrement ref count directly */
for( ulong i=0UL; iaccdb_type!=FD_ACCDB_TYPE_V2 ) continue;
- ref->ref_type==FD_ACCDB_REF_RO ? ro_close_cnt++ : rw_close_cnt++;
- req_err0 [ req_cnt ] = 0;
- req_val_gaddr0[ req_cnt ] = fd_wksp_gaddr_fast( data_wksp, (void *)ref->meta_laddr );
+ fd_vinyl_data_obj_t * obj = fd_vinyl_data_obj( (void *)ref->meta_laddr );
+ FD_ATOMIC_FETCH_AND_SUB( &v2->vinyl_line[ obj->line_idx ].ctl, 1UL );
+ accdb->base.ro_active--;
memset( ref, 0, sizeof(fd_accdb_ref_t) );
- req_cnt++;
- }
- if( req_cnt ) {
- if( FD_UNLIKELY( ro_close_cnt > accdb->base.ro_active ) ) {
- FD_LOG_CRIT(( "attempted to close more accdb_ro (%lu) than are open (%lu)",
- ro_close_cnt, accdb->base.ro_active ));
- }
- if( FD_UNLIKELY( rw_close_cnt > accdb->base.rw_active ) ) {
- FD_LOG_CRIT(( "attempted to close more accdb_rw (%lu) than are open (%lu)",
- rw_close_cnt, accdb->base.rw_active ));
- }
- ulong req_id = v2->vinyl_req_id++;
- memset( fd_vinyl_req_batch_comp( req_pool, batch_idx ), 0, sizeof(fd_vinyl_comp_t) );
- fd_vinyl_req_send_batch( rq, req_pool, req_wksp, req_id, link_id, FD_VINYL_REQ_TYPE_RELEASE, 0UL, batch_idx, req_cnt );
}
- /* While our vinyl request is inflight, release funk records
- (This does expensive DRAM accesses, which are convenient to do when
- we are waiting for the database to asynchronously respond) */
+ /* Release funk records */
for( ulong i=0UL; iseq )!=1UL ) FD_SPIN_PAUSE();
- FD_COMPILER_MFENCE();
- int comp_err = FD_VOLATILE_CONST( comp->err );
- if( FD_UNLIKELY( comp_err!=FD_VINYL_SUCCESS ) ) {
- FD_LOG_CRIT(( "vinyl tile rejected my RELEASE request: %i-%s", comp_err, fd_vinyl_strerror( comp_err ) ));
- }
- for( ulong i=0UL; ibase.ro_active -= ro_close_cnt;
- accdb->base.rw_active -= rw_close_cnt;
- }
-
- fd_vinyl_req_pool_release( req_pool, batch_idx );
}
ulong
@@ -780,6 +892,7 @@ fd_accdb_user_v2_init( fd_accdb_user_t * accdb_,
void * vinyl_rq,
void * vinyl_data,
void * vinyl_req_pool,
+ void * vinyl_line,
ulong vinyl_link_id,
ulong max_depth ) {
if( FD_UNLIKELY( !accdb_ ) ) {
@@ -817,11 +930,33 @@ fd_accdb_user_v2_init( fd_accdb_user_t * accdb_,
accdb->vinyl_data_wksp = vinyl_data;
accdb->vinyl_req_wksp = fd_wksp_containing( req_pool );
accdb->vinyl_req_pool = req_pool;
+ accdb->vinyl_line = vinyl_line;
accdb->base.accdb_type = FD_ACCDB_TYPE_V2;
accdb->base.vt = &fd_accdb_user_v2_vt;
return accdb_;
}
+void
+fd_accdb_user_v2_init_cache( fd_accdb_user_t * accdb_,
+ void * vinyl_shmeta,
+ void * vinyl_shele,
+ void * vinyl_shline,
+ ulong vinyl_line_cnt ) {
+ fd_accdb_user_v2_t * v2 = fd_type_pun( accdb_ );
+ /* Specread is enabled only when every shared region and a non-zero line count are supplied */
+ if( FD_UNLIKELY( !vinyl_shmeta || !vinyl_shele || !vinyl_shline || !vinyl_line_cnt ) ) {
+ /* Specread disabled: vinyl_line and vinyl_meta are not written on this path */
+ v2->vinyl_line_cnt = 0UL;
+ v2->vinyl_specrd_wksp = NULL;
+ return;
+ }
+
+ FD_TEST( fd_vinyl_meta_join( v2->vinyl_meta, vinyl_shmeta, vinyl_shele ) ); /* join of the shared meta map must succeed */
+ v2->vinyl_line = (fd_vinyl_line_t *)vinyl_shline; /* replaces the line array pointer stored by fd_accdb_user_v2_init */
+ v2->vinyl_line_cnt = vinyl_line_cnt;
+ v2->vinyl_specrd_wksp = v2->vinyl_data_wksp; /* same workspace as the rq/cq data cache */
+}
+
void
fd_accdb_user_v2_fini( fd_accdb_user_t * accdb ) {
fd_accdb_user_v2_t * user = (fd_accdb_user_v2_t *)accdb;
diff --git a/src/flamenco/accdb/fd_accdb_impl_v2.h b/src/flamenco/accdb/fd_accdb_impl_v2.h
index 60286d67073..fc02721901c 100644
--- a/src/flamenco/accdb/fd_accdb_impl_v2.h
+++ b/src/flamenco/accdb/fd_accdb_impl_v2.h
@@ -13,6 +13,7 @@
#include "../../vinyl/cq/fd_vinyl_cq.h"
#include "../../vinyl/rq/fd_vinyl_rq.h"
+#include "../../vinyl/line/fd_vinyl_line.h"
#include "fd_accdb_user.h"
#include "fd_accdb_lineage.h"
#include "fd_vinyl_req_pool.h"
@@ -47,6 +48,12 @@ struct fd_accdb_user_v2 {
fd_wksp_t * vinyl_data_wksp;
fd_wksp_t * vinyl_req_wksp;
fd_vinyl_req_pool_t * vinyl_req_pool;
+ fd_vinyl_line_t * vinyl_line; /* vinyl cache line array (shared memory) */
+
+ /* Speculative read (specread) state — populated by init_cache */
+ fd_vinyl_meta_t vinyl_meta[1]; /* local join of meta map */
+ ulong vinyl_line_cnt; /* number of cache lines */
+ fd_wksp_t * vinyl_specrd_wksp; /* data workspace for gaddr resolution */
};
typedef struct fd_accdb_user_v2 fd_accdb_user_v2_t;
@@ -62,9 +69,24 @@ fd_accdb_user_v2_init( fd_accdb_user_t * ljoin,
void * vinyl_rq,
void * vinyl_data,
void * vinyl_req_pool,
+ void * vinyl_line,
ulong vinyl_link_id,
ulong max_depth );
+/* fd_accdb_user_v2_init_cache enables speculative reads on an
+ already-initialized v2 accdb client. vinyl_shmeta / vinyl_shele /
+ vinyl_shline point to the shared meta map, element pool, and line
+ array created by the accdb tile. vinyl_line_cnt is the number of
+ cache lines. If any pointer is NULL or vinyl_line_cnt is zero,
+ specread is disabled (the client only uses rq/cq). */
+
+void
+fd_accdb_user_v2_init_cache( fd_accdb_user_t * ljoin,
+ void * vinyl_shmeta,
+ void * vinyl_shele,
+ void * vinyl_shline,
+ ulong vinyl_line_cnt );
+
FD_PROTOTYPES_END
#endif /* HEADER_fd_src_flamenco_accdb_fd_accdb_impl_v2_h */
diff --git a/src/flamenco/accdb/fd_accdb_specread.h b/src/flamenco/accdb/fd_accdb_specread.h
new file mode 100644
index 00000000000..c094f39e032
--- /dev/null
+++ b/src/flamenco/accdb/fd_accdb_specread.h
@@ -0,0 +1,136 @@
+#ifndef HEADER_fd_src_flamenco_accdb_fd_accdb_specread_h
+#define HEADER_fd_src_flamenco_accdb_fd_accdb_specread_h
+
+/* fd_accdb_specread.h provides pin-based speculative reads of rooted
+ account data from the vinyl cache.
+
+ A specread client is a tile (replay, exec, ...) that reads the vinyl
+ meta map, element pool, and data workspace, and pins cache lines with
+ atomic updates to line[].ctl. It reads cached account data directly
+ from shared memory, bypassing the rq/cq round-trip. On cache miss or
+ contention, the client falls back to the normal vinyl ACQUIRE path.
+
+ Pin protocol:
+
+ 1. fd_vinyl_meta_query_try → ele_idx, line_idx
+ 2. Validate meta seqlock via fd_vinyl_meta_query_test
+ 3. FETCH_AND_ADD(&line[line_idx].ctl, 1) to pin
+ 4. Bail if EVICTING, ref < 0, or cross-validation fails
+ 5. Resolve obj_gaddr, check rd_active == 0
+ 6. Point caller at val data (zero-copy)
+ 7. On close: FETCH_AND_SUB(&line[line_idx].ctl, 1) to unpin */
+
+#include "../../vinyl/line/fd_vinyl_line.h" /* includes meta + data */
+#include "../../discof/accdb/fd_accdb_line_ctl.h"
+#include "../fd_flamenco_base.h" /* fd_account_meta_t */
+
+FD_PROTOTYPES_BEGIN
+
+/* fd_accdb_specread_pin attempts to pin a cached account and return a
+ direct pointer to its metadata.
+
+ On success (FD_VINYL_SUCCESS): *out_meta points to the
+ fd_account_meta_t in the data cache, *out_line_idx gives the pinned
+ line. The caller MUST call fd_accdb_specread_unpin when done.
+
+ On failure: FD_VINYL_ERR_KEY (key not in meta) or FD_VINYL_ERR_AGAIN
+ (eviction, I/O pending, seqlock contention, or key not cached); the
+ outputs are untouched, no pin is held; fall back to rq/cq ACQUIRE. */
+
+static inline int
+fd_accdb_specread_pin( fd_vinyl_meta_t * meta,
+ fd_vinyl_line_t * line,
+ ulong line_cnt,
+ fd_wksp_t * data_wksp,
+ fd_vinyl_key_t const * key,
+ fd_account_meta_t const ** out_meta,
+ ulong * out_line_idx ) {
+
+ /* 1. Lockfree query of the meta map for key */
+
+ fd_vinyl_meta_query_t query[1];
+ int err = fd_vinyl_meta_query_try( meta, key, NULL, query, 0 /* non-blocking */ );
+ if( FD_UNLIKELY( err ) ) return err; /* ERR_KEY or ERR_AGAIN */
+
+ fd_vinyl_meta_ele_t const * ele = fd_vinyl_meta_query_ele_const( query );
+
+ /* Speculatively copy the fields of interest; validated by query_test below */
+
+ ulong ctl = ele->phdr.ctl;
+ ulong line_idx = ele->line_idx;
+ ulong ele_idx = (ulong)( ele - (fd_vinyl_meta_ele_t const *)fd_vinyl_meta_shele_const( meta ) ); /* offset of ele from the shele base */
+
+ /* 2. Validate meta seqlock — detect torn reads */
+
+ if( FD_UNLIKELY( fd_vinyl_meta_query_test( query ) ) ) return FD_VINYL_ERR_AGAIN;
+
+ /* Key not in bstream or being created? */
+
+ if( FD_UNLIKELY( !ctl || ctl==ULONG_MAX ) ) return FD_VINYL_ERR_AGAIN;
+
+ /* 3. Validate line_idx in range (key might not be cached); also bounds the line[] accesses below */
+
+ if( FD_UNLIKELY( line_idx>=line_cnt ) ) return FD_VINYL_ERR_AGAIN;
+
+ /* 4. Pin: atomically increment ref count */
+
+ ulong old_ctl = FD_ATOMIC_FETCH_AND_ADD( &line[ line_idx ].ctl, 1UL );
+
+ /* 5. If EVICTING set or ref was negative (acquired for modify),
+ undo the pin immediately */
+
+ if( FD_UNLIKELY( (old_ctl & FD_ACCDB_LINE_CTL_EVICTING) ||
+ fd_accdb_line_ctl_ref( old_ctl ) < 0L ) ) {
+ FD_ATOMIC_FETCH_AND_SUB( &line[ line_idx ].ctl, 1UL );
+ return FD_VINYL_ERR_AGAIN;
+ }
+
+ /* 6. Resolve obj_gaddr (zero is treated as "no object attached to this line") */
+
+ ulong obj_gaddr = line[ line_idx ].obj_gaddr;
+ if( FD_UNLIKELY( !obj_gaddr ) ) {
+ FD_ATOMIC_FETCH_AND_SUB( &line[ line_idx ].ctl, 1UL );
+ return FD_VINYL_ERR_AGAIN;
+ }
+
+ /* 7. Cross-validate: line still maps to same meta element */
+
+ if( FD_UNLIKELY( line[ line_idx ].ele_idx != ele_idx ) ) {
+ FD_ATOMIC_FETCH_AND_SUB( &line[ line_idx ].ctl, 1UL );
+ return FD_VINYL_ERR_AGAIN;
+ }
+
+ /* 8. Resolve to local address */
+
+ fd_vinyl_data_obj_t * obj = (fd_vinyl_data_obj_t *)
+ fd_wksp_laddr_fast( data_wksp, obj_gaddr );
+
+ /* 9. Check I/O not in progress */
+
+ if( FD_UNLIKELY( obj->rd_active ) ) {
+ FD_ATOMIC_FETCH_AND_SUB( &line[ line_idx ].ctl, 1UL );
+ return FD_VINYL_ERR_AGAIN;
+ }
+
+ /* 10. Success — return pointer directly into cache (zero-copy).
+ fd_vinyl_data_obj_val returns the start of the val payload,
+ which for accdb is fd_account_meta_t. */
+
+ *out_meta = (fd_account_meta_t const *)fd_vinyl_data_obj_val( obj );
+ *out_line_idx = line_idx;
+ return FD_VINYL_SUCCESS;
+}
+
+/* fd_accdb_specread_unpin releases a pin acquired by
+ fd_accdb_specread_pin (line_idx is the *out_line_idx it returned).
+ Must be called exactly once per successful pin. */
+
+static inline void
+fd_accdb_specread_unpin( fd_vinyl_line_t * line,
+ ulong line_idx ) {
+ FD_ATOMIC_FETCH_AND_SUB( &line[ line_idx ].ctl, 1UL ); /* atomically undo the +1 applied by pin */
+}
+
+FD_PROTOTYPES_END
+
+#endif /* HEADER_fd_src_flamenco_accdb_fd_accdb_specread_h */
diff --git a/src/flamenco/accdb/fd_accdb_user.h b/src/flamenco/accdb/fd_accdb_user.h
index a38eae75616..94271545a76 100644
--- a/src/flamenco/accdb/fd_accdb_user.h
+++ b/src/flamenco/accdb/fd_accdb_user.h
@@ -123,6 +123,16 @@ struct fd_accdb_user_base {
ulong rw_active;
ulong ro_active;
ulong created_cnt;
+
+ /* Cache hit tracking for open_ro_multi */
+ ulong lookup_funk; /* Account found in funk (in-memory fork store) */
+ ulong lookup_specrd; /* Account found via speculative read (vinyl cache) */
+ ulong lookup_accdb; /* Account requested from accdb tile (vinyl rq/cq) */
+
+ /* Cumulative tickcount spent in each lookup regime */
+ long dt_funk; /* Time spent in funk lookups */
+ long dt_specrd; /* Time spent in specread pin/unpin */
+ long dt_vinyl; /* Time spent waiting for vinyl rq/cq */
};
typedef struct fd_accdb_user_base fd_accdb_user_base_t;
diff --git a/src/flamenco/accdb/test_accdb_v2.c b/src/flamenco/accdb/test_accdb_v2.c
deleted file mode 100644
index 71c12792b31..00000000000
--- a/src/flamenco/accdb/test_accdb_v2.c
+++ /dev/null
@@ -1,670 +0,0 @@
-#include "fd_accdb_base.h"
-#include "fd_accdb_admin_v1.h"
-#include "fd_accdb_impl_v2.h"
-#include "fd_accdb_admin.h"
-#include "fd_accdb_sync.h"
-#include "fd_accdb_pipe.h"
-#include "../../vinyl/fd_vinyl.h"
-
-#define WKSP_TAG (1UL)
-
-static uchar const s_key_a[ 32 ] = { 1 }; /* a: present in vinyl, account exists */
-static uchar const s_key_b[ 32 ] = { 2 }; /* b: present in vinyl, tombstone */
-static uchar const s_key_c[ 32 ] = { 3 }; /* c: present in funk, account exists */
-static uchar const s_key_d[ 32 ] = { 4 }; /* d: present in funk, tombstone*/
-static uchar const s_key_e[ 32 ] = { 5 }; /* e: not found */
-
-static int
-fd_vinyl_tile( int argc,
- char ** argv ) {
- (void)argc;
- fd_vinyl_exec( (fd_vinyl_t *)argv );
- return 0;
-}
-
-static void
-add_account_vinyl( fd_accdb_user_t * accdb_,
- uchar const * key,
- ulong lamports ) {
- fd_accdb_user_v2_t * accdb = (fd_accdb_user_v2_t *)accdb_;
-
- /* Start write */
- ulong batch_idx = fd_vinyl_req_pool_acquire ( accdb->vinyl_req_pool );
- fd_vinyl_key_t * req_key = fd_vinyl_req_batch_key ( accdb->vinyl_req_pool, batch_idx );
- ulong * req_val_gaddr = fd_vinyl_req_batch_val_gaddr( accdb->vinyl_req_pool, batch_idx );
- schar * req_err = fd_vinyl_req_batch_err ( accdb->vinyl_req_pool, batch_idx );
- fd_vinyl_comp_t * comp = fd_vinyl_req_batch_comp ( accdb->vinyl_req_pool, batch_idx );
- fd_vinyl_key_init( req_key, key, 32UL );
- ulong val_max = sizeof(fd_account_meta_t) + 32UL;
- *req_val_gaddr = val_max;
- memset( comp, 0, sizeof(fd_vinyl_comp_t) );
- fd_vinyl_req_send_batch(
- accdb->vinyl_rq,
- accdb->vinyl_req_pool,
- accdb->vinyl_req_wksp,
- accdb->vinyl_req_id++,
- accdb->vinyl_link_id,
- FD_VINYL_REQ_TYPE_ACQUIRE,
- FD_VINYL_REQ_FLAG_MODIFY | FD_VINYL_REQ_FLAG_CREATE | FD_VINYL_REQ_FLAG_EXCL,
- batch_idx,
- 1UL /* batch_cnt */
- );
- while( FD_VOLATILE_CONST( comp->seq )!=1UL ) FD_SPIN_PAUSE();
- FD_COMPILER_MFENCE();
- int comp_err = FD_VOLATILE_CONST( comp->err );
- if( FD_UNLIKELY( comp_err!=FD_VINYL_SUCCESS ) ) {
- FD_LOG_CRIT(( "vinyl tile rejected my ACQUIRE request: %i-%s", comp_err, fd_vinyl_strerror( comp_err ) ));
- }
- int err = FD_VOLATILE_CONST( req_err[0] );
- if( FD_UNLIKELY( err!=FD_VINYL_SUCCESS ) ) {
- FD_LOG_CRIT(( "vinyl tile ACQUIRE request failed: %i-%s", err, fd_vinyl_strerror( err ) ));
- }
-
- ulong val_gaddr = FD_VOLATILE_CONST( req_val_gaddr[0] );
- void * val = fd_wksp_laddr_fast( accdb->vinyl_data_wksp, val_gaddr );
- fd_vinyl_info_t * info = fd_vinyl_data_info( val );
- fd_account_meta_t * meta = val;
- uchar * data = (uchar *)( meta+1 );
-
- memset( meta, 0, val_max );
- meta->lamports = lamports;
- meta->dlen = 32U;
- memcpy( data, key, 32UL );
- info->val_sz = (uint)val_max;
-
- /* Finish write */
- memset( comp, 0, sizeof(fd_vinyl_comp_t) );
- req_val_gaddr[0] = val_gaddr;
- fd_vinyl_req_send_batch(
- accdb->vinyl_rq,
- accdb->vinyl_req_pool,
- accdb->vinyl_req_wksp,
- accdb->vinyl_req_id++,
- accdb->vinyl_link_id,
- FD_VINYL_REQ_TYPE_RELEASE,
- FD_VINYL_REQ_FLAG_MODIFY,
- batch_idx,
- 1UL /* batch_cnt */
- );
- while( FD_VOLATILE_CONST( comp->seq )!=1UL ) FD_SPIN_PAUSE();
- FD_COMPILER_MFENCE();
- comp_err = FD_VOLATILE_CONST( comp->err );
- if( FD_UNLIKELY( comp_err!=FD_VINYL_SUCCESS ) ) {
- FD_LOG_CRIT(( "vinyl tile rejected my RELEASE request: %i-%s", comp_err, fd_vinyl_strerror( comp_err ) ));
- }
- err = FD_VOLATILE_CONST( req_err[0] );
- if( FD_UNLIKELY( err!=FD_VINYL_SUCCESS ) ) {
- FD_LOG_CRIT(( "vinyl tile RELEASE request failed: %i-%s", err, fd_vinyl_strerror( err ) ));
- }
-
- fd_vinyl_req_pool_release( accdb->vinyl_req_pool, batch_idx );
-}
-
-static void
-add_account_funk( fd_accdb_user_t * accdb_,
- uchar const * key,
- ulong lamports ) {
- fd_accdb_user_v2_t * accdb = (fd_accdb_user_v2_t *)accdb_;
- fd_funk_t * funk = accdb->funk;
-
- fd_funk_rec_map_t * rec_map = funk->rec_map;
- fd_funk_rec_pool_t * rec_pool = funk->rec_pool;
-
- fd_funk_rec_t * rec = fd_funk_rec_pool_acquire( rec_pool, NULL, 1, NULL );
- FD_TEST( rec );
- ulong rec_idx = (ulong)( rec - rec_pool->ele );
- *rec = (fd_funk_rec_t) {
- .next_idx = UINT_MAX,
- .prev_idx = UINT_MAX
- };
- accdb->funk->rec_lock[ rec_idx ] = fd_funk_rec_ver_lock( 1UL, 0UL );
- fd_funk_txn_xid_set_root( rec->pair.xid );
- memcpy( rec->pair.key->uc, key, 32UL );
-
- ulong val_sz = sizeof(fd_account_meta_t) + 32UL;
- fd_account_meta_t * meta = fd_funk_val_truncate( rec, funk->alloc, funk->wksp, 16UL, val_sz, NULL );
- FD_TEST( meta ); memset( meta, 0, val_sz );
- uchar * data = (uchar *)( meta+1 );
-
- meta->lamports = lamports;
- meta->dlen = 32U;
- memcpy( data, key, 32UL );
-
- FD_TEST( fd_funk_rec_map_insert( rec_map, rec, 0 )==FD_MAP_SUCCESS );
-}
-
-static fd_funk_rec_t *
-ref_funk_rec( fd_accdb_ref_t const * ref ) {
- return (fd_funk_rec_t *)ref->user_data;
-}
-
-static ulong
-ref_ver_lock( fd_funk_t const * funk,
- fd_funk_rec_t const * rec ) {
- ulong rec_idx = (ulong)( rec - funk->rec_pool->ele );
- return funk->rec_lock[ rec_idx ];
-}
-
-static void
-test_account_creation( fd_accdb_user_t * accdb,
- fd_funk_txn_xid_t const * xid2,
- void const * addr,
- ulong lamports ) {
- fd_accdb_rw_t rw[1];
- fd_accdb_ro_t ro[1];
- fd_funk_t * funk = ((fd_accdb_user_v2_t *)accdb)->funk;
- FD_TEST( accdb->base.ro_active==0 && accdb->base.rw_active==0 );
-
- fd_funk_rec_t * rec;
-
- FD_TEST( fd_accdb_open_rw( accdb, rw, xid2, addr, 16UL, FD_ACCDB_FLAG_CREATE ) );
- FD_TEST( accdb->base.ro_active==0 && accdb->base.rw_active==1 );
- rec = ref_funk_rec( rw->ref );
- FD_TEST( fd_funk_rec_ver_alive( fd_funk_rec_ver_bits( ref_ver_lock( funk, rec ) ) )==1 );
- FD_TEST( fd_funk_rec_lock_bits( ref_ver_lock( funk, rec ) )==FD_FUNK_REC_LOCK_MASK ); /* write locked */
- fd_accdb_ref_lamports_set( rw, lamports );
- fd_accdb_close_rw( accdb, rw );
- FD_TEST( fd_funk_rec_ver_alive( fd_funk_rec_ver_bits( ref_ver_lock( funk, rec ) ) )==1 );
- FD_TEST( fd_funk_rec_lock_bits( ref_ver_lock( funk, rec ) )==0 );
- FD_TEST( accdb->base.ro_active==0 && accdb->base.rw_active==0 );
-
- FD_TEST( fd_accdb_open_ro( accdb, ro, xid2, addr ) );
- FD_TEST( accdb->base.ro_active==1 && accdb->base.rw_active==0 );
- rec = ref_funk_rec( ro->ref );
- FD_TEST( fd_funk_rec_ver_alive( fd_funk_rec_ver_bits( ref_ver_lock( funk, rec ) ) )==1 );
- FD_TEST( fd_funk_rec_lock_bits( ref_ver_lock( funk, rec ) )==1UL ); /* read lock */
- FD_TEST( fd_accdb_ref_lamports( ro )==lamports );
- fd_accdb_close_ro( accdb, ro );
- FD_TEST( fd_funk_rec_ver_alive( fd_funk_rec_ver_bits( ref_ver_lock( funk, rec ) ) )==1 );
- FD_TEST( fd_funk_rec_lock_bits( ref_ver_lock( funk, rec ) )==0 );
- FD_TEST( accdb->base.ro_active==0 && accdb->base.rw_active==0 );
-
- FD_TEST( fd_accdb_open_rw( accdb, rw, xid2, addr, 16UL, 0 ) );
- rec = ref_funk_rec( rw->ref );
- FD_TEST( fd_funk_rec_ver_alive( fd_funk_rec_ver_bits( ref_ver_lock( funk, rec ) ) )==1 );
- FD_TEST( fd_funk_rec_lock_bits( ref_ver_lock( funk, rec ) )==FD_FUNK_REC_LOCK_MASK ); /* write locked */
- fd_accdb_ref_lamports_set( rw, 0UL ); /* delete */
- fd_accdb_close_rw( accdb, rw );
- FD_TEST( fd_funk_rec_ver_alive( fd_funk_rec_ver_bits( ref_ver_lock( funk, rec ) ) )==1 );
- FD_TEST( fd_funk_rec_lock_bits( ref_ver_lock( funk, rec ) )==0UL );
-
- FD_TEST( !fd_accdb_open_rw( accdb, rw, xid2, addr, 16UL, 0 ) );
- FD_TEST( accdb->base.ro_active==0 && accdb->base.rw_active==0 );
-
- FD_TEST( !fd_accdb_open_ro( accdb, ro, xid2, addr ) );
- FD_TEST( accdb->base.ro_active==0 && accdb->base.rw_active==0 );
-}
-
-
-/* test_truncate verifies open_rw behavior with the TRUNCATE flag set.
-
- test_truncate_create: Account does not exist, create new (flags+=CREATE)
- test_truncate_nonexist: Account does not exist, return NULL
- test_truncate_inplace: Account exists and is mutable, truncate in-place
- test_truncate_copy: Account exists and is immutable, create new and copy meta */
-
-static void
-test_truncate_create( fd_accdb_admin_t * admin,
- fd_accdb_user_t * accdb ) {
- fd_funk_txn_xid_t root = fd_accdb_root_get( admin );
- fd_funk_txn_xid_t xid = { .ul={ 1UL, 0UL } };
- fd_accdb_attach_child( admin, &root, &xid );
-
- fd_funk_rec_key_t key = { .ul={ 42UL } };
- fd_accdb_rw_t rw[1];
- FD_TEST( fd_accdb_open_rw( accdb, rw, &xid, &key, 56UL, FD_ACCDB_FLAG_CREATE|FD_ACCDB_FLAG_TRUNCATE ) );
- FD_TEST( rw->ref->ref_type==FD_ACCDB_REF_RW );
- fd_funk_rec_t * rec = (void *)rw->ref->user_data;
- FD_TEST( rec->val_sz == sizeof(fd_account_meta_t) );
- FD_TEST( rec->val_max >= sizeof(fd_account_meta_t)+56UL );
- FD_TEST( rw->meta->dlen == 0UL );
- fd_accdb_close_rw( accdb, rw );
-
- fd_accdb_cancel( admin, &xid );
-}
-
-static void
-test_truncate_nonexist( fd_accdb_admin_t * admin,
- fd_accdb_user_t * accdb ) {
- fd_funk_txn_xid_t root = fd_accdb_root_get( admin );
- fd_funk_txn_xid_t xid = { .ul={ 2UL, 0UL } };
- fd_accdb_attach_child( admin, &root, &xid );
-
- fd_funk_rec_key_t key = { .ul={ 42UL } };
- fd_accdb_rw_t rw[1];
- FD_TEST( !fd_accdb_open_rw( accdb, rw, &xid, &key, 42UL, FD_ACCDB_FLAG_TRUNCATE ) );
-
- fd_accdb_close_rw( accdb, rw );
-}
-
-static void
-test_truncate_inplace( fd_accdb_admin_t * admin,
- fd_accdb_user_t * accdb ) {
- fd_funk_txn_xid_t root = fd_accdb_root_get( admin );
- fd_funk_txn_xid_t xid = { .ul={ 3UL, 0UL } };
- fd_accdb_attach_child( admin, &root, &xid );
-
- fd_funk_rec_key_t key = { .ul={ 42UL } };
- fd_accdb_rw_t rw[1];
- ulong data_sz_0 = 56UL;
- FD_TEST( fd_accdb_open_rw( accdb, rw, &xid, &key, data_sz_0, FD_ACCDB_FLAG_CREATE ) );
- FD_TEST( rw->ref->ref_type==FD_ACCDB_REF_RW );
- fd_accdb_ref_lamports_set( rw, 32UL );
- fd_accdb_ref_data_set( accdb, rw, "hello", 5UL );
- fd_funk_rec_t * rec = (void *)rw->ref->user_data;
- FD_TEST( rec->val_sz == sizeof(fd_account_meta_t)+5UL );
- FD_TEST( rec->val_max >= sizeof(fd_account_meta_t)+data_sz_0 );
- FD_TEST( rw->meta->dlen == 5UL );
- fd_accdb_close_rw( accdb, rw );
-
- ulong data_sz_1 = 256UL;
- FD_TEST( fd_accdb_open_rw( accdb, rw, &xid, &key, data_sz_1, FD_ACCDB_FLAG_TRUNCATE ) );
- FD_TEST( rw->ref->ref_type==FD_ACCDB_REF_RW );
- rec = (void *)rw->ref->user_data;
- FD_TEST( rec->val_sz == sizeof(fd_account_meta_t) );
- FD_TEST( rec->val_max >= sizeof(fd_account_meta_t)+data_sz_1 );
- FD_TEST( rw->meta->dlen == 0UL );
- fd_accdb_close_rw( accdb, rw );
-
- fd_accdb_close_rw( accdb, rw );
-}
-
-static void
-test_truncate_copy( fd_accdb_admin_t * admin,
- fd_accdb_user_t * accdb ) {
- fd_funk_txn_xid_t root = fd_accdb_root_get( admin );
- fd_funk_txn_xid_t xid1 = { .ul={ 4UL, 0UL } };
- fd_accdb_attach_child( admin, &root, &xid1 );
-
- fd_funk_rec_key_t key = { .ul={ 42UL } };
- fd_accdb_rw_t rw[1];
- FD_TEST( fd_accdb_open_rw( accdb, rw, &xid1, &key, 56UL, FD_ACCDB_FLAG_CREATE ) );
- FD_TEST( rw->ref->ref_type==FD_ACCDB_REF_RW );
- fd_accdb_ref_lamports_set( rw, 32UL );
- fd_accdb_ref_data_set( accdb, rw, "hello", 5UL );
- fd_funk_rec_t * rec = (void *)rw->ref->user_data;
- FD_TEST( rec->val_sz == sizeof(fd_account_meta_t)+5UL );
- FD_TEST( rec->val_max >= sizeof(fd_account_meta_t)+56UL );
- FD_TEST( rw->meta->dlen == 5UL );
- fd_accdb_close_rw( accdb, rw );
-
- fd_funk_txn_xid_t xid2 = { .ul={ 5UL, 0UL } };
- fd_accdb_attach_child( admin, &xid1, &xid2 );
- FD_TEST( fd_accdb_open_rw( accdb, rw, &xid2, &key, 256UL, FD_ACCDB_FLAG_TRUNCATE ) );
- FD_TEST( rw->ref->ref_type==FD_ACCDB_REF_RW );
- rec = (void *)rw->ref->user_data;
- FD_TEST( rec->val_sz == sizeof(fd_account_meta_t) );
- FD_TEST( rec->val_max >= sizeof(fd_account_meta_t)+256UL );
- FD_TEST( rw->meta->dlen == 0UL );
- fd_accdb_close_rw( accdb, rw );
-
- fd_accdb_cancel( admin, &xid2 );
- fd_accdb_cancel( admin, &xid1 );
-}
-
-static void
-run_tests( fd_accdb_user_t * accdb ) {
- fd_accdb_user_v2_t * v2 = (fd_accdb_user_v2_t *)accdb;
- fd_vinyl_req_pool_t * req_pool = v2->vinyl_req_pool;
- FD_TEST( accdb->base.ro_active==0UL );
-
- add_account_vinyl( accdb, s_key_a, 10000UL );
- add_account_vinyl( accdb, s_key_b, 0UL );
- add_account_vinyl( accdb, s_key_d, 40000UL );
- add_account_funk ( accdb, s_key_c, 20000UL );
- add_account_funk ( accdb, s_key_d, 0UL );
-
- fd_funk_txn_xid_t xid[1]; fd_funk_txn_xid_set_root( xid );
- fd_accdb_ro_t ro[1];
-
- FD_TEST( fd_accdb_open_ro( accdb, ro, xid, s_key_a ) );
- FD_TEST( ro->ref->accdb_type==FD_ACCDB_TYPE_V2 );
- FD_TEST( ro->ref->ref_type==FD_ACCDB_REF_RO );
- FD_TEST( accdb->base.ro_active==1UL );
- FD_TEST( fd_accdb_ref_lamports( ro )==10000UL );
- fd_accdb_close_ro( accdb, ro );
- FD_TEST( accdb->base.ro_active==0UL );
- FD_TEST( req_pool->free_cnt==2UL );
-
- FD_TEST( !fd_accdb_open_ro( accdb, ro, xid, s_key_b ) );
-
- FD_TEST( fd_accdb_open_ro( accdb, ro, xid, s_key_c ) );
- fd_funk_rec_t * rec = ref_funk_rec( ro->ref );
- FD_TEST( ref_ver_lock( v2->funk, rec )==fd_funk_rec_ver_lock( 1UL, 1UL ) );
- FD_TEST( accdb->base.ro_active==1UL );
- FD_TEST( ro->ref->accdb_type==FD_ACCDB_TYPE_V1 );
- FD_TEST( ro->ref->ref_type==FD_ACCDB_REF_RO );
- FD_TEST( fd_accdb_ref_lamports( ro )==20000UL );
- fd_accdb_close_ro( accdb, ro );
- FD_TEST( ref_ver_lock( v2->funk, rec )==fd_funk_rec_ver_lock( 1UL, 0UL ) );
- FD_TEST( accdb->base.ro_active==0UL );
- FD_TEST( req_pool->free_cnt==2UL );
-
- FD_TEST( !fd_accdb_open_ro( accdb, ro, xid, s_key_d ) );
-
- FD_TEST( !fd_accdb_open_ro( accdb, ro, xid, s_key_e ) );
-
- /* Test ro_pipe API */
-
- fd_accdb_ro_t * ro_tmp;
- fd_accdb_ro_pipe_t pipe[1];
- FD_TEST( fd_accdb_ro_pipe_init( pipe, accdb, xid ) );
- FD_TEST( pipe->req_cnt==0UL );
- FD_TEST( pipe->req_max==4UL );
- FD_TEST( req_pool->free_cnt==2UL );
-
- /* first batch: d, b, c, e */
- fd_accdb_ro_pipe_enqueue( pipe, s_key_d );
- FD_TEST( req_pool->free_cnt==2UL );
- FD_TEST( pipe->req_cnt==1UL );
- FD_TEST( !fd_accdb_ro_pipe_poll( pipe ) );
- fd_accdb_ro_pipe_enqueue( pipe, s_key_b );
- FD_TEST( !fd_accdb_ro_pipe_poll( pipe ) );
- fd_accdb_ro_pipe_enqueue( pipe, s_key_c );
- FD_TEST( !fd_accdb_ro_pipe_poll( pipe ) );
- fd_accdb_ro_pipe_enqueue( pipe, s_key_e );
- FD_TEST( req_pool->free_cnt==2UL );
-
- /* result for d */
- FD_TEST( (ro_tmp = fd_accdb_ro_pipe_poll( pipe )) );
- FD_TEST( ro_tmp->ref->ref_type==FD_ACCDB_REF_RO );
- FD_TEST( ro_tmp->ref->accdb_type==FD_ACCDB_TYPE_NONE );
- FD_TEST( 0==memcmp( fd_accdb_ref_address( ro_tmp ), s_key_d, 32UL ) );
- FD_TEST( ro_tmp->meta->lamports==0UL );
- FD_TEST( accdb->base.ro_active==3UL );
-
- /* result for b (tombstone) */
- FD_TEST( (ro_tmp = fd_accdb_ro_pipe_poll( pipe )) );
- FD_TEST( ro_tmp->ref->ref_type==FD_ACCDB_REF_RO );
- FD_TEST( ro_tmp->ref->accdb_type==FD_ACCDB_TYPE_NONE );
- FD_TEST( 0==memcmp( fd_accdb_ref_address( ro_tmp ), s_key_b, 32UL ) );
- FD_TEST( ro_tmp->meta->lamports==0UL );
-
- /* result for c */
- FD_TEST( (ro_tmp = fd_accdb_ro_pipe_poll( pipe )) );
- FD_TEST( ro_tmp->ref->ref_type==FD_ACCDB_REF_RO );
- FD_TEST( ro_tmp->ref->accdb_type==FD_ACCDB_TYPE_V1 );
- FD_TEST( 0==memcmp( fd_accdb_ref_address( ro_tmp ), s_key_c, 32UL ) );
- FD_TEST( ro_tmp->meta->lamports==20000UL );
-
- /* result for e (tombstone) */
- FD_TEST( (ro_tmp = fd_accdb_ro_pipe_poll( pipe )) );
- FD_TEST( ro_tmp->ref->accdb_type==FD_ACCDB_TYPE_NONE );
- FD_TEST( 0==memcmp( fd_accdb_ref_address( ro_tmp ), s_key_e, 32UL ) );
- FD_TEST( ro_tmp->meta->lamports==0UL );
- FD_TEST( accdb->base.ro_active==3UL );
- FD_TEST( !fd_accdb_ro_pipe_poll( pipe ) );
- FD_TEST( accdb->base.ro_active==0UL );
-
- /* result for a */
- fd_accdb_ro_pipe_enqueue( pipe, s_key_a );
- FD_TEST( !fd_accdb_ro_pipe_poll( pipe ) );
- fd_accdb_ro_pipe_flush( pipe );
- FD_TEST( (ro_tmp = fd_accdb_ro_pipe_poll( pipe )) );
- FD_TEST( ro_tmp->ref->accdb_type==FD_ACCDB_TYPE_V2 );
- FD_TEST( 0==memcmp( fd_accdb_ref_address( ro_tmp ), s_key_a, 32UL ) );
- FD_TEST( ro_tmp->meta->lamports==10000UL );
- FD_TEST( accdb->base.ro_active==1UL );
- FD_TEST( !fd_accdb_ro_pipe_poll( pipe ) );
- FD_TEST( accdb->base.ro_active==0UL );
-
- fd_accdb_ro_pipe_fini( pipe );
-
- fd_accdb_rw_t rw[1];
- fd_funk_txn_xid_t xid2[1] = {{ .ul={ 1UL, 2UL } }};
- fd_accdb_admin_t admin[1];
- fd_accdb_admin_v1_init( admin, v2->funk->shmem, (void *)v2->funk->txn_lock );
- fd_accdb_attach_child( admin, xid, xid2 );
- FD_TEST( accdb->base.ro_active==0 && accdb->base.rw_active==0 );
-
- /* vinyl tombstone */
- FD_TEST( !fd_accdb_open_rw( accdb, rw, xid2, s_key_b, 16UL, 0 ) );
- FD_TEST( accdb->base.ro_active==0 && accdb->base.rw_active==0 );
- test_account_creation( accdb, xid2, s_key_b, 1UL );
-
- /* funk tombstone, vinyl exist */
- FD_TEST( !fd_accdb_open_rw( accdb, rw, xid2, s_key_d, 16UL, 0 ) );
- FD_TEST( accdb->base.ro_active==0 && accdb->base.rw_active==0 );
- test_account_creation( accdb, xid2, s_key_d, 2UL );
-
- /* missing account */
- FD_TEST( !fd_accdb_open_rw( accdb, rw, xid2, s_key_e, 16UL, 0 ) );
- FD_TEST( accdb->base.ro_active==0 && accdb->base.rw_active==0 );
- test_account_creation( accdb, xid2, s_key_e, 4UL );
-
- /* repeatedly delete and recreate the same account */
- for( ulong i=0UL; i<1024UL; i++ ) {
- test_account_creation( accdb, xid2, s_key_e, 4UL );
- }
-
- fd_accdb_cancel( admin, xid2 );
-
- /* Test truncate */
-
- test_truncate_create ( admin, accdb );
- test_truncate_nonexist( admin, accdb );
- test_truncate_inplace ( admin, accdb );
- test_truncate_copy ( admin, accdb );
-
- /* Open vinyl record as writable */
-
- xid2->ul[1]++;
- fd_accdb_attach_child( admin, xid, xid2 );
- FD_TEST( fd_accdb_open_rw( accdb, rw, xid2, s_key_a, 0UL, 0 ) );
- FD_TEST( accdb->base.ro_active==0 && accdb->base.rw_active==1 );
- rec = ref_funk_rec( rw->ref );
- FD_TEST( fd_funk_rec_ver_alive( fd_funk_rec_ver_bits( ref_ver_lock( v2->funk, rec ) ) )==1 );
- FD_TEST( fd_accdb_ref_data_sz( rw->ro )==32UL );
- FD_TEST( 0==memcmp( fd_accdb_ref_data_const( rw->ro ), s_key_a, 32UL ) );
- fd_accdb_close_rw( accdb, rw );
- FD_TEST( accdb->base.ro_active==0 && accdb->base.rw_active==0 );
- FD_TEST( fd_funk_rec_ver_alive( fd_funk_rec_ver_bits( ref_ver_lock( v2->funk, rec ) ) )==1 );
- fd_accdb_cancel( admin, xid2 );
- FD_TEST( fd_funk_rec_ver_alive( fd_funk_rec_ver_bits( ref_ver_lock( v2->funk, rec ) ) )==0 );
-
- /* Open vinyl record as writable (truncate) */
-
- xid2->ul[1]++;
- fd_accdb_attach_child( admin, xid, xid2 );
- FD_TEST( fd_accdb_open_rw( accdb, rw, xid2, s_key_a, 0UL, FD_ACCDB_FLAG_TRUNCATE ) );
- FD_TEST( accdb->base.ro_active==0 && accdb->base.rw_active==1 );
- rec = ref_funk_rec( rw->ref );
- FD_TEST( fd_funk_rec_ver_alive( fd_funk_rec_ver_bits( ref_ver_lock( v2->funk, rec ) ) )==1 );
- FD_TEST( fd_accdb_ref_data_sz( rw->ro )==0UL );
- fd_accdb_close_rw( accdb, rw );
- FD_TEST( accdb->base.ro_active==0 && accdb->base.rw_active==0 );
- FD_TEST( fd_funk_rec_ver_alive( fd_funk_rec_ver_bits( ref_ver_lock( v2->funk, rec ) ) )==1 );
- fd_accdb_cancel( admin, xid2 );
- FD_TEST( fd_funk_rec_ver_alive( fd_funk_rec_ver_bits( ref_ver_lock( v2->funk, rec ) ) )==0 );
-
- fd_accdb_admin_fini( admin );
-}
-
-int
-main( int argc,
- char ** argv ) {
- fd_boot( &argc, &argv );
- if( FD_UNLIKELY( fd_tile_cnt() < 2UL ) ) {
- FD_LOG_ERR(( "This test requires at least 2 tiles (use --tile-cpus to configure)" ));
- }
-
- char const * _wksp = fd_env_strip_cmdline_cstr ( &argc, &argv, "--wksp", NULL, NULL );
- char const * _page_sz = fd_env_strip_cmdline_cstr ( &argc, &argv, "--page-sz", NULL, "gigantic" );
- ulong page_cnt = fd_env_strip_cmdline_ulong( &argc, &argv, "--page-cnt", NULL, 8UL );
- ulong near_cpu = fd_env_strip_cmdline_ulong( &argc, &argv, "--near-cpu", NULL, fd_log_cpu_id() );
- ulong tag = fd_env_strip_cmdline_ulong( &argc, &argv, "--tag", NULL, WKSP_TAG );
-
- /* Vinyl I/O parameters */
- ulong spad_max = fd_env_strip_cmdline_ulong( &argc, &argv, "--spad-max", NULL, fd_vinyl_io_spad_est() );
- ulong dev_sz = fd_env_strip_cmdline_ulong( &argc, &argv, "--dev-sz", NULL, 1UL << 30 );
- ulong io_seed = fd_env_strip_cmdline_ulong( &argc, &argv, "--io-seed", NULL, 1234UL );
-
- /* Vinyl cache parameters */
- ulong line_cnt = fd_env_strip_cmdline_ulong( &argc, &argv, "--line-cnt", NULL, 7UL );
- ulong ele_max = fd_env_strip_cmdline_ulong( &argc, &argv, "--ele-max", NULL, 8UL );
- ulong lock_cnt = fd_env_strip_cmdline_ulong( &argc, &argv, "--lock-cnt", NULL, 8UL );
- ulong probe_max = ele_max;
- ulong seed = fd_env_strip_cmdline_ulong( &argc, &argv, "--seed", NULL, 5678UL );
- ulong obj_sz = fd_env_strip_cmdline_ulong( &argc, &argv, "--obj-sz", NULL, 6UL << 30 );
-
- /* Vinyl runtime parameters */
- ulong async_min = fd_env_strip_cmdline_ulong( &argc, &argv, "--async-min", NULL, 5UL );
- ulong async_max = fd_env_strip_cmdline_ulong( &argc, &argv, "--async-max", NULL, 2UL*async_min );
- ulong part_thresh = fd_env_strip_cmdline_ulong( &argc, &argv, "--part-thresh", NULL, 64UL << 20 );
- ulong gc_thresh = fd_env_strip_cmdline_ulong( &argc, &argv, "--gc-thresh", NULL, 128UL << 20 );
- int gc_eager = fd_env_strip_cmdline_int ( &argc, &argv, "--gc-eager", NULL, 2 );
- char const * _style = fd_env_strip_cmdline_cstr ( &argc, &argv, "--style", NULL, "lz4" );
- int level = fd_env_strip_cmdline_int ( &argc, &argv, "--level", NULL, 0 );
-
- /* Vinyl client parameters */
- ulong rq_max = fd_env_strip_cmdline_ulong( &argc, &argv, "--rq-max", NULL, 32UL );
- ulong cq_max = fd_env_strip_cmdline_ulong( &argc, &argv, "--cq-max", NULL, 32UL );
- ulong link_id = fd_env_strip_cmdline_ulong( &argc, &argv, "--link-id", NULL, 2345UL );
- ulong burst_max = fd_env_strip_cmdline_ulong( &argc, &argv, "--burst-max", NULL, 1UL );
- ulong quota_max = fd_env_strip_cmdline_ulong( &argc, &argv, "--quota-max", NULL, 4UL );
-
- /* Funk (in-memory DB) parameters */
- ulong txn_max = fd_env_strip_cmdline_ulong( &argc, &argv, "--txn-max", NULL, 32UL );
- ulong rec_max = fd_env_strip_cmdline_ulong( &argc, &argv, "--rec-max", NULL, 512UL );
-
- int style = fd_cstr_to_vinyl_bstream_ctl_style( _style );
-
- FD_LOG_NOTICE(( "Setting up workspace" ));
-
- fd_wksp_t * wksp;
- if( _wksp ) {
- FD_LOG_NOTICE(( "Attaching to --wksp %s", _wksp ));
- wksp = fd_wksp_attach( _wksp );
- } else {
- FD_LOG_NOTICE(( "--wksp not specified, using an anonymous local workspace (--page-sz %s --page-cnt %lu --near-cpu %lu)",
- _page_sz, page_cnt, near_cpu ));
- wksp = fd_wksp_new_anonymous( fd_cstr_to_shmem_page_sz( _page_sz ), page_cnt, near_cpu, "wksp", 0UL );
- }
- FD_TEST( wksp );
-
- ulong io_footprint = fd_vinyl_io_mm_footprint( spad_max ); FD_TEST( io_footprint );
- ulong dev_footprint = fd_ulong_align_dn( dev_sz, FD_VINYL_BSTREAM_BLOCK_SZ ); FD_TEST( dev_footprint );
- ulong vinyl_footprint = fd_vinyl_footprint(); FD_TEST( vinyl_footprint );
- ulong cnc_footprint = fd_cnc_footprint( FD_VINYL_CNC_APP_SZ ); FD_TEST( cnc_footprint );
- ulong meta_footprint = fd_vinyl_meta_footprint( ele_max, lock_cnt, probe_max ); FD_TEST( meta_footprint );
- ulong line_footprint = sizeof(fd_vinyl_line_t) * line_cnt; FD_TEST( line_footprint );
- ulong ele_footprint = sizeof(fd_vinyl_meta_ele_t) * ele_max; FD_TEST( ele_footprint );
- ulong obj_footprint = fd_ulong_align_dn( obj_sz, alignof(fd_vinyl_data_obj_t) ); FD_TEST( obj_footprint );
- ulong rq_footprint = fd_vinyl_rq_footprint( rq_max ); FD_TEST( rq_footprint );
- ulong cq_footprint = fd_vinyl_cq_footprint( cq_max ); FD_TEST( cq_footprint );
-
- void * _io = fd_wksp_alloc_laddr( wksp, fd_vinyl_io_mm_align(), io_footprint, tag ); FD_TEST( _io );
- void * _dev = fd_wksp_alloc_laddr( wksp, FD_VINYL_BSTREAM_BLOCK_SZ, dev_footprint, tag ); FD_TEST( _dev );
- void * _vinyl = fd_wksp_alloc_laddr( wksp, fd_vinyl_align(), vinyl_footprint, tag ); FD_TEST( _vinyl );
- void * _cnc = fd_wksp_alloc_laddr( wksp, fd_cnc_align(), cnc_footprint, tag ); FD_TEST( _cnc );
- void * _meta = fd_wksp_alloc_laddr( wksp, fd_vinyl_meta_align(), meta_footprint, tag ); FD_TEST( _meta );
- void * _line = fd_wksp_alloc_laddr( wksp, alignof(fd_vinyl_line_t), line_footprint, tag ); FD_TEST( _line );
- void * _ele = fd_wksp_alloc_laddr( wksp, alignof(fd_vinyl_meta_ele_t), ele_footprint, tag ); FD_TEST( _ele );
- void * _obj = fd_wksp_alloc_laddr( wksp, alignof(fd_vinyl_data_obj_t), obj_footprint, tag ); FD_TEST( _obj );
- void * _rq = fd_wksp_alloc_laddr( wksp, fd_vinyl_rq_align(), rq_footprint, tag ); FD_TEST( _rq );
- void * _cq = fd_wksp_alloc_laddr( wksp, fd_vinyl_cq_align(), cq_footprint, tag ); FD_TEST( _cq );
-
- fd_vinyl_io_t * io = fd_vinyl_io_mm_init( _io, spad_max, _dev, dev_footprint, 1, "test", 5UL, io_seed );
- FD_TEST( io );
-
- fd_vinyl_t * vinyl = fd_vinyl_init( NULL, 0UL, 0UL, level, _vinyl,
- _cnc, cnc_footprint,
- _meta, meta_footprint,
- _line, line_footprint,
- _ele, ele_footprint,
- _obj, obj_footprint,
- io, seed, wksp, async_min, async_max,
- part_thresh, gc_thresh, gc_eager, style );
-
- FD_TEST( vinyl );
-
- FD_LOG_NOTICE(( "Vinyl booting" ));
-
- fd_tile_exec_t * exec = fd_tile_exec_new( 1UL, fd_vinyl_tile, 0, (char **)vinyl );
- FD_TEST( exec );
-
- fd_vinyl_rq_t * rq = fd_vinyl_rq_join( fd_vinyl_rq_new( _rq, rq_max ) ); FD_TEST( rq );
- fd_vinyl_cq_t * cq = fd_vinyl_cq_join( fd_vinyl_cq_new( _cq, cq_max ) ); FD_TEST( cq );
-
- fd_cnc_t * cnc = fd_cnc_join( _cnc ); FD_TEST( cnc );
- FD_TEST( fd_cnc_wait( cnc, FD_VINYL_CNC_SIGNAL_BOOT, (long)5e9, NULL )==FD_VINYL_CNC_SIGNAL_RUN );
-
- FD_LOG_NOTICE(( "Vinyl running" ));
-
- ulong funk_seed = 9876UL;
- ulong funk_footprint = fd_funk_shmem_footprint( txn_max, rec_max );
- ulong lock_footprint = fd_funk_locks_footprint( txn_max, rec_max );
- void * shfunk = fd_wksp_alloc_laddr( wksp, fd_funk_align(), funk_footprint, tag );
- void * shlocks = fd_wksp_alloc_laddr( wksp, fd_funk_align(), lock_footprint, tag );
- FD_TEST( shfunk );
- FD_TEST( shlocks );
- FD_TEST( fd_funk_shmem_new( shfunk, tag, funk_seed, txn_max, rec_max ) );
- FD_TEST( fd_funk_locks_new( shlocks, txn_max, rec_max ) );
-
- ulong req_pool_footprint = fd_vinyl_req_pool_footprint( 2UL, 4UL );
- FD_TEST( req_pool_footprint );
- void * _req_pool = fd_wksp_alloc_laddr( wksp, fd_vinyl_req_pool_align(), req_pool_footprint, tag );
- FD_TEST( _req_pool );
- void * req_pool = fd_vinyl_req_pool_new( _req_pool, 2UL, 4UL );
- FD_TEST( req_pool );
-
- FD_LOG_NOTICE(( "Connecting client to vinyl" ));
-
- FD_TEST( !fd_vinyl_client_join( cnc, rq, cq, wksp, link_id, burst_max, quota_max ) );
-
- fd_accdb_user_t accdb[1];
- FD_TEST( fd_accdb_user_v2_init( accdb, shfunk, shlocks, _rq, wksp, req_pool, link_id, txn_max ) );
- FD_TEST( accdb->base.accdb_type == FD_ACCDB_TYPE_V2 );
-
- FD_LOG_NOTICE(( "Running tests" ));
-
- run_tests( accdb );
-
- FD_LOG_NOTICE(( "Cleaning up" ));
-
- fd_accdb_admin_t admin[1];
- FD_TEST( fd_accdb_admin_v1_init( admin, shfunk, shlocks ) );
- fd_accdb_v1_clear( admin );
- fd_accdb_admin_fini( admin );
-
- fd_accdb_user_fini( accdb );
-
- FD_TEST( !fd_vinyl_client_leave( cnc, link_id ) );
-
- FD_LOG_NOTICE(( "Vinyl stopping" ));
- FD_TEST( !fd_vinyl_halt( cnc ) );
- FD_TEST( fd_cnc_leave( cnc )==_cnc );
-
- fd_tile_exec_delete( exec, NULL );
-
- FD_TEST( fd_vinyl_cq_delete( fd_vinyl_cq_leave( cq ) )==_cq );
- FD_TEST( fd_vinyl_rq_delete( fd_vinyl_rq_leave( rq ) )==_rq );
-
- FD_TEST( fd_vinyl_fini( vinyl )==_vinyl );
- FD_TEST( fd_vinyl_io_fini( io )==_io );
-
- fd_wksp_free_laddr( fd_vinyl_req_pool_delete( req_pool ) );
- fd_wksp_free_laddr( shlocks );
- fd_wksp_free_laddr( fd_funk_delete( shfunk ) );
- fd_wksp_free_laddr( _cq );
- fd_wksp_free_laddr( _rq );
- fd_wksp_free_laddr( _obj );
- fd_wksp_free_laddr( _ele );
- fd_wksp_free_laddr( _line );
- fd_wksp_free_laddr( _meta );
- fd_wksp_free_laddr( _cnc );
- fd_wksp_free_laddr( _vinyl );
- fd_wksp_free_laddr( _dev );
- fd_wksp_free_laddr( _io );
-
- fd_wksp_usage_t wksp_usage;
- FD_TEST( fd_wksp_usage( wksp, NULL, 0UL, &wksp_usage ) );
- FD_TEST( wksp_usage.free_cnt==wksp_usage.total_cnt );
-
- if( _wksp ) fd_wksp_detach( wksp );
- else fd_wksp_delete_anonymous( wksp );
-
- FD_LOG_NOTICE(( "pass" ));
- fd_halt();
- return 0;
-}
diff --git a/src/flamenco/runtime/fd_executor.c b/src/flamenco/runtime/fd_executor.c
index 96341be68f8..f489339bdf0 100644
--- a/src/flamenco/runtime/fd_executor.c
+++ b/src/flamenco/runtime/fd_executor.c
@@ -1524,12 +1524,87 @@ fd_executor_setup_accounts_for_txn( fd_runtime_t * runtime,
txn_out->accounts.rollback_nonce_mem = writable_accs_mem[ writable_account_cnt+1UL ];
ushort executable_idx = 0U;
- for( ushort i=0; i<txn_out->accounts.cnt; i++ ) {
- fd_executor_setup_txn_account( runtime, bank, txn_in, txn_out, i, writable_accs_mem, &writable_accs_idx );
- fd_account_meta_t * meta = txn_out->accounts.account[ i ].meta;
- if( FD_UNLIKELY( meta && memcmp( meta->owner, fd_solana_bpf_loader_upgradeable_program_id.key, sizeof(fd_pubkey_t) ) == 0 ) ) {
- fd_executor_setup_executable_account( runtime, bank, meta, &executable_idx );
+ if( FD_LIKELY( !txn_in->bundle.is_bundle ) ) {
+ /* Fast path: batch fetch all accounts from DB in one call.
+ This amortizes I/O wait time across all accounts rather than
+ issuing individual lookups per account. */
+
+ fd_funk_txn_xid_t xid = { .ul = { fd_bank_slot_get( bank ), bank->data->idx } };
+ ushort acct_cnt = (ushort)txn_out->accounts.cnt;
+
+ fd_accdb_open_ro_multi( runtime->accdb,
+ txn_out->accounts.account->ro,
+ &xid,
+ txn_out->accounts.keys,
+ acct_cnt );
+
+ for( ushort i=0; i<acct_cnt; i++ ) {
+ fd_pubkey_t const * address = &txn_out->accounts.keys[ i ];
+ fd_accdb_rw_t * ref_slot = &txn_out->accounts.account[ i ];
+ fd_accdb_rw_t * account = ref_slot;
+
+ /* For non-existent accounts (zero lamports), close the DB
+ reference and treat as not found. */
+ if( fd_accdb_ref_lamports( account->ro )==0UL ) {
+ fd_accdb_close_ref( runtime->accdb, ref_slot->ref );
+ account = NULL;
+ }
+
+ if( txn_out->accounts.is_writable[ i ] ) {
+ uchar * new_raw_data = writable_accs_mem[ writable_accs_idx ];
+ ulong dlen = !!account ? fd_accdb_ref_data_sz( (fd_accdb_ro_t *)account ) : 0UL;
+ writable_accs_idx++;
+
+ if( FD_LIKELY( account ) ) {
+ fd_memcpy( new_raw_data, account->meta, sizeof(fd_account_meta_t)+dlen );
+ fd_accdb_close_ro( runtime->accdb, (fd_accdb_ro_t *)account );
+ } else {
+ fd_account_meta_init( (fd_account_meta_t *)new_raw_data );
+ }
+
+ account = fd_accdb_rw_init_nodb(
+ (fd_accdb_rw_t *)ref_slot,
+ address,
+ (fd_account_meta_t *)new_raw_data,
+ FD_RUNTIME_ACC_SZ_MAX
+ );
+
+ } else {
+ if( FD_UNLIKELY( fd_pubkey_eq( address, &fd_sysvar_instructions_id ) ) ) {
+ if( FD_LIKELY( account ) ) {
+ fd_accdb_close_ro( runtime->accdb, (fd_accdb_ro_t *)account );
+ }
+ fd_account_meta_t * meta = fd_account_meta_init( (void *)runtime->accounts.sysvar_instructions_mem );
+ account = (fd_accdb_rw_t *)fd_accdb_ro_init_nodb( (fd_accdb_ro_t *)ref_slot, address, meta );
+ } else if( FD_LIKELY( account ) ) {
+ /* transfer ownership of DB reference to runtime struct;
+ reference is freed in cancel/commit */
+ } else {
+ account = (fd_accdb_rw_t *)fd_accdb_ro_init_nodb( (fd_accdb_ro_t *)ref_slot, address, &FD_ACCOUNT_META_DEFAULT );
+ }
+ }
+
+ runtime->accounts.starting_lamports[i] = fd_accdb_ref_lamports( account->ro );
+ runtime->accounts.starting_dlen[i] = fd_accdb_ref_data_sz ( account->ro );
+ runtime->accounts.refcnt[i] = 0UL;
+
+ fd_account_meta_t * meta = txn_out->accounts.account[ i ].meta;
+ if( FD_UNLIKELY( meta && memcmp( meta->owner, fd_solana_bpf_loader_upgradeable_program_id.key, sizeof(fd_pubkey_t) ) == 0 ) ) {
+ fd_executor_setup_executable_account( runtime, bank, meta, &executable_idx );
+ }
+ }
+
+ } else {
+ /* Bundle path: per-account setup since accounts may reference
+ previous transactions in the bundle. */
+ for( ushort i=0; i<txn_out->accounts.cnt; i++ ) {
+ fd_executor_setup_txn_account( runtime, bank, txn_in, txn_out, i, writable_accs_mem, &writable_accs_idx );
+ fd_account_meta_t * meta = txn_out->accounts.account[ i ].meta;
+
+ if( FD_UNLIKELY( meta && memcmp( meta->owner, fd_solana_bpf_loader_upgradeable_program_id.key, sizeof(fd_pubkey_t) ) == 0 ) ) {
+ fd_executor_setup_executable_account( runtime, bank, meta, &executable_idx );
+ }
}
}
diff --git a/src/vinyl/Local.mk b/src/vinyl/Local.mk
index 24856df403e..4e10e28ef81 100644
--- a/src/vinyl/Local.mk
+++ b/src/vinyl/Local.mk
@@ -1,13 +1,7 @@
ifdef FD_HAS_LZ4
$(call make-lib,fd_vinyl)
$(call add-hdrs,fd_vinyl_base.h fd_vinyl.h)
-$(call add-objs,fd_vinyl_base fd_vinyl_recover fd_vinyl_compact fd_vinyl_cmd fd_vinyl fd_vinyl_exec,fd_vinyl)
-ifdef FD_HAS_HOSTED
-$(call make-bin,fd_vinyl_ctl,fd_vinyl_ctl,fd_vinyl fd_tango fd_util)
-endif
+$(call add-objs,fd_vinyl_base fd_vinyl,fd_vinyl)
$(call make-unit-test,test_vinyl_base,test_vinyl_base,fd_vinyl fd_tango fd_util)
-ifdef FD_HAS_HOSTED
-$(call make-unit-test,test_vinyl_req,test_vinyl_req,fd_vinyl fd_tango fd_util)
-endif
$(call run-unit-test,test_vinyl_base)
endif
diff --git a/src/vinyl/data/fd_vinyl_data.c b/src/vinyl/data/fd_vinyl_data.c
index 4a8d30fc30a..63eab51f035 100644
--- a/src/vinyl/data/fd_vinyl_data.c
+++ b/src/vinyl/data/fd_vinyl_data.c
@@ -6,6 +6,23 @@ fd_vinyl_data_szc_all_blocks( ulong szc ) {
return ((1UL << ((int)fd_vinyl_data_szc_cfg[ szc ].obj_cnt - 1)) << 1) - 1UL;
}
+static inline void
+fd_vinyl_data_lock( int * lock ) {
+ for(;;) {
+ if( FD_LIKELY( !FD_VOLATILE_CONST( *lock ) ) ) {
+ if( FD_LIKELY( !FD_ATOMIC_CAS( lock, 0, 1 ) ) ) break;
+ }
+ FD_SPIN_PAUSE();
+ }
+ FD_COMPILER_MFENCE();
+}
+
+static inline void
+fd_vinyl_data_unlock( int * lock ) {
+ FD_COMPILER_MFENCE();
+ FD_VOLATILE( *lock ) = 0;
+}
+
FD_FN_CONST static inline ulong
fd_vinyl_data_obj_off( void const * laddr0,
fd_vinyl_data_obj_t const * obj ) {
@@ -152,10 +169,11 @@ fd_vinyl_data_fini( fd_vinyl_data_t * data ) {
return data;
}
-/* Note: the algorithms below is identical to fd_alloc. But since it
- is running single threaded and non-persistent, there's less atomic
- operation and/or address translation shenanigans going on. See
- fd_alloc for more in depth discussions. */
+/* Note: the algorithms below are similar to fd_alloc's. Since the
+ data cache is non-persistent, there are fewer address translation
+ shenanigans going on. Concurrency is handled via per-sizeclass
+ spinlocks with a strict ascending lock order (szc < parent_szc <
+ vol_lock) to prevent deadlock. See fd_alloc for more in-depth discussion. */
fd_vinyl_data_obj_t *
fd_vinyl_data_alloc( fd_vinyl_data_t * data,
@@ -164,8 +182,11 @@ fd_vinyl_data_alloc( fd_vinyl_data_t * data,
FD_CRIT( data, "NULL data" );
FD_CRIT( szcladdr0;
- fd_vinyl_data_vol_t * vol = data->vol;
+ void * laddr0 = data->laddr0;
+ fd_vinyl_data_vol_t * vol = data->vol;
+
+ fd_vinyl_data_lock( &data->superblock[ szc ].lock );
+
fd_vinyl_data_obj_t ** _active = &data->superblock[ szc ].active;
fd_vinyl_data_obj_t ** _inactive_top = &data->superblock[ szc ].inactive_top;
@@ -200,8 +221,11 @@ fd_vinyl_data_alloc( fd_vinyl_data_t * data,
ulong parent_szc = (ulong)fd_vinyl_data_szc_cfg[ szc ].parent_szc;
if( FD_LIKELY( parent_szc szc. */
+
superblock = fd_vinyl_data_alloc( data, parent_szc );
- if( FD_UNLIKELY( !superblock ) ) return NULL;
+ if( FD_UNLIKELY( !superblock ) ) { fd_vinyl_data_unlock( &data->superblock[ szc ].lock ); return NULL; }
/* superblock->type init by obj_alloc to ALLOC, reset below */
/* superblock->szc init by obj_alloc */
@@ -212,10 +236,18 @@ fd_vinyl_data_alloc( fd_vinyl_data_t * data,
} else {
+ fd_vinyl_data_lock( &data->vol_lock );
+
ulong vol_idx = data->vol_idx_free;
- if( FD_UNLIKELY( vol_idx >= data->vol_cnt ) ) return NULL;
+ if( FD_UNLIKELY( vol_idx >= data->vol_cnt ) ) {
+ fd_vinyl_data_unlock( &data->vol_lock );
+ fd_vinyl_data_unlock( &data->superblock[ szc ].lock );
+ return NULL;
+ }
data->vol_idx_free = vol[ vol_idx ].obj->idx;
+ fd_vinyl_data_unlock( &data->vol_lock );
+
superblock = vol[ vol_idx ].obj;
/* superblock->type init below */
@@ -250,43 +282,20 @@ fd_vinyl_data_alloc( fd_vinyl_data_t * data,
superblock->free_blocks = free_blocks;
/* If this superblock still has free blocks in it, return it to
- circulation for future allocation as szc's active superblock,
- pushing any displaced superblock onto szc's inactive superblock
- stack. Other strategies are possible, see fd_alloc for discussion
- of tradeoffs. */
-
-# if 0
-
- if( FD_LIKELY( free_blocks ) ) {
-
- fd_vinyl_data_obj_t * displaced_superblock = *_active;
- *_active = superblock;
-
- if( FD_UNLIKELY( displaced_superblock ) ) {
-
- FD_ALERT( !fd_vinyl_data_superblock_test( data, displaced_superblock, szc ), "corruption detected" );
-
- displaced_superblock->next_off = fd_vinyl_data_obj_off( laddr0, *_inactive_top );
- *_inactive_top = displaced_superblock;
-
- }
-
- }
-
-# else
-
- /* For a non-concurrent implementation, we know szc has no active
- superblock active at this point (because their's no concurrent
- alloc or free that could have set it behind our back). We don't
- have to worry about displacing a superblock, simplifying the
- above. */
+ circulation for future allocation as szc's active superblock.
+ Since we hold lock[szc], we know szc has no active superblock at
+ this point (no concurrent alloc or free can set it behind our back
+ while we hold the lock). We don't have to worry about displacing
+ a superblock, simplifying this. Other strategies are possible, see
+ fd_alloc for discussion of tradeoffs. */
fd_vinyl_data_obj_t * tmp[1];
*(free_blocks ? _active : tmp) = superblock; /* branchless conditional store */
-# endif
+ fd_vinyl_data_unlock( &data->superblock[ szc ].lock );
- /* Initialize the allocated object metadata and return. */
+ /* Initialize the allocated object metadata and return. The object
+ is not yet visible to other threads so no lock is needed here. */
fd_vinyl_data_obj_t * obj = (fd_vinyl_data_obj_t *)( (ulong)superblock + sizeof(fd_vinyl_data_obj_t)
+ idx*fd_vinyl_data_szc_obj_footprint( szc ) );
@@ -325,10 +334,14 @@ fd_vinyl_data_free( fd_vinyl_data_t * data,
if( FD_UNLIKELY( szc>=FD_VINYL_DATA_SZC_CNT ) ) {
FD_CRIT( idx < data->vol_cnt, "corruption detected" ); /* valid idx for vol */
+ fd_vinyl_data_lock( &data->vol_lock );
+
obj->type = FD_VINYL_DATA_OBJ_TYPE_FREEVOL; /* Mark as on the free stack */
obj->idx = data->vol_idx_free;
data->vol_idx_free = idx;
+ fd_vinyl_data_unlock( &data->vol_lock );
+
return;
}
@@ -337,6 +350,8 @@ fd_vinyl_data_free( fd_vinyl_data_t * data,
/* At this point, obj appears to be contained in a superblock at
position idx. Mark the object as free in the superblock. */
+ fd_vinyl_data_lock( &data->superblock[ szc ].lock );
+
fd_vinyl_data_obj_t * superblock = (fd_vinyl_data_obj_t *)
((ulong)obj - sizeof(fd_vinyl_data_obj_t) - idx*fd_vinyl_data_szc_obj_footprint( szc ));
@@ -359,7 +374,7 @@ fd_vinyl_data_free( fd_vinyl_data_t * data,
superblock onto the szc's inactive superblock stack.
Otherwise, if this free made the superblock totally empty, we check
- if the szc'c inactive superblock top is also totally empty. If so,
+ if the szc's inactive superblock top is also totally empty. If so,
we pop the inactive stack and free that.
This keeps a small bounded supply empty superblocks around for fast
@@ -399,7 +414,14 @@ fd_vinyl_data_free( fd_vinyl_data_t * data,
data->superblock[ szc ].inactive_top = fd_vinyl_data_obj_ptr( data->laddr0, candidate_superblock->next_off );
+ /* Recursive free of the empty superblock. Its szc is
+ parent_szc > szc, so lock ordering is preserved. We
+ release our lock first since szc state is consistent. */
+
+ fd_vinyl_data_unlock( &data->superblock[ szc ].lock );
+
fd_vinyl_data_free( data, candidate_superblock );
+ return;
}
}
@@ -408,6 +430,8 @@ fd_vinyl_data_free( fd_vinyl_data_t * data,
}
+ fd_vinyl_data_unlock( &data->superblock[ szc ].lock );
+
}
static FD_FOR_ALL_BEGIN( fd_vinyl_data_reset_task, 1L ) {
@@ -453,9 +477,11 @@ fd_vinyl_data_reset( fd_tpool_t * tpool, ulong t0, ulong t1, int level,
FD_FOR_ALL( fd_vinyl_data_reset_task, tpool,t0,t1, 0L,(long)data->vol_cnt, data, level );
+ data->vol_lock = 0;
data->vol_idx_free = 0UL;
for( ulong szc=0UL; szc<FD_VINYL_DATA_SZC_CNT; szc++ ) {
+ data->superblock[ szc ].lock = 0;
data->superblock[ szc ].active = NULL;
data->superblock[ szc ].inactive_top = NULL;
}
diff --git a/src/vinyl/data/fd_vinyl_data.h b/src/vinyl/data/fd_vinyl_data.h
index 7664454ad37..4e7437dd521 100644
--- a/src/vinyl/data/fd_vinyl_data.h
+++ b/src/vinyl/data/fd_vinyl_data.h
@@ -7,10 +7,10 @@
lockfree operated on by multiple threads in other address spaces and
async direct I/O hardware concurrently.
- Note that, though pairs are cached in a shared memory region, this is
- not a persistent or concurrent datastructure. Specifically, only the
- vinyl tile can allocate or free objects from it and then can do only
- sequentially.
+ Pairs are cached in a shared memory region. The allocator is thread
+ safe: multiple threads may concurrently allocate and free objects
+ using per-sizeclass spinlocks with a consistent lock ordering
+ (lock[szc] < lock[parent_szc] < vol_lock) to prevent deadlock.
Notes:
@@ -26,10 +26,9 @@
The algorithms that manage the allocations are virtually identical to
fd_groove and fd_alloc. But they have been simplified, customized
and optimized for this use case (e.g. minimal need for address
- translation, no need for atomic operations, no need for concurrency
- group optimizations, no need to layout cache for concurrent access,
- much more fine grained size classes for minimal data store overheads,
- etc). This also does extensive (and compile time configurable)
+ translation, simple spinlock concurrency, much more fine grained
+ size classes for minimal data store overheads, etc). This also does
+ extensive (and compile time configurable)
memory data integrity continuously to help catch memory corruption
(either due to hardware failures, buggy usage or malicious usage).
@@ -258,6 +257,23 @@ FD_PROTOTYPES_BEGIN
fd_vinyl_data_* mirror the above but they take the value region as
input. */
+/* fd_vinyl_data_laddr resolves a data object global address to a local
+ address given the data cache's laddr0 (i.e. the workspace base).
+ Returns NULL if gaddr is 0.
+
+ fd_vinyl_data_gaddr converts a local address to a data object global
+ address given the data cache's laddr0. */
+
+FD_FN_CONST static inline void *
+fd_vinyl_data_laddr( ulong gaddr, void * laddr0 ) {
+ return (void *)fd_ulong_if( !!gaddr, (ulong)laddr0 + gaddr, 0UL );
+}
+
+FD_FN_CONST static inline ulong
+fd_vinyl_data_gaddr( void const * laddr, void const * laddr0 ) {
+ return fd_ulong_if( !!laddr, (ulong)laddr - (ulong)laddr0, 0UL );
+}
+
FD_FN_CONST static inline fd_vinyl_bstream_phdr_t *
fd_vinyl_data_obj_phdr( fd_vinyl_data_obj_t const * obj ) {
return (fd_vinyl_bstream_phdr_t *)((ulong)obj + sizeof(fd_vinyl_data_obj_t));
@@ -343,8 +359,10 @@ struct __attribute((aligned(FD_VINYL_DATA_ALIGN))) fd_vinyl_data {
(FD_VINYL_BSTREAM_BLOCK_SZ aligned) */
fd_vinyl_data_vol_t * vol; /* Vols, indexed [0,vol_cnt), in raw shared memory region */
ulong vol_cnt; /* Num vols, in [0,FD_VINYL_DATA_VOL_MAX) */
+ int vol_lock; /* Spinlock protecting vol_idx_free */
ulong vol_idx_free; /* Idx of first free volume if in [0,vol_cnt), no free volumes o.w. */
struct {
+ int lock; /* Spinlock protecting this size class */
fd_vinyl_data_obj_t * active; /* active superblock for this size class */
fd_vinyl_data_obj_t * inactive_top; /* top of the inactive superblock stack for this size class */
} superblock[ FD_VINYL_DATA_SZC_CNT ];
@@ -436,23 +454,25 @@ fd_vinyl_data_is_valid_obj( void const * laddr,
/* fd_vinyl_data_alloc acquires an object of sizeclass szc from the data
cache. Returns a pointer to the object on success and NULL if there
- is no space available in the data. Will FD_LOG_CRIT if anything
- wonky is detected (bad, memory corruption, etc). */
+ is no space available in the data. Thread safe. Will FD_LOG_CRIT
+ if anything wonky is detected (bad, memory corruption, etc). */
fd_vinyl_data_obj_t *
fd_vinyl_data_alloc( fd_vinyl_data_t * data,
ulong szc );
/* fd_vinyl_data_free releases obj to the data cache. This cannot fail
- from the caller's perspective. Will FD_LOG_CRIT if anything wonky is
- detected (bad args, memory corruption, etc). */
+ from the caller's perspective. Thread safe. Will FD_LOG_CRIT if
+ anything wonky is detected (bad args, memory corruption, etc). */
void
fd_vinyl_data_free( fd_vinyl_data_t * data,
fd_vinyl_data_obj_t * obj );
/* fd_vinyl_data_reset uses the caller and tpool threads (t0,t1) to free
- all objects from the data cache. level zero/non-zero indicates to do
+ all objects from the data cache. Not thread safe with concurrent
+ alloc/free; caller must ensure exclusive access. level zero/non-zero
+ indicates to do
soft/hard reset. In a hard reset, the shmem region is zero'd before
formatting it into a set of free data volumes. This cannot fail from
the caller's perspective. Assumes tpool threads (t0,t1) are
@@ -465,7 +485,9 @@ fd_vinyl_data_reset( fd_tpool_t * tpool, ulong t0, ulong t1, int level,
/* fd_vinyl_data_verify returns FD_VINYL_SUCCESS (0) if the given data
appears to be a valid vinyl data and FD_VINYL_ERR_CORRUPT (negative)
- otherwise (logs details). This only verifies the vinyl data's state
+ otherwise (logs details). Not thread safe with concurrent
+ alloc/free; caller must ensure exclusive access. This only verifies
+ the vinyl data's state
and superblock heirarchy are intact. It does not test any of the
allocations for correctness (but could given access to the bstream,
line and/or meta). */
diff --git a/src/vinyl/data/test_vinyl_data.c b/src/vinyl/data/test_vinyl_data.c
index 87aa726c5c0..d4df92e3630 100644
--- a/src/vinyl/data/test_vinyl_data.c
+++ b/src/vinyl/data/test_vinyl_data.c
@@ -14,7 +14,7 @@ FD_STATIC_ASSERT( sizeof (fd_vinyl_data_obj_t)==FD_VINYL_BSTREAM_BLOCK_SZ, unit_
FD_STATIC_ASSERT( FD_VINYL_DATA_VOL_FOOTPRINT==34078592UL, unit_test );
FD_STATIC_ASSERT( FD_VINYL_DATA_ALIGN == 128UL, unit_test );
-FD_STATIC_ASSERT( FD_VINYL_DATA_FOOTPRINT==5376UL, unit_test );
+FD_STATIC_ASSERT( FD_VINYL_DATA_FOOTPRINT==7936UL, unit_test );
FD_STATIC_ASSERT( alignof(fd_vinyl_data_vol_t)==FD_VINYL_BSTREAM_BLOCK_SZ, unit_test );
FD_STATIC_ASSERT( sizeof (fd_vinyl_data_vol_t)==FD_VINYL_DATA_VOL_FOOTPRINT, unit_test );
diff --git a/src/vinyl/fd_vinyl.c b/src/vinyl/fd_vinyl.c
index bb5e0f10cd0..fc6f764f8c0 100644
--- a/src/vinyl/fd_vinyl.c
+++ b/src/vinyl/fd_vinyl.c
@@ -10,124 +10,6 @@ fd_vinyl_footprint( void ) {
return sizeof(fd_vinyl_t);
}
-fd_vinyl_t *
-fd_vinyl_init( fd_tpool_t * tpool, ulong t0, ulong t1, int level,
- void * _vinyl,
- void * _cnc, ulong cnc_footprint,
- void * _meta, ulong meta_footprint,
- void * _line, ulong line_footprint,
- void * _ele, ulong ele_footprint,
- void * _obj, ulong obj_footprint,
- fd_vinyl_io_t * io,
- ulong seed,
- void * obj_laddr0,
- ulong async_min,
- ulong async_max,
- ulong part_thresh,
- ulong gc_thresh,
- int gc_eager,
- int style ) {
- if( t1<=t0 ) t0 = 0UL, t1 = 1UL;
-
- FD_LOG_NOTICE(( "Testing vinyl configuration" ));
-
-# define TEST( c ) do { if( FD_UNLIKELY( !(c) ) ) { FD_LOG_WARNING(( "FAIL: %s", #c )); return NULL; } } while(0)
-
- TEST( _vinyl ); TEST( fd_ulong_is_aligned( (ulong)_vinyl, fd_vinyl_align() ) );
- TEST( _cnc ); TEST( fd_ulong_is_aligned( (ulong)_cnc, fd_cnc_align() ) );
- TEST( _meta ); TEST( fd_ulong_is_aligned( (ulong)_meta, fd_vinyl_meta_align() ) );
- TEST( _line ); TEST( fd_ulong_is_aligned( (ulong)_line, alignof(fd_vinyl_line_t) ) );
- TEST( _ele ); TEST( fd_ulong_is_aligned( (ulong)_ele, alignof(fd_vinyl_meta_ele_t) ) );
- TEST( _obj ); TEST( fd_ulong_is_aligned( (ulong)_obj, alignof(fd_vinyl_data_obj_t) ) );
-
- TEST( cnc_footprint >= fd_cnc_footprint( FD_VINYL_CNC_APP_SZ ) );
-
- ulong ele_max = fd_ulong_pow2_dn( ele_footprint / sizeof( fd_vinyl_meta_ele_t ) );
- ulong lock_cnt = fd_vinyl_meta_lock_cnt_est( ele_max );
- ulong probe_max = ele_max;
-
- TEST( ele_max>=4UL );
- TEST( meta_footprint >= fd_vinyl_meta_footprint( ele_max, lock_cnt, probe_max ) );
-
- ulong pair_max = ele_max - 1UL;
- ulong line_cnt = fd_ulong_min( line_footprint / sizeof( fd_vinyl_line_t ), pair_max );
-
- TEST( (3UL<=line_cnt) & (line_cnt<=FD_VINYL_LINE_MAX) );
-
- TEST( io );
-
- /* seed is arb */
-
- TEST( (0ULcnc = fd_cnc_join( fd_cnc_new( _cnc, FD_VINYL_CNC_APP_SZ, FD_VINYL_CNC_TYPE, fd_log_wallclock() ) ); TEST( vinyl->cnc );
- vinyl->line = (fd_vinyl_line_t *)_line;
- vinyl->io = io;
-
- vinyl->line_cnt = line_cnt;
- vinyl->pair_max = pair_max;
- vinyl->async_min = async_min;
- vinyl->async_max = async_max;
-
- vinyl->part_thresh = part_thresh;
- vinyl->gc_thresh = gc_thresh;
- vinyl->gc_eager = gc_eager;
- vinyl->style = style;
- vinyl->line_idx_lru = 0U;
- vinyl->pair_cnt = 0UL;
- vinyl->garbage_sz = 0UL;
-
- TEST( fd_vinyl_meta_join( vinyl->meta, fd_vinyl_meta_new( _meta, ele_max, lock_cnt, probe_max, seed ), _ele )==vinyl->meta );
-
- TEST( fd_vinyl_data_init( vinyl->data, _obj, obj_footprint, obj_laddr0 )==vinyl->data );
-
- vinyl->cnc_footprint = cnc_footprint;
- vinyl->meta_footprint = meta_footprint;
- vinyl->line_footprint = line_footprint;
- vinyl->ele_footprint = ele_footprint;
- vinyl->obj_footprint = obj_footprint;
-
- FD_LOG_NOTICE(( "Recovering bstream past (level %i)", level ));
-
- TEST( fd_vinyl_seq_eq( fd_vinyl_recover( tpool,t0,t1, level, vinyl ), fd_vinyl_io_seq_present( io ) ) );
-
-# undef TEST
-
- FD_LOG_NOTICE(( "Initializing complete" ));
-
- return vinyl;
-}
-
void *
fd_vinyl_fini( fd_vinyl_t * vinyl ) {
diff --git a/src/vinyl/fd_vinyl.h b/src/vinyl/fd_vinyl.h
index 89a1d72cf25..7b638e697ab 100644
--- a/src/vinyl/fd_vinyl.h
+++ b/src/vinyl/fd_vinyl.h
@@ -109,36 +109,6 @@ FD_PROTOTYPES_BEGIN
FD_FN_CONST ulong fd_vinyl_align ( void );
FD_FN_CONST ulong fd_vinyl_footprint( void );
-/* fd_vinyl_init uses the the caller (typically tpool thread t0) and
- tpool threads (t0,t1) to init the vinyl structure (this structure can
- be extremely large ... hundreds of gigabytes to terabytes in memory
- for petabytes or more in persistent storage ... so it is worthwhile
- to parallelize the initialization). The bstream's past will be used
- to recover the vinyl instance to the bstream's seq_present. The
- recovery level is given by level (see fd_vinyl_recover below).
- Assumes tpool threads (t0,t1) are available for dispatch. These
- threads will be avaialble for dispatch on return. Retain no interest
- in tpool. If tpool is NULL and/or the set [t0,t1) is empty/invalid,
- uses a serial algorithm for initialization. */
-
-fd_vinyl_t *
-fd_vinyl_init( fd_tpool_t * tpool, ulong t0, ulong t1, int level,
- void * lmem, /* memory region to hold the vinyl's state */
- void * shcnc, ulong cnc_footprint, /* memory region to use for the tile cnc */
- void * shmeta, ulong meta_footprint, /* memory region to use for the cached pair metadata state */
- void * shline, ulong line_footprint, /* memory region to use for the cached pair state */
- void * shele, ulong ele_footprint, /* memory region to use for the cached pair metadata */
- void * shobj, ulong obj_footprint, /* memory region to use for the cached pairs */
- fd_vinyl_io_t * io, /* interface to the underlying bstream */
- ulong seed,
- void * obj_laddr0,
- ulong async_min,
- ulong async_max,
- ulong part_thresh,
- ulong gc_thresh,
- int gc_eager,
- int style );
-
void *
fd_vinyl_fini( fd_vinyl_t * vinyl );
@@ -204,74 +174,6 @@ FD_FN_PURE static inline ulong fd_vinyl_gc_thresh ( fd_vinyl_t const * vinyl )
FD_FN_PURE static inline int fd_vinyl_gc_eager ( fd_vinyl_t const * vinyl ) { return vinyl->gc_eager; }
FD_FN_PURE static inline int fd_vinyl_style ( fd_vinyl_t const * vinyl ) { return vinyl->style; }
-/* fd_vinyl_compact does up to compact_max rounds of compaction to the
- bstream's past. This cannot fail from the caller's perspective (will
- FD_LOG_CRIT if any corruption is detected). */
-/* FIXME: PRIVATE */
-
-void
-fd_vinyl_compact( fd_vinyl_t * vinyl,
- ulong compact_max );
-
-/* fd_vinyl_recover uses the caller (typically tpool thread t0) and
- tpool threads (t0,t1) to reset the vinyl meta cache, reset the vinyl
- data cache, reset vinyl cache line eviction priorities and repopulate
- the vinyl meta data cache from the current state of the bstream's
- past to the bstream's seq_present. level zero/non-zero indicates to
- do a soft/hard reset. In a soft reset, the data cache region is
- minimally cleared. In a hard reset, it is fully cleared. A hard
- reset is recommended for most usage but a soft reset can allow faster
- startup for rapid iteration during development.
-
- Returns the bstream sequence number of how far recovery got (if this
- is not seq_present, the recovery was partial and it is theoretically
- moves in the recovery were not processed atomically). Logs details
- of any issues encoutered.
-
- Assumes the tpool threads (t0,t1) are available for dispatch.
- Retains no interest in tpool and threads (t0,t1) will be available
- for dispatch on return. */
-/* FIXME: PRIVATE */
-
-ulong
-fd_vinyl_recover( fd_tpool_t * tpool, ulong t0, ulong t1, int level,
- fd_vinyl_t * vinyl );
-
-/* fd_vinyl_exec runs a vinyl tile on the caller. */
-
-void
-fd_vinyl_exec( fd_vinyl_t * vinyl );
-
-int
-fd_vinyl_halt( fd_cnc_t * cnc );
-
-int
-fd_vinyl_sync( fd_cnc_t * cnc );
-
-int
-fd_vinyl_get( fd_cnc_t * cnc,
- int opt,
- ulong * opt_val );
-
-int
-fd_vinyl_set( fd_cnc_t * cnc,
- int opt,
- ulong val,
- ulong * opt_val );
-
-int
-fd_vinyl_client_join( fd_cnc_t * cnc,
- fd_vinyl_rq_t * rq,
- fd_vinyl_cq_t * cq,
- fd_wksp_t * wksp,
- ulong link_id,
- ulong burst_max,
- ulong quota_max );
-
-int
-fd_vinyl_client_leave( fd_cnc_t * cnc,
- ulong link_id );
-
#define FD_VINYL_CNC_SIGNAL_CSTR_BUF_MAX (21UL)
char *
diff --git a/src/vinyl/fd_vinyl_case_acquire.c b/src/vinyl/fd_vinyl_case_acquire.c
deleted file mode 100644
index d88255e6e98..00000000000
--- a/src/vinyl/fd_vinyl_case_acquire.c
+++ /dev/null
@@ -1,399 +0,0 @@
- case FD_VINYL_REQ_TYPE_ACQUIRE: {
- ulong req_flags = (ulong)req->flags;
- fd_vinyl_key_t const * req_key = MAP_REQ_GADDR( req->key_gaddr, fd_vinyl_key_t, batch_cnt );
- ulong * req_val_gaddr = MAP_REQ_GADDR( req->val_gaddr_gaddr, ulong, batch_cnt );
- schar * req_err = MAP_REQ_GADDR( req->err_gaddr, schar, batch_cnt );
-
- int req_flag_modify = fd_vinyl_req_flag_modify( req_flags );
- int req_flag_ignore = fd_vinyl_req_flag_ignore( req_flags );
- int req_flag_create = fd_vinyl_req_flag_create( req_flags );
- int req_flag_excl = fd_vinyl_req_flag_excl ( req_flags );
- int req_evict_prio = fd_vinyl_req_evict_prio ( req_flags );
-
- int bad_gaddr = (!!batch_cnt) & ((!req_key) | (!req_val_gaddr) | (!req_err));
- int bad_quota = quota_remFD_VINYL_VAL_MAX ) ) DONE( FD_VINYL_ERR_INVAL );
- }
-
- /* Query vinyl meta for key */
-
- fd_vinyl_key_t const * key = req_key + batch_idx;
-
- ulong memo = fd_vinyl_key_memo( meta_seed, key );
-
- ulong _ele_idx; /* avoid pointer escape */
- int err = fd_vinyl_meta_query_fast( ele0, ele_max, key, memo, &_ele_idx );
- ulong ele_idx = _ele_idx; /* In [0,ele_max) */
-
- if( FD_LIKELY( !err ) ) { /* pair key meta cached */
-
- /* At this point, pair key either exists at bstream seq_present
- or is in the process of being created. If pair key is being
- created, fail with AGAIN (it must be acquired for modify). */
-
- ulong pair_ctl = ele0[ ele_idx ].phdr.ctl;
-
- FD_CRIT( (fd_vinyl_bstream_ctl_type( pair_ctl )==FD_VINYL_BSTREAM_CTL_TYPE_PAIR) | (pair_ctl==ULONG_MAX),
- "corruption detected" );
-
- if( FD_UNLIKELY( pair_ctl==ULONG_MAX ) ) DONE( FD_VINYL_ERR_AGAIN );
-
- /* At this point, pair key exists at bstream seq_present. */
-
- ulong val_sz = (ulong)ele0[ ele_idx ].phdr.info.val_sz;
- ulong line_idx = ele0[ ele_idx ].line_idx;
-
- FD_CRIT( val_sz<=FD_VINYL_VAL_MAX, "corruption detected" );
- FD_CRIT( (line_idxline_idx==line_idx, "corruption detected" );
-
- ulong line_ctl = line[ line_idx ].ctl;
-
- ulong ver = fd_vinyl_line_ctl_ver( line_ctl );
- long ref = fd_vinyl_line_ctl_ref( line_ctl );
-
- if( FD_LIKELY( !req_flag_modify ) ) {
-
- /* At this point, we are acquiring a cached pair for read.
- If the line is acquired for modify, fail with AGAIN. If
- there are too many acquires for read on this pair, CRIT
- (could consider AGAIN here). Otherwise, we update the
- ref count (don't change the ver), point the client at the
- line caching pair key to finish the acquire. Note that
- we don't validate the pair header if we detect that an
- earlier acquire in this batch started fetching the pair
- because the read might still be in progress (see note
- below for more details). */
-
- if( FD_UNLIKELY( ref<0L ) ) DONE( FD_VINYL_ERR_AGAIN );
- if( FD_UNLIKELY( ref>=FD_VINYL_LINE_REF_MAX ) ) FD_LOG_CRIT(( "too many acquires for read on this pair" ));
-
- if( FD_LIKELY( !obj->rd_active ) ) {
- fd_vinyl_bstream_phdr_t * phdr = fd_vinyl_data_obj_phdr( obj );
-
- FD_CRIT( fd_vinyl_data_obj_val_max( obj ) >= val_sz, "corruption detected" );
- FD_CRIT( phdr->ctl==fd_vinyl_bstream_ctl( FD_VINYL_BSTREAM_CTL_TYPE_PAIR,
- FD_VINYL_BSTREAM_CTL_STYLE_RAW, val_sz ), "corruption detected" );
- FD_CRIT( fd_vinyl_key_eq( &phdr->key, key ), "corruption detected" );
- FD_CRIT( !memcmp( &phdr->info, &ele0[ ele_idx ].phdr.info, sizeof(fd_vinyl_info_t) ), "corruption detected" );
- }
-
- line[ line_idx ].ctl = fd_vinyl_line_ctl( ver, ref+1L ); /* don't bump ver */
-
- req_val_gaddr[ batch_idx ] = (ulong)fd_vinyl_data_obj_val( obj ) - data_laddr0;
-
- DONE( FD_VINYL_SUCCESS );
-
- }
-
- /* At this point, we are acquiring a cached pair for modify.
- If we are not allowed to acquire an existing pair for
- modify (INVAL) or if the line line_idx is already acquired
- for anything (AGAIN), fail. */
-
- if( FD_UNLIKELY( ref ) ) DONE( FD_VINYL_ERR_AGAIN );
- if( FD_UNLIKELY( req_flag_excl ) ) DONE( FD_VINYL_ERR_INVAL );
-
- fd_vinyl_bstream_phdr_t * phdr = fd_vinyl_data_obj_phdr( obj );
-
- FD_CRIT( !obj->rd_active, "corruption detected" );
- FD_CRIT( fd_vinyl_data_obj_val_max( obj ) >= val_sz, "corruption detected" );
- FD_CRIT( phdr->ctl==fd_vinyl_bstream_ctl( FD_VINYL_BSTREAM_CTL_TYPE_PAIR,
- FD_VINYL_BSTREAM_CTL_STYLE_RAW, val_sz ), "corruption detected" );
- FD_CRIT( fd_vinyl_key_eq( &phdr->key, key ), "corruption detected" );
- FD_CRIT( !memcmp( &phdr->info, &ele0[ ele_idx ].phdr.info, sizeof(fd_vinyl_info_t) ), "corruption detected" );
-
- /* If the ignore flag is set, set the cached value size to 0. */
-
- if( req_flag_ignore ) {
- phdr->info.val_sz = 0U;
- val_sz = 0UL;
- }
-
- /* If the current location for the pair key's data isn't
- sufficient to hold the worst case val_sz that the client
- might modify the pair's value into, adjust the space
- available for the pair to the user's val_max. Because we
- might be ignoring the existing value, this could be smaller
- than the current object. (We could chose to not trim in
- this case because it will get trimmed again on release.
- But doing so makes a more consistent guarantee to the
- client and makes testing easier.) */
-
- ulong csz = sizeof(fd_vinyl_bstream_phdr_t) + val_sz;
-
- ulong szc_new = fd_vinyl_data_szc( fd_ulong_max( val_sz, req_val_max ) );
- ulong szc_old = (ulong)obj->szc;
-
- if( FD_UNLIKELY( szc_new != szc_old ) ) {
-
- fd_vinyl_data_obj_t * obj_new = fd_vinyl_data_alloc( data, szc_new );
- if( FD_UNLIKELY( !obj_new ) ) FD_LOG_CRIT(( "increase data cache size" ));
-
- fd_vinyl_bstream_phdr_t * phdr_new = fd_vinyl_data_obj_phdr( obj_new );
-
- memcpy( phdr_new, phdr, csz );
-
- fd_vinyl_data_free( data, obj );
-
- phdr = phdr_new;
- obj = obj_new;
-
- line[ line_idx ].obj = obj; obj->line_idx = line_idx; obj->rd_active = (short)0;
- }
-
- /* Zero out any remaining space in the pair. */
-
- ulong zsz = fd_vinyl_bstream_pair_sz( fd_vinyl_data_szc_val_max( szc_new ) ) - csz;
- memset( ((uchar *)phdr) + csz, 0, zsz );
-
- /* Finish up acquiring for modify */
-
- //line[ line_idx ].obj = ... already init;
- //line[ line_idx ].ele_idx = ... already init;
- line[ line_idx ].ctl = fd_vinyl_line_ctl( ver+1UL, -1L ); /* bump ver */
-
- fd_vinyl_line_evict_prio( &vinyl->line_idx_lru, line, line_cnt, line_idx, req_evict_prio );
-
- //phdr->ctl = ... already init
- //phdr->key = ... already init
- //phdr->info = ... already init
-
- //ele0[ ele_idx ] = ... already init
-
- req_val_gaddr[ batch_idx ] = (ulong)fd_vinyl_data_obj_val( obj ) - data_laddr0;
-
- DONE( FD_VINYL_SUCCESS );
-
- } /* pair key data cached */
-
- /* At this point, pair key is not cached. If we are not allowed
- to acquire this pair, fail. Otherwise, evict the least
- recently used evictable line (this should always be possible
- if quotas are confiured correctly) to make room to cache this
- pair. Connect this line to meta element ele_idx, set the
- line's reference count appropriately, bump the line's version
- and move the line to the desired location in the eviction
- sequence. We don't modify any shared fields in meta element
- ele_idx so we can do the modification fast.
-
- We do this upfront to free data cache for the alloc if the
- LRU line is in use and to handle the same pair appearing
- multiple times in an acquire.
-
- That is, if req_key appears multiple times in an acquire to
- modify, the trailing redundant acquires will see the object
- as cached with ref==-1 and fail with AGAIN. If the key
- appears multiple times in an acquire for read, the trailing
- redundant acquires will see the object as cached with ref>0
- and rd_active==1, conclude that the first redundant acquire
- is in the process of reading the pair into cache, skip any
- racy metadata checks, increase the ref count and succeed.
-
- IMPORTANT SAFETY TIP! Note that this implies that client
- doing an acquire-for-read with redundant keys and with
- speculative processing will see req_err transition to success
- for the trailing redundant items for a key before the leading
- item of that key transitions to success (and thus before the
- object is fully read / verified and/or decoded). It is up to
- the client doing speculative cut through processing to avoid
- redundant keys or react accordingly. */
-
- if( FD_UNLIKELY( req_flag_modify & req_flag_excl ) ) DONE( FD_VINYL_ERR_INVAL );
-
- line_idx = fd_vinyl_line_evict_lru( &vinyl->line_idx_lru, line, line_cnt, ele0, ele_max, data );
-
- ulong line_ctl = line[ line_idx ].ctl;
-
- ulong ver = fd_vinyl_line_ctl_ver( line_ctl );
-
- line[ line_idx ].ele_idx = ele_idx; ele0[ ele_idx ].line_idx = line_idx;
- line[ line_idx ].ctl = fd_vinyl_line_ctl( ver+1UL, req_flag_modify ? -1L : 1L );
-
- fd_vinyl_line_evict_prio( &vinyl->line_idx_lru, line, line_cnt, line_idx, req_evict_prio );
-
- /* Allocate an appropriately sized object to hold this pair,
- connect it to this line and report the location to the client. */
-
- ulong val_max = fd_ulong_if( !req_flag_modify, val_sz,
- fd_ulong_if( !req_flag_ignore, fd_ulong_max( val_sz, req_val_max ),
- req_val_max ) );
-
- ulong szc = fd_vinyl_data_szc( val_max );
-
- fd_vinyl_data_obj_t * obj = fd_vinyl_data_alloc( data, szc );
- if( FD_UNLIKELY( !obj ) ) FD_LOG_CRIT(( "increase data cache size" ));
-
- line[ line_idx ].obj = obj; obj->line_idx = line_idx;
-
- void * val = fd_vinyl_data_obj_val( obj );
-
- req_val_gaddr[ batch_idx ] = (ulong)val - data_laddr0;
-
- /* If we need to do I/O, start reading encoded pair data and
- defer the data integrity and decoding to later (and then in
- whatever order the I/O layer sees fit). */
-
- if( FD_LIKELY( !(req_flag_modify & req_flag_ignore) ) ) {
- obj->rd_active = (short)1;
-
- int style = fd_vinyl_bstream_ctl_style( pair_ctl );
- ulong val_esz = fd_vinyl_bstream_ctl_sz ( pair_ctl );
-
- FD_CRIT( val_esz<=FD_VINYL_VAL_MAX, "corruption detected" );
- FD_CRIT( (style!=FD_VINYL_BSTREAM_CTL_STYLE_RAW) | (val_sz==val_esz), "corruption detected" );
-
- fd_vinyl_data_obj_t * cobj;
-
- if( FD_LIKELY( style==FD_VINYL_BSTREAM_CTL_STYLE_RAW ) ) cobj = obj;
- else {
- cobj = fd_vinyl_data_alloc( data, fd_vinyl_data_szc( val_esz ) );
- if( FD_UNLIKELY( !cobj ) ) FD_LOG_CRIT(( "increase data cache size" ));
- }
-
- cobj->rd->ctx = (ulong)obj;
- cobj->rd->seq = ele0[ ele_idx ].seq;
- cobj->rd->dst = fd_vinyl_data_obj_phdr( cobj );
- cobj->rd->sz = fd_vinyl_bstream_pair_sz( val_esz );
-
- cobj->rd_err = req_err + batch_idx;
-
- fd_vinyl_io_read( io, cobj->rd );
- read_cnt++;
-
- quota_rem--;
- goto next_acquire;
- }
-
- /* At this point, we are acquiring to modify but we don't need
- the existing value. We populate the cached pair header
- appropriately for the modify and zero the rest to complete
- this request immediately. */
-
- obj->rd_active = (short)0;
-
- fd_vinyl_bstream_phdr_t * phdr = fd_vinyl_data_obj_phdr( obj );
-
- phdr->ctl = fd_vinyl_bstream_ctl( FD_VINYL_BSTREAM_CTL_TYPE_PAIR, FD_VINYL_BSTREAM_CTL_STYLE_RAW, val_sz );
- phdr->key = *key;
- phdr->info = ele0[ ele_idx ].phdr.info;
-
- phdr->info.val_sz = 0U;
-
- memset( val, 0, fd_vinyl_data_szc_obj_footprint( szc ) - sizeof(fd_vinyl_data_obj_t) - sizeof(fd_vinyl_bstream_phdr_t) );
-
- DONE( FD_VINYL_SUCCESS );
-
- } /* pair key meta cached */
-
- /* At this point, pair key does not exist at bstream seq_present
- and is not in the process of being created. If we aren't
- allowed to create pair key, fail. Otherwise, evict the least
- recently used evictable line (this should always be possible if
- quotas are configured correctly) to make room to cache this
- pair, set the line's reference count appropriately, bump the
- version and move the line to the desired location in the
- eviction sequence. We do this upfront to free data cache for
- the alloc if the LRU line is in use. */
-
- if( FD_UNLIKELY( !(req_flag_modify & req_flag_create) ) ) DONE( FD_VINYL_ERR_KEY );
-
- ulong line_idx = fd_vinyl_line_evict_lru( &vinyl->line_idx_lru, line, line_cnt, ele0, ele_max, data );
-
- ulong line_ctl = line[ line_idx ].ctl;
-
- ulong ver = fd_vinyl_line_ctl_ver( line_ctl );
-
- line[ line_idx ].ctl = fd_vinyl_line_ctl( ver+1UL, -1L );
-
- fd_vinyl_line_evict_prio( &vinyl->line_idx_lru, line, line_cnt, line_idx, req_evict_prio );
-
- /* Allocate an appropriately sized object to hold this pair and
- connect it to this line. */
-
- ulong szc = fd_vinyl_data_szc( req_val_max );
-
- fd_vinyl_data_obj_t * obj = fd_vinyl_data_alloc( data, szc );
- if( FD_UNLIKELY( !obj ) ) FD_LOG_CRIT(( "increase data cache size" ));
-
- line[ line_idx ].obj = obj; obj->line_idx = line_idx; obj->rd_active = (short)0;
-
- /* Allocate a meta element to hold metadata for this pair and
- connect it to this line. Since we are inserting at meta
- element ele_idx, we don't need to lock anything so long as we
- mark the element as in use very last. */
-
- ulong pair_cnt = vinyl->pair_cnt;
- if( FD_UNLIKELY( pair_cnt>=pair_max ) ) FD_LOG_CRIT(( "increase meta cache size" ));
- vinyl->pair_cnt = pair_cnt + 1UL;
-
- ele0[ ele_idx ].memo = memo;
- //ele0[ ele_idx ].phdr.ctl init below
- ele0[ ele_idx ].phdr.key = *key;
- memset( &ele0[ ele_idx ].phdr.info, 0, sizeof(fd_vinyl_info_t) ); /* sets val_sz to 0 */
- ele0[ ele_idx ].line_idx = line_idx;
- ele0[ ele_idx ].seq = 0UL; /* Will be init on release */
-
- FD_COMPILER_MFENCE();
- ele0[ ele_idx ].phdr.ctl = ULONG_MAX; /* Mark as being created */
- FD_COMPILER_MFENCE();
-
- line[ line_idx ].ele_idx = ele_idx;
-
- /* Initialize the data region for a new pair */
-
- *fd_vinyl_data_obj_phdr( obj ) = ele0[ ele_idx ].phdr;
-
- uchar * val = (uchar *)fd_vinyl_data_obj_val( obj );
-
- memset( val, 0, fd_vinyl_data_szc_obj_footprint( szc ) - sizeof(fd_vinyl_data_obj_t) - sizeof(fd_vinyl_bstream_phdr_t) );
-
- req_val_gaddr[ batch_idx ] = (ulong)val - data_laddr0;
-
- DONE( FD_VINYL_SUCCESS );
-
- next_acquire: /* silly language restriction */;
-
-# undef DONE
-
- } /* for batch_idx */
-
- FD_CRIT( (!read_cnt) | (!(req_flag_modify & req_flag_ignore)), "corruption detected" );
-
- comp_err = FD_VINYL_SUCCESS;
- break;
- }
diff --git a/src/vinyl/fd_vinyl_case_erase.c b/src/vinyl/fd_vinyl_case_erase.c
deleted file mode 100644
index 18adddc6fb1..00000000000
--- a/src/vinyl/fd_vinyl_case_erase.c
+++ /dev/null
@@ -1,108 +0,0 @@
- case FD_VINYL_REQ_TYPE_ERASE: {
-
- fd_vinyl_key_t const * req_key = MAP_REQ_GADDR( req->key_gaddr, fd_vinyl_key_t, batch_cnt );
- schar * req_err = MAP_REQ_GADDR( req->err_gaddr, schar, batch_cnt );
-
- if( FD_UNLIKELY( (!!batch_cnt) & ((!req_key) | (!req_err)) ) ) {
- comp_err = FD_VINYL_ERR_INVAL;
- break;
- }
-
- for( ulong batch_idx=0UL; batch_idxline_idx==line_idx, "corruption detected" );
- FD_CRIT ( !obj->rd_active, "corruption detected" );
-
- ulong ctl = line[ line_idx ].ctl;
-
- ulong ver = fd_vinyl_line_ctl_ver( ctl );
- long ref = fd_vinyl_line_ctl_ref( ctl );
-
- if( FD_UNLIKELY( ref ) ) {
- FD_COMPILER_MFENCE();
- req_err[ batch_idx ] = (schar)FD_VINYL_ERR_AGAIN;
- FD_COMPILER_MFENCE();
- fail_cnt++;
- continue;
- }
-
- line[ line_idx ].obj = NULL;
- line[ line_idx ].ele_idx = ULONG_MAX; //ele0[ ele_idx ].line_idx = ULONG_MAX; /* Technically not necessary given below */
- line[ line_idx ].ctl = fd_vinyl_line_ctl( ver+1UL, 0L ); /* bump version */
-
- fd_vinyl_line_evict_prio( &vinyl->line_idx_lru, line, line_cnt, line_idx, FD_VINYL_LINE_EVICT_PRIO_LRU );
-
- fd_vinyl_data_free( data, obj );
-
- } else {
-
- FD_CRIT( line_idx==ULONG_MAX, "corruption detected" );
-
- }
-
- /* At this point, pair key exists and is not cached. Append a
- dead block and remove it from the meta. This generates two
- pieces of bstream garbage: the old pair and the dead block
- itself (the dead block is only needed for recovery and then
- only while the old pair is in the bstream's past). */
-
- /* FIXME: COMPACT SEQUENTIAL DEADS IN THE BSTREAM TO BE MORE
- SPACE EFFICIENT? */
-
- ulong val_esz = fd_vinyl_bstream_ctl_sz( ele0[ ele_idx ].phdr.ctl );
-
- accum_garbage_cnt += 2UL;
- accum_garbage_sz += fd_vinyl_bstream_pair_sz( val_esz ) + FD_VINYL_BSTREAM_BLOCK_SZ;
-
- fd_vinyl_io_append_dead( io, &ele0[ ele_idx ].phdr, NULL, 0UL );
- append_cnt++;
- accum_dead_cnt++;
-
- fd_vinyl_meta_remove_fast( ele0, ele_max, lock, lock_shift, line, line_cnt, ele_idx );
-
- ulong pair_cnt = vinyl->pair_cnt;
- FD_CRIT( pair_cnt, "corruption detected" );
- vinyl->pair_cnt = pair_cnt - 1UL;
-
- FD_COMPILER_MFENCE();
- req_err[ batch_idx ] = (schar)FD_VINYL_SUCCESS;
- FD_COMPILER_MFENCE();
- }
-
- comp_err = FD_VINYL_SUCCESS;
- break;
- }
diff --git a/src/vinyl/fd_vinyl_case_fetch.c b/src/vinyl/fd_vinyl_case_fetch.c
deleted file mode 100644
index 247c8a621a7..00000000000
--- a/src/vinyl/fd_vinyl_case_fetch.c
+++ /dev/null
@@ -1,116 +0,0 @@
- case FD_VINYL_REQ_TYPE_FETCH: {
- fd_vinyl_key_t const * req_key = MAP_REQ_GADDR( req->key_gaddr, fd_vinyl_key_t, batch_cnt );
-
- if( FD_UNLIKELY( (!!batch_cnt) & (!req_key) ) ) break;
-
- for( ulong batch_idx=0UL; batch_idxline_idx_lru, line, line_cnt, line_idx, FD_VINYL_LINE_EVICT_PRIO_MRU );
-
- continue;
- }
-
- /* At this point, pair key existsat seq_present but is not cached.
- Evict the least recently used evictable line to make room to
- cache this pair. Connect this line to meta element ele_idx,
- set the line's reference count to zero, bump the line's version
- and set the eviction priority to MRU. We don't modify any
- shared fields in meta element ele_idx so we can do the
- modification fast.
-
- We do this upfront to free data cache for the alloc if the LRU
- line is in use and to handle the same pair appearing multiple
- times in an acquire.
-
- The mechanics for fetch requests with redundant keys are
- similar to acquire-for-read requests. In this case, trailing
- redundant fetches will see the pair as cached (due to the first
- redundant fetch ... this one), set the eviction priority to MRU
- (again) and then continue. */
-
- ulong pair_ctl = ele0[ ele_idx ].phdr.ctl;
- ulong val_sz = (ulong)ele0[ ele_idx ].phdr.info.val_sz;
-
- FD_CRIT( fd_vinyl_bstream_ctl_type( pair_ctl )==FD_VINYL_BSTREAM_CTL_TYPE_PAIR, "corruption detected" );
- FD_CRIT( val_sz<=FD_VINYL_VAL_MAX, "corruption detected" );
-
- line_idx = fd_vinyl_line_evict_lru( &vinyl->line_idx_lru, line, line_cnt, ele0, ele_max, data );
-
- ulong line_ctl = line[ line_idx ].ctl;
-
- ulong ver = fd_vinyl_line_ctl_ver( line_ctl );
-
- line[ line_idx ].ele_idx = ele_idx; ele0[ ele_idx ].line_idx = line_idx;
- line[ line_idx ].ctl = fd_vinyl_line_ctl( ver+1UL, 0L );
-
- fd_vinyl_line_evict_prio( &vinyl->line_idx_lru, line, line_cnt, line_idx, FD_VINYL_LINE_EVICT_PRIO_MRU );
-
- /* Allocate an appropriately sized object to hold this pair,
- connect it to this line and start reading the encoded pair data
- into obj. */
-
- ulong szc = fd_vinyl_data_szc( val_sz );
-
- fd_vinyl_data_obj_t * obj = fd_vinyl_data_alloc( data, szc );
- if( FD_UNLIKELY( !obj ) ) FD_LOG_CRIT(( "increase data cache size" ));
-
- line[ line_idx ].obj = obj; obj->line_idx = line_idx;
-
- /* Start reading encoded pair data and defer the validation and
- decoding to later (and then in whatever order the I/O layer
- sees fit). */
-
- obj->rd_active = (short)1;
-
- int style = fd_vinyl_bstream_ctl_style( pair_ctl );
- ulong val_esz = fd_vinyl_bstream_ctl_sz ( pair_ctl );
-
- FD_CRIT( val_esz<=FD_VINYL_VAL_MAX, "corruption detected" );
- FD_CRIT( (style!=FD_VINYL_BSTREAM_CTL_STYLE_RAW) | (val_sz==val_esz), "corruption detected" );
-
- fd_vinyl_data_obj_t * cobj;
-
- if( FD_LIKELY( style==FD_VINYL_BSTREAM_CTL_STYLE_RAW ) ) cobj = obj;
- else {
- cobj = fd_vinyl_data_alloc( data, fd_vinyl_data_szc( val_esz ) );
- if( FD_UNLIKELY( !cobj ) ) FD_LOG_CRIT(( "increase data cache size" ));
- }
-
- cobj->rd->ctx = (ulong)obj;
- cobj->rd->seq = ele0[ ele_idx ].seq;
- cobj->rd->dst = fd_vinyl_data_obj_phdr( cobj );
- cobj->rd->sz = fd_vinyl_bstream_pair_sz( val_esz );
-
- cobj->rd_err = (schar *)cobj->unused;
-
- fd_vinyl_io_read( io, cobj->rd );
- read_cnt++;
- }
-
- break;
- }
diff --git a/src/vinyl/fd_vinyl_case_flush.c b/src/vinyl/fd_vinyl_case_flush.c
deleted file mode 100644
index 75512879ff6..00000000000
--- a/src/vinyl/fd_vinyl_case_flush.c
+++ /dev/null
@@ -1,68 +0,0 @@
- case FD_VINYL_REQ_TYPE_FLUSH: {
-
- fd_vinyl_key_t const * req_key = MAP_REQ_GADDR( req->key_gaddr, fd_vinyl_key_t, batch_cnt );
-
- if( FD_UNLIKELY( (!!batch_cnt) & (!req_key) ) ) break; /* flushes don't generate completions */
-
- for( ulong batch_idx=0UL; batch_idxline_idx==line_idx, "corruption detected" );
- FD_CRIT ( !obj->rd_active, "corruption detected" );
-
- fd_vinyl_line_evict_prio( &vinyl->line_idx_lru, line, line_cnt, line_idx, FD_VINYL_LINE_EVICT_PRIO_LRU );
-
- ulong ctl = line[ line_idx ].ctl;
-
- ulong ver = fd_vinyl_line_ctl_ver( ctl );
- long ref = fd_vinyl_line_ctl_ref( ctl );
-
- if( FD_UNLIKELY( ref ) ) continue;
-
- /* At this point, pair key is cached, not acquired and the line
- is at LRU position. Flush the cached data. We don't modify
- any shared fields of meta element ele_idx so we can do this
- fast. */
-
- line[ line_idx ].obj = NULL;
- line[ line_idx ].ele_idx = ULONG_MAX;
- line[ line_idx ].ctl = fd_vinyl_line_ctl( ver+1UL, 0UL );
- /* evict prio updated above */
-
- ele0[ ele_idx ].line_idx = ULONG_MAX;
-
- fd_vinyl_data_free( data, obj );
-
- }
-
- break;
- }
diff --git a/src/vinyl/fd_vinyl_case_move.c b/src/vinyl/fd_vinyl_case_move.c
deleted file mode 100644
index 142a8be51ae..00000000000
--- a/src/vinyl/fd_vinyl_case_move.c
+++ /dev/null
@@ -1,326 +0,0 @@
- case FD_VINYL_REQ_TYPE_MOVE: {
-
- fd_vinyl_key_t const * req_key_src = MAP_REQ_GADDR( req->key_gaddr, fd_vinyl_key_t, batch_cnt );
- fd_vinyl_key_t const * req_key_dst = MAP_REQ_GADDR( req->val_gaddr_gaddr, fd_vinyl_key_t, batch_cnt );
- schar * req_err = MAP_REQ_GADDR( req->err_gaddr, schar, batch_cnt );
-
- if( FD_UNLIKELY( (!!batch_cnt) & ((!req_key_src) | (!req_key_dst) | (!req_err)) ) ) {
- comp_err = FD_VINYL_ERR_INVAL;
- break;
- }
-
- for( ulong batch_idx=0UL; batch_idxline_idx==line_idx_src, "corruption detected" );
-
- phdr_src = fd_vinyl_data_obj_phdr( obj_src );
-
- line[ line_idx_src ].ctl = fd_vinyl_line_ctl( ver_src+1UL, 0L );
-
- } else {
-
- FD_CRIT( line_idx_src==ULONG_MAX, "corruption detected" );
-
- /* Read the encoded pair from the bstream */
-
- ulong ctl = ele0[ ele_idx_src ].phdr.ctl;
-
- int type = fd_vinyl_bstream_ctl_type ( ctl );
- int style = fd_vinyl_bstream_ctl_style( ctl );
- ulong val_esz = fd_vinyl_bstream_ctl_sz ( ctl );
-
- FD_CRIT( type==FD_VINYL_BSTREAM_CTL_TYPE_PAIR, "corruption detected" );
- FD_CRIT( (style==FD_VINYL_BSTREAM_CTL_STYLE_RAW) | (style==FD_VINYL_BSTREAM_CTL_STYLE_LZ4), "corruption detected" );
- FD_CRIT( val_esz<=FD_VINYL_VAL_MAX, "corruption detected" );
-
- fd_vinyl_data_obj_t * cobj = fd_vinyl_data_alloc( data, fd_vinyl_data_szc( val_esz ) );
- if( FD_UNLIKELY( !cobj ) ) FD_LOG_CRIT(( "increase data cache size" ));
-
- fd_vinyl_bstream_phdr_t * cphdr = fd_vinyl_data_obj_phdr( cobj );
- ulong cpair_sz = fd_vinyl_bstream_pair_sz( val_esz );
-
- fd_vinyl_io_read_imm( io, seq_src, cphdr, cpair_sz );
- /* not an async read (so no read_cnt increment) */
-
- /* Verify data integrity */
-
- FD_ALERT( !fd_vinyl_bstream_pair_test( io_seed, seq_src, (fd_vinyl_bstream_block_t *)cphdr, cpair_sz ),
- "corruption detected" );
-
- /* Decode the pair */
-
- if( FD_LIKELY( style==FD_VINYL_BSTREAM_CTL_STYLE_RAW ) ) {
-
- FD_CRIT( val_esz==val_sz, "corruption detected" );
-
- obj_src = cobj;
- phdr_src = cphdr;
-
- } else {
-
- obj_src = fd_vinyl_data_alloc( data, fd_vinyl_data_szc( val_sz ) );
- if( FD_UNLIKELY( !obj_src ) ) FD_LOG_CRIT(( "increase data cache size" ));
-
- char const * cval = (char const *)fd_vinyl_data_obj_val( cobj );
- char * val = (char *) fd_vinyl_data_obj_val( obj_src );
- if( FD_UNLIKELY( (ulong)LZ4_decompress_safe( cval, val, (int)val_esz, (int)val_sz )!=val_sz ) )
- FD_LOG_CRIT(( "LZ4_decompress_safe failed" ));
-
- phdr_src = fd_vinyl_data_obj_phdr( obj_src );
-
- phdr_src->ctl = fd_vinyl_bstream_ctl( FD_VINYL_BSTREAM_CTL_TYPE_PAIR, FD_VINYL_BSTREAM_CTL_STYLE_RAW, val_sz );
- phdr_src->key = cphdr->key;
- phdr_src->info = cphdr->info;
-
- fd_vinyl_data_free( data, cobj );
-
- }
-
- line_idx_src = fd_vinyl_line_evict_lru( &vinyl->line_idx_lru, line, line_cnt, ele0, ele_max, data );
-
- ulong line_ctl_src = line[ line_idx_src ].ctl;
-
- ulong ver_src = fd_vinyl_line_ctl_ver( line_ctl_src );
-
- line[ line_idx_src ].obj = obj_src; obj_src->line_idx = line_idx_src; obj_src->rd_active = (short)0;
- line[ line_idx_src ].ele_idx = ele_idx_src; ele0[ ele_idx_src ].line_idx = line_idx_src;
- line[ line_idx_src ].ctl = fd_vinyl_line_ctl( ver_src+1UL, 0L );
-
- fd_vinyl_line_evict_prio( &vinyl->line_idx_lru, line, line_cnt, line_idx_src, FD_VINYL_LINE_EVICT_PRIO_LRU );
-
- if( line_idx_src==line_idx_dst ) line_idx_dst = ULONG_MAX; /* Handle evict_lru evicting the dst */
-
- }
-
- /* At this point, pair key_src is cached but not acquired and pair
- key_dst is not acquired. We are clear to move. If pair
- key_dst exists, we are replacing pair key_dst with pair
- key_src. In this case, we remove pair key_dst from cache and
- remove pair key_dst from the meta. This remove might move the
- location of pair key_src's meta element. So we reload if
- necessary. */
-
- FD_CRIT( fd_vinyl_bstream_ctl_type( phdr_src->ctl )==fd_vinyl_bstream_ctl_type( ele0[ ele_idx_src ].phdr.ctl ),
- "corruption detected" );
- FD_CRIT( fd_vinyl_key_eq( &phdr_src->key, &ele0[ ele_idx_src ].phdr.key ), "corruption detected" );
- FD_CRIT( !memcmp( &phdr_src->info, &ele0[ ele_idx_src ].phdr.info, sizeof(fd_vinyl_info_t) ), "corruption detected" );
-
- accum_garbage_cnt += 2UL; /* old src and new move block */
- accum_garbage_sz += fd_vinyl_bstream_pair_sz( fd_vinyl_bstream_ctl_sz( ele0[ ele_idx_src ].phdr.ctl ) ) +
- FD_VINYL_BSTREAM_BLOCK_SZ;
-
- if( FD_UNLIKELY( !err_dst ) ) {
-
- accum_garbage_cnt++; /* old dst */
- accum_garbage_sz += fd_vinyl_bstream_pair_sz( fd_vinyl_bstream_ctl_sz( ele0[ ele_idx_dst ].phdr.ctl ) );
-
- if( FD_UNLIKELY( line_idx_dst < line_cnt ) ) {
-
- FD_CRIT( line[ line_idx_dst ].ele_idx==ele_idx_dst, "corruption detected" );
-
- fd_vinyl_data_obj_t * obj_dst = line[ line_idx_dst ].obj;
-
- FD_ALERT( fd_vinyl_data_is_valid_obj( obj_dst, vol, vol_cnt ), "corruption detected" );
- FD_CRIT ( obj_dst->line_idx==line_idx_dst, "corruption detected" );
-
- ulong line_ctl_dst = line[ line_idx_dst ].ctl;
-
- ulong ver_dst = fd_vinyl_line_ctl_ver( line_ctl_dst );
-
- fd_vinyl_data_free( data, obj_dst );
-
- line[ line_idx_dst ].obj = NULL;
- line[ line_idx_dst ].ele_idx = ULONG_MAX; // ele0[ ele_idx_dst ].line_idx = ULONG_MAX; /* Technically not necessary given below */
- line[ line_idx_dst ].ctl = fd_vinyl_line_ctl( ver_dst+1UL, 0L );
-
- fd_vinyl_line_evict_prio( &vinyl->line_idx_lru, line, line_cnt, line_idx_dst, FD_VINYL_LINE_EVICT_PRIO_LRU );
- }
-
- fd_vinyl_meta_remove_fast( ele0, ele_max, lock, lock_shift, line, line_cnt, ele_idx_dst ); /* See note below about atomicity for concurrent meta readers */
-
- ulong pair_cnt = vinyl->pair_cnt;
- FD_CRIT( pair_cnt, "corruption detected" );
- vinyl->pair_cnt = pair_cnt - 1UL;
-
- err_src = fd_vinyl_meta_query_fast( ele0, ele_max, key_src, memo_src, &_ele_idx_src );
- ele_idx_src = _ele_idx_src; /* In [0,ele_max) */
- FD_CRIT( !err_src, "corruption detected" );
- /* Note: could test other fields post move too */
-
- }
-
- /* At this point, pair key_src is cached but not acquired and pair
- key_dst is not cached and not in the meta (the move block that
- will official erase if it already exists will be written
- below). Update the cached phdr to reflect the move. Remove
- the meta entry for pair key_src and insert a meta entry for
- pair key_dst.
-
- Note: this means from the point of view of concurrent meta
- queries, there will be a brief time interval when pair key_src
- and pair key_dst are both reported as not existing.
-
- As an alternative with more overhead we could instead insert
- the meta element for key_dst, remove the meta element for
- key_src and requery meta for key_dst (as the remove could move
- it). In this case, there will be a gap where both key_src and
- key_dst are both reported as available (and they will point to
- the same cache entry during this interval).
-
- With even more complexity and overhead, we could eliminate the
- gap and overhead and make this atomic from the point of view of
- concurrent meta readers. (Would have compute a lock set that
- cover the target key_dst insert location and the key_src probe
- sequence assuming key_dst has been inserted, lock the locks, do
- the insert, do the remove without any locking behavior, free
- the lock set and then requery where key_dst ended up.) Also
- note that, if we are replacing pair key_dst, at this point,
- pair key_dst is already reported to concurrent meta readers as
- not existing. Would need to extend this to the above.
-
- But it isn't clear that concurrent meta readers care at all.
- So we go with the fast simple method below (it still is atomic
- from the point of view of clients and the bstream). */
-
- ulong pair_sz = fd_vinyl_bstream_pair_sz( val_sz );
- ulong seq_move = fd_vinyl_io_hint( io, FD_VINYL_BSTREAM_BLOCK_SZ + pair_sz );
- ulong seq_dst = seq_move + FD_VINYL_BSTREAM_BLOCK_SZ;
-
- //phdr_src->ctl = ... already init
- phdr_src->key = *key_dst;
- //phdr_src->info = ... already init
-
- fd_vinyl_meta_remove_fast( ele0, ele_max, lock, lock_shift, line, line_cnt, ele_idx_src );
-
- err_dst = fd_vinyl_meta_query_fast( ele0, ele_max, key_dst, memo_dst, &_ele_idx_dst );
- ele_idx_dst = _ele_idx_dst; /* In [0,ele_max) */
-
- FD_CRIT( err_dst==FD_VINYL_ERR_KEY, "corruption detected" );
-
- ele0[ ele_idx_dst ].memo = memo_dst;
- //ele0[ ele_idx_dst ].phdr.ctl = ... init below for concurrent safe insert
- ele0[ ele_idx_dst ].phdr.key = phdr_src->key;
- ele0[ ele_idx_dst ].phdr.info = phdr_src->info;
- ele0[ ele_idx_dst ].line_idx = line_idx_src;
- ele0[ ele_idx_dst ].seq = seq_dst;
-
- FD_COMPILER_MFENCE();
- ele0[ ele_idx_dst ].phdr.ctl = phdr_src->ctl;
- FD_COMPILER_MFENCE();
-
- line[ line_idx_src ].ele_idx = ele_idx_dst;
-
- fd_vinyl_io_append_move( io, phdr_src, key_dst, NULL, 0UL );
- append_cnt++;
- accum_move_cnt++;
-
- fd_vinyl_bstream_pair_hash( io_seed, (fd_vinyl_bstream_block_t *)phdr_src );
-
- ulong seq = fd_vinyl_io_append( io, phdr_src, pair_sz );
- append_cnt++;
- FD_CRIT( fd_vinyl_seq_eq( seq, seq_dst ), "unexpected append location" );
-
- DONE( FD_VINYL_SUCCESS );
-
- next_move: /* silly language restriction */;
-
-# undef DONE
-
- }
-
- comp_err = FD_VINYL_SUCCESS;
- break;
- }
diff --git a/src/vinyl/fd_vinyl_case_release.c b/src/vinyl/fd_vinyl_case_release.c
deleted file mode 100644
index 12c5ae4482f..00000000000
--- a/src/vinyl/fd_vinyl_case_release.c
+++ /dev/null
@@ -1,352 +0,0 @@
- case FD_VINYL_REQ_TYPE_RELEASE: {
-
- ulong req_flags = (ulong)req->flags;
- fd_vinyl_key_t const * req_key = MAP_REQ_GADDR( req->key_gaddr, fd_vinyl_key_t const, batch_cnt );
- ulong * req_val_gaddr = MAP_REQ_GADDR( req->val_gaddr_gaddr, ulong, batch_cnt );
- schar * req_err = MAP_REQ_GADDR( req->err_gaddr, schar, batch_cnt );
-
- int req_flag_modify = fd_vinyl_req_flag_modify( req_flags );
- int req_flag_ignore = fd_vinyl_req_flag_ignore( req_flags );
- int req_flag_erase = fd_vinyl_req_flag_erase ( req_flags );
- int req_flag_by_key = fd_vinyl_req_flag_by_key( req_flags );
- int req_evict_prio = fd_vinyl_req_evict_prio ( req_flags );
-
- if( FD_UNLIKELY( (!!batch_cnt) & ( ((!req_key ) & req_flag_by_key ) |
- ((!req_val_gaddr) & (!req_flag_by_key)) |
- ( !req_err ) ) ) ) {
- comp_err = FD_VINYL_ERR_INVAL;
- break;
- }
-
- for( ulong batch_idx=0UL; batch_idxrd_active ) ) DONE( FD_VINYL_ERR_INVAL );
-
- line_idx = obj->line_idx;
- if( FD_UNLIKELY( line_idx>=line_cnt ) || FD_UNLIKELY( obj!=line[ line_idx ].obj ) ) DONE( FD_VINYL_ERR_INVAL );
-
- ele_idx = line[ line_idx ].ele_idx;
- if( FD_UNLIKELY( ele_idx>=ele_max ) || FD_UNLIKELY( ele0[ ele_idx ].line_idx!=line_idx ) ) DONE( FD_VINYL_ERR_INVAL );
- /* FIXME: MAKE SURE ELE0[ ELE_IDX ] IS IN USE FOR DATA INTEGRITY! */
-
- ulong ctl = line[ line_idx ].ctl;
-
- ver = fd_vinyl_line_ctl_ver( ctl );
- ref = fd_vinyl_line_ctl_ref( ctl );
-
- if( FD_UNLIKELY( !ref ) ) DONE( FD_VINYL_ERR_INVAL ); /* Pair key exists and is cached ... but not acquired */
-
- } else { /* Release by key */
-
- fd_vinyl_key_t const * key = req_key + batch_idx;
-
- ulong memo = fd_vinyl_key_memo( meta_seed, key ); /* This can be slow which is why releasing by val_gaddr is preferred */
-
- ulong _ele_idx; /* avoid pointer escape */
- int err = fd_vinyl_meta_query_fast( ele0, ele_max, key, memo, &_ele_idx );
- ele_idx = _ele_idx; /* in [0,ele_max) */
-
- if( FD_UNLIKELY( err ) ) DONE( FD_VINYL_ERR_INVAL ); /* Pair key does not exist ... can't have been acquired */
-
- line_idx = ele0[ ele_idx ].line_idx;
-
- if( FD_UNLIKELY( line_idx>=line_cnt ) ) { /* Pair key exists but is not cached ... can't have been acquired */
- FD_CRIT( line_idx==ULONG_MAX, "corruption detected" );
- DONE( FD_VINYL_ERR_INVAL );
- }
-
- FD_CRIT( ele_idx==line[ line_idx ].ele_idx, "corruption detected" );
-
- obj = line[ line_idx ].obj;
-
- FD_ALERT( fd_vinyl_data_is_valid_obj( obj, vol, vol_cnt ), "corruption detected" );
- FD_CRIT ( obj->line_idx==line_idx, "corruption detected" );
- FD_CRIT ( !obj->rd_active, "corruption detected" );
-
- ulong ctl = line[ line_idx ].ctl;
-
- ver = fd_vinyl_line_ctl_ver( ctl );
- ref = fd_vinyl_line_ctl_ref( ctl );
-
- if( FD_UNLIKELY( !ref ) ) DONE( FD_VINYL_ERR_INVAL ); /* Pair key exists and is cached ... but not acquired */
-
- }
-
- /* At this point, we are releasing an acquire of the object obj,
- cached at line line_idx with metadata at ele_idx. */
-
- fd_vinyl_bstream_phdr_t * phdr = fd_vinyl_data_obj_phdr( obj );
-
- if( FD_LIKELY( ref>0L ) ) {
-
- /* At this point, we are releasing an acquire for read. If
- the client indicated they modified pair key, we don't have
- data integrity anymore and we CRIT. Otherwise, we update
- line eviction priority and ref count to do the release. */
-
- if( FD_UNLIKELY( req_flag_modify ) ) FD_LOG_CRIT(( "client modified read only acquire" ));
-
- FD_CRIT( phdr->ctl==fd_vinyl_bstream_ctl( FD_VINYL_BSTREAM_CTL_TYPE_PAIR,
- FD_VINYL_BSTREAM_CTL_STYLE_RAW,
- (ulong)ele0[ ele_idx ].phdr.info.val_sz ), "corruption detected" );
- FD_CRIT( fd_vinyl_key_eq( &phdr->key, &ele0[ ele_idx ].phdr.key ), "corruption detected" );
- FD_CRIT( !memcmp( &phdr->info, &ele0[ ele_idx ].phdr.info, sizeof(fd_vinyl_info_t) ), "corruption detected" );
-
- fd_vinyl_line_evict_prio( &vinyl->line_idx_lru, line, line_cnt, line_idx, req_evict_prio );
-
- line[ line_idx ].ctl = fd_vinyl_line_ctl( ver, ref-1L ); /* don't bump ver */
-
- DONE( FD_VINYL_SUCCESS );
- }
-
- /* At this point, we are releasing an acquire for modify */
-
- ulong phdr_ctl = phdr->ctl;
-
- int modifying_existing = (phdr_ctl!=ULONG_MAX);
-
- if( FD_LIKELY( req_flag_modify & (!req_flag_erase) ) ) {
-
- /* At this point, we are either finishing up modifying an
- existing pair (modifying_existing 1) or finishing up creating
- a new pair (modifying_existing 0). Cache the object in the
- smallest size class that supports it. Note that the client
- could have modified info so we only validate ctl and key
- (FIXME: consider validating memo too?). */
-
- FD_CRIT( (!modifying_existing) |
- (phdr_ctl==fd_vinyl_bstream_ctl( FD_VINYL_BSTREAM_CTL_TYPE_PAIR,
- FD_VINYL_BSTREAM_CTL_STYLE_RAW,
- (ulong)ele0[ ele_idx ].phdr.info.val_sz )), "corruption detected" );
- FD_CRIT( fd_vinyl_key_eq( &phdr->key, &ele0[ ele_idx ].phdr.key ), "corruption detected" );
-
- ulong val_sz_after = (ulong)phdr->info.val_sz;
-
- if( FD_UNLIKELY( val_sz_after > fd_vinyl_data_obj_val_max( obj ) ) ) FD_LOG_CRIT(( "client overran memory" ));
-
- ulong szc_before = (ulong)obj->szc;
- ulong szc_after = fd_vinyl_data_szc( val_sz_after );
-
- if( FD_UNLIKELY( szc_before!=szc_after ) ) {
-
- FD_CRIT( szc_afterctl = fd_vinyl_bstream_ctl( FD_VINYL_BSTREAM_CTL_TYPE_PAIR, FD_VINYL_BSTREAM_CTL_STYLE_RAW, val_sz_after );
- /*phdr->key already init */
- /*phdr->info already init */
-
- int style_after;
- ulong val_esz_after;
- ulong seq_after = fd_vinyl_io_append_pair_inplace( io, vinyl->style, phdr, &style_after, &val_esz_after );
- append_cnt++;
-
- /* Update the line and meta to match. Note that setting meta
- element ele_idx phdr.ctl to something other than ULONG_MAX
- marks a pair that was being created as no longer being
- created. For a pair that already existed, we also need to
- update phdr.ctl to reflect that we might be storing this in
- the stream in a different format than it was stored in
- bstream before. Since we are changing shared fields of meta
- element ele_idx, we need to use prepare / publish semantics. */
-
- line[ line_idx ].obj = obj; obj->line_idx = line_idx; obj->rd_active = (short)0;
- //line[ line_idx ].ele_idx ... already init
- line[ line_idx ].ctl = fd_vinyl_line_ctl( ver+1L, 0L ); /* bump ver */
-
- fd_vinyl_line_evict_prio( &vinyl->line_idx_lru, line, line_cnt, line_idx, req_evict_prio );
-
- fd_vinyl_meta_prepare_fast( lock, lock_shift, ele_idx );
-
- //ele0[ ele_idx ].memo = already init
- ele0[ ele_idx ].phdr.ctl = fd_vinyl_bstream_ctl( FD_VINYL_BSTREAM_CTL_TYPE_PAIR, style_after, val_esz_after );
- //ele0[ ele_idx ].phdr.key = already init
- ele0[ ele_idx ].phdr.info = phdr->info;
- ele0[ ele_idx ].seq = seq_after;
- //ele0[ ele_idx ].line_idx = already init
-
- fd_vinyl_meta_publish_fast( lock, lock_shift, ele_idx );
-
- DONE( FD_VINYL_SUCCESS );
-
- }
-
- /* At this point, we are either canceling a modification (modify
- 0, erase d/c) or the modification is to erase the pair (modify
- 1, erase 1). If we are canceling the modification of an
- existing pair and the client indicated the cached pair info and
- cached pair val are still valid, (i.e. release-cancel of an
- acquire-for-modify of an existing pair), we revert the line
- state and adjust the line evict priority. (This code path can
- be omitted if we don't trust the clients to report correctly.
- We do test at least the client is correctly reporting the info
- is not modified.) Note that we might have put this in a larged
- sized obj when we acquired it for modify. So we also move the
- object to the tightest location. */
-
- if( FD_LIKELY( modifying_existing & (!req_flag_modify) & (!req_flag_ignore) ) ) {
-
- /* FIXME: consider allowing the client to always clobber the
- pair info and just restore info from the meta cache? */
-
- if( FD_UNLIKELY( !( (phdr->ctl==fd_vinyl_bstream_ctl( FD_VINYL_BSTREAM_CTL_TYPE_PAIR,
- FD_VINYL_BSTREAM_CTL_STYLE_RAW,
- (ulong)ele0[ ele_idx ].phdr.info.val_sz ) ) &
- (fd_vinyl_key_eq( &phdr->key, &ele0[ ele_idx ].phdr.key ) ) &
- (!memcmp( &phdr->info, &ele0[ ele_idx ].phdr.info, sizeof(fd_vinyl_info_t) )) ) ) )
- FD_LOG_CRIT(( "client clobbered pair info" ));
-
- ulong val_sz_before = (ulong)phdr->info.val_sz;
-
- ulong szc_after = (ulong)obj->szc;
- ulong szc_before = fd_vinyl_data_szc( val_sz_before );
-
- if( FD_UNLIKELY( szc_before!=szc_after ) ) {
-
- FD_CRIT( szc_beforeline_idx = line_idx; obj_before->rd_active = (short)0;
-
- }
-
- line[ line_idx ].ctl = fd_vinyl_line_ctl( ver-1UL, 0L ); /* revert ver */
-
- fd_vinyl_line_evict_prio( &vinyl->line_idx_lru, line, line_cnt, line_idx, req_evict_prio );
-
- DONE( FD_VINYL_SUCCESS );
-
- }
-
- /* At this point, we are canceling a modification of an existing
- pair that no longer has valid cached pair info or cached pair
- val, erasing an existing pair, canceling the creation of a new
- pair or erasing a pair in the process of being created (which
- we treat the same as cancelling the creation).
-
- Since there was nothing cached originally (canceling / erasing
- a pair being created), the cached data is no longer valid
- (cancel with ignore of an existing pair) or the the cached data
- is no longer needed (erase of an existing pair), we free the
- data obj, mark the line as empty, move the line to LRU
- position. */
-
- /* FIXME: INTEGRITY CHECKS ON PHDR HERE? (TRICKY AS WE'D HAVE TO
- MAP OUT EXACTLY WHICH FIELDS CAN BE TRUSTED AT THIS POINT AND
- IT ISN'T OBVIOUS IT MATTERS) */
-
- fd_vinyl_data_free( data, obj );
-
- line[ line_idx ].obj = NULL;
- line[ line_idx ].ele_idx = ULONG_MAX; ele0[ ele_idx ].line_idx = ULONG_MAX;
- line[ line_idx ].ctl = fd_vinyl_line_ctl( ver+1UL, 0L ); /* bump ver */
-
- fd_vinyl_line_evict_prio( &vinyl->line_idx_lru, line, line_cnt, line_idx, FD_VINYL_LINE_EVICT_PRIO_LRU );
-
- /* If we are erasing an existing pair, append a dead block to
- the bstream. This generates two pieces of bstream garbage (the
- old pair and the dead block itself). Likewise, if we are
- erasing an existing pair or cancelling / erasing a pair
- creation, remove the element from the meta. Note that
- req_flag_modify==1 implies req_flag_erase==1 but not vice versa
- at this point. */
-
- if( FD_LIKELY( req_flag_modify & modifying_existing ) ) {
-
- ulong val_esz_before = fd_vinyl_bstream_ctl_sz( ele0[ ele_idx ].phdr.ctl );
-
- accum_garbage_cnt += 2UL;
- accum_garbage_sz += fd_vinyl_bstream_pair_sz( val_esz_before ) + FD_VINYL_BSTREAM_BLOCK_SZ;
-
- fd_vinyl_io_append_dead( io, &ele0[ ele_idx ].phdr, NULL, 0UL );
- append_cnt++;
- accum_dead_cnt++;
-
- }
-
- if( FD_LIKELY( req_flag_modify | (!modifying_existing) ) ) {
- fd_vinyl_meta_remove_fast( ele0, ele_max, lock, lock_shift, line, line_cnt, ele_idx );
-
- ulong pair_cnt = vinyl->pair_cnt;
- FD_CRIT( (0ULpair_cnt = pair_cnt - 1UL;
- }
-
- DONE( FD_VINYL_SUCCESS );
-
- next_release: /* silly language restriction */;
-
-# undef DONE
-
- } /* for batch_idx */
-
- comp_err = FD_VINYL_SUCCESS;
- break;
- }
diff --git a/src/vinyl/fd_vinyl_case_test.c b/src/vinyl/fd_vinyl_case_test.c
deleted file mode 100644
index 3e03319aeea..00000000000
--- a/src/vinyl/fd_vinyl_case_test.c
+++ /dev/null
@@ -1,39 +0,0 @@
- case FD_VINYL_REQ_TYPE_TEST: {
-
- ulong const * req_val_gaddr = MAP_REQ_GADDR( req->val_gaddr_gaddr, ulong, 2UL*batch_cnt );
- schar * req_err = MAP_REQ_GADDR( req->err_gaddr, schar, batch_cnt );
-
- if( FD_UNLIKELY( (!!batch_cnt) & ((!req_val_gaddr) | (!req_err)) ) ) {
- comp_err = FD_VINYL_ERR_INVAL;
- break;
- }
-
- ulong const * req_try = req_val_gaddr + batch_cnt;
-
- for( ulong batch_idx=0UL; batch_idx> 32;
- ulong line_idx = try & FD_VINYL_LINE_MAX;
-
- int err = FD_UNLIKELY( line_idx>=line_cnt ) ? FD_VINYL_ERR_INVAL
- : FD_UNLIKELY( fd_vinyl_line_ctl_ver( line[ line_idx ].ctl )!=ver ) ? FD_VINYL_ERR_CORRUPT
- : FD_VINYL_SUCCESS;
-
- FD_COMPILER_MFENCE();
- req_err[ batch_idx ] = (schar)err;
- FD_COMPILER_MFENCE();
-
- fail_cnt += (ulong)!!err;
-
- }
-
- comp_err = FD_VINYL_SUCCESS;
- break;
- }
diff --git a/src/vinyl/fd_vinyl_case_try.c b/src/vinyl/fd_vinyl_case_try.c
deleted file mode 100644
index 63ef050b6c7..00000000000
--- a/src/vinyl/fd_vinyl_case_try.c
+++ /dev/null
@@ -1,180 +0,0 @@
- case FD_VINYL_REQ_TYPE_TRY: {
-
- FD_STATIC_ASSERT( FD_VINYL_LINE_VER_MAX==((1UL<<32)-1UL), update_impl_for_ver_max );
-
- ulong req_flags = (ulong)req->flags;
- fd_vinyl_key_t const * req_key = MAP_REQ_GADDR( req->key_gaddr, fd_vinyl_key_t, batch_cnt );
- ulong * req_val_gaddr = MAP_REQ_GADDR( req->val_gaddr_gaddr, ulong, 2UL*batch_cnt );
- schar * req_err = MAP_REQ_GADDR( req->err_gaddr, schar, batch_cnt );
-
- int req_evict_prio = fd_vinyl_req_evict_prio( req_flags );
-
- if( FD_UNLIKELY( (!!batch_cnt) & ((!req_key) | (!req_val_gaddr) | (!req_err)) ) ) {
- comp_err = FD_VINYL_ERR_INVAL;
- break;
- }
-
- ulong * req_try = req_val_gaddr + batch_cnt;
-
- for( ulong batch_idx=0UL; batch_idxline_idx==line_idx, "corruption detected" );
-
- ulong line_ctl = line[ line_idx ].ctl;
-
- ulong ver = fd_vinyl_line_ctl_ver( line_ctl );
- long ref = fd_vinyl_line_ctl_ref( line_ctl );
-
- if( FD_UNLIKELY( ref<0L ) ) DONE( FD_VINYL_ERR_AGAIN, ULONG_MAX );
-
- if( FD_LIKELY( !obj->rd_active ) ) {
- fd_vinyl_bstream_phdr_t * phdr = fd_vinyl_data_obj_phdr( obj );
-
- FD_CRIT( fd_vinyl_data_obj_val_max( obj ) >= val_sz, "corruption detected" );
- FD_CRIT( phdr->ctl==fd_vinyl_bstream_ctl( FD_VINYL_BSTREAM_CTL_TYPE_PAIR,
- FD_VINYL_BSTREAM_CTL_STYLE_RAW, val_sz ), "corruption detected" );
- FD_CRIT( fd_vinyl_key_eq( &phdr->key, key ), "corruption detected" );
- FD_CRIT( !memcmp( &phdr->info, &ele0[ ele_idx ].phdr.info, sizeof(fd_vinyl_info_t) ), "corruption detected" );
- }
-
- fd_vinyl_line_evict_prio( &vinyl->line_idx_lru, line, line_cnt, line_idx, req_evict_prio );
-
- req_val_gaddr[ batch_idx ] = (ulong)fd_vinyl_data_obj_val( obj ) - data_laddr0;
-
- DONE( FD_VINYL_SUCCESS, (ver<<32) | line_idx );
-
- }
-
- /* At this point, pair key exists but is not cached. Evict the
- least recently used evictable line to make room to cache this
- pair. Connect this line to meta element ele_idx, set the
- line's reference count to zero, bump the line's version and set
- the eviction priority as desired. We don't modify any shared
- fields in meta element ele_idx so we can do the modification
- fast.
-
- We do this upfront to free data cache for the alloc if the LRU
- line is in use and to handle the same pair appearing multiple
- times in an acquire.
-
- The mechanics for try requests with redundant keys are the same
- as acquire-for-read requests. */
-
- line_idx = fd_vinyl_line_evict_lru( &vinyl->line_idx_lru, line, line_cnt, ele0, ele_max, data );
-
- ulong line_ctl = line[ line_idx ].ctl;
-
- ulong ver = fd_vinyl_line_ctl_ver( line_ctl );
-
- line[ line_idx ].ele_idx = ele_idx; ele0[ ele_idx ].line_idx = line_idx;
- line[ line_idx ].ctl = fd_vinyl_line_ctl( ver+1UL, 0L );
-
- fd_vinyl_line_evict_prio( &vinyl->line_idx_lru, line, line_cnt, line_idx, req_evict_prio );
-
- /* Allocate an appropriately sized object to hold this pair,
- connect it to this line and report the location to the client. */
-
- ulong szc = fd_vinyl_data_szc( val_sz );
-
- fd_vinyl_data_obj_t * obj = fd_vinyl_data_alloc( data, szc );
- if( FD_UNLIKELY( !obj ) ) FD_LOG_CRIT(( "increase data cache size" ));
-
- line[ line_idx ].obj = obj; obj->line_idx = line_idx;
-
- void * val = fd_vinyl_data_obj_val( obj );
-
- req_val_gaddr[ batch_idx ] = (ulong)val - data_laddr0;
- req_try [ batch_idx ] = ((ver+1UL)<<32) | line_idx;
-
- /* Start reading encoded pair data and defer validation and
- decoding to later (and then in whatever order the I/O layer
- sees fit). */
-
- obj->rd_active = (short)1;
-
- int style = fd_vinyl_bstream_ctl_style( pair_ctl );
- ulong val_esz = fd_vinyl_bstream_ctl_sz ( pair_ctl );
-
- FD_CRIT( val_esz<=FD_VINYL_VAL_MAX, "corruption detected" );
- FD_CRIT( (style!=FD_VINYL_BSTREAM_CTL_STYLE_RAW) | (val_sz==val_esz), "corruption detected" );
-
- fd_vinyl_data_obj_t * cobj;
-
- if( FD_LIKELY( style==FD_VINYL_BSTREAM_CTL_STYLE_RAW ) ) cobj = obj;
- else {
- cobj = fd_vinyl_data_alloc( data, fd_vinyl_data_szc( val_esz ) );
- if( FD_UNLIKELY( !cobj ) ) FD_LOG_CRIT(( "increase data cache size" ));
- }
-
- cobj->rd->ctx = (ulong)obj;
- cobj->rd->seq = ele0[ ele_idx ].seq;
- cobj->rd->dst = fd_vinyl_data_obj_phdr( cobj );
- cobj->rd->sz = fd_vinyl_bstream_pair_sz( val_esz );
-
- cobj->rd_err = req_err + batch_idx;
-
- fd_vinyl_io_read( io, cobj->rd );
- read_cnt++;
-
- next_try: /* silly language restriction */;
-
-# undef DONE
-
- } /* for batch_idx */
-
- comp_err = FD_VINYL_SUCCESS;
- break;
- }
diff --git a/src/vinyl/fd_vinyl_compact.c b/src/vinyl/fd_vinyl_compact.c
deleted file mode 100644
index 8f79695a672..00000000000
--- a/src/vinyl/fd_vinyl_compact.c
+++ /dev/null
@@ -1,377 +0,0 @@
-#include
-#include "fd_vinyl.h"
-
-void
-fd_vinyl_compact( fd_vinyl_t * vinyl,
- ulong compact_max ) {
-
- fd_vinyl_io_t * io = vinyl->io;
- ulong gc_thresh = vinyl->gc_thresh;
- int gc_eager = vinyl->gc_eager;
- int style = vinyl->style;
-
- ulong io_seed = fd_vinyl_io_seed ( io ); (void)io_seed;
- ulong seq_past = fd_vinyl_io_seq_past ( io );
- ulong seq_present = fd_vinyl_io_seq_present( io );
-
- if( FD_UNLIKELY( (!compact_max) | ((seq_present-seq_past)<=gc_thresh) | (gc_eager<0) ) ) return;
-
- fd_vinyl_meta_t * meta = vinyl->meta;
- fd_vinyl_line_t * line = vinyl->line;
- ulong line_cnt = vinyl->line_cnt;
- ulong garbage_sz = vinyl->garbage_sz;
-
- fd_vinyl_meta_ele_t * ele0 = meta->ele;
- ulong ele_max = meta->ele_max;
- ulong meta_seed = meta->seed;
-
- fd_vinyl_data_t * data = vinyl->data;
-
- fd_vinyl_data_vol_t * vol = data->vol; (void)vol;
- ulong vol_cnt = data->vol_cnt; (void)vol_cnt;
-
- ulong seq = seq_past;
-
- for( ulong rem=compact_max; rem; rem-- ) {
-
- /* At this point, we've compacted [seq_past,seq) (cyclic), with
- items still needed in this range at [seq_present,seq_future)
- (cyclic). We still have [seq,seq_present) (cyclic), containing
- garbage_sz bytes to compact.
-
- If the new past region is small enough or there is a relatively
- small amount of garbage in this region, we consider the bstream's
- past fully compacted. */
-
- ulong past_sz_new = fd_vinyl_io_seq_future( io ) - seq;
- if( FD_UNLIKELY( (past_sz_new <= gc_thresh ) |
- (garbage_sz <= (past_sz_new >> gc_eager)) |
- (fd_vinyl_seq_ge( seq, seq_present ) ) ) ) {
- FD_CRIT( fd_vinyl_seq_le( seq, seq_present ), "corruption detected" );
- if( FD_UNLIKELY( fd_vinyl_seq_eq( seq, seq_present ) ) ) FD_CRIT( !garbage_sz, "corruption detected" );
- break;
- }
-
- /* At this point, there is enough garbage to do some more
- compaction. Load the leading block of the object at seq and
- determine if this object is needed to recover the bstream's state
- at seq_present.
-
- That is, we determine if the object at bstream_past_new is the
- version of a pair that exists at bstream seq_present. If so, we
- append a copy to the bstream's present.
-
- When compacting is complete, we forget the region containing the
- copy at seq. This then effectively moves the copy from seq to
- seq_future without any risk of losing data while allowing
- compaction to be done with large amounts of async I/O overlapped
- with compaction processing (metadata lookups, hash validation,
- etc).
-
- This move will not move the pair past any conflicting operations
- later in the bstream's past (almost definitionally so as the pair
- is the most recent version). Thus set of pairs recovered at
- seq_future will be identical to the set of pairs recovered at
- seq_present. */
-
- fd_vinyl_bstream_block_t block[1];
-
- fd_vinyl_io_read_imm( io, seq, block, FD_VINYL_BSTREAM_BLOCK_SZ );
-
- ulong ctl = block->ctl;
-
- int type = fd_vinyl_bstream_ctl_type( ctl );
-
- switch( type ) {
-
- case FD_VINYL_BSTREAM_CTL_TYPE_PAIR: {
-
- /* At this point, we've read a pair's leading block into block.
- Validate the pair was completely written. It's okay if we are
- in a move (move block processing the previous iteration already
- confirmed this pair is the proper). */
-
- int pair_style = fd_vinyl_bstream_ctl_style( ctl );
- ulong pair_val_esz = fd_vinyl_bstream_ctl_sz ( ctl );
- fd_vinyl_key_t const * pair_key = &block->phdr.key;
- ulong pair_val_sz = (ulong)block->phdr.info.val_sz;
-
- ulong pair_sz = fd_vinyl_bstream_pair_sz( pair_val_esz );
-
- int truncated = (pair_sz > (seq_present - seq)); /* Wrapping safe */
- int bad_esz = (pair_val_esz > FD_VINYL_VAL_MAX);
- int bad_sz = (pair_val_sz > FD_VINYL_VAL_MAX);
-
- FD_CRIT( !(truncated | bad_esz | bad_sz), truncated ? "truncated pair" :
- bad_esz ? "unexpected pair value encoded size" :
- "pair value size too large" );
-
-# if FD_PARANOID
- fd_vinyl_bstream_block_t _ftr[1];
- fd_vinyl_bstream_block_t * ftr = _ftr;
-
- if( FD_UNLIKELY( pair_sz <= FD_VINYL_BSTREAM_BLOCK_SZ ) ) ftr = block;
- else fd_vinyl_io_read_imm( io, seq + pair_sz - FD_VINYL_BSTREAM_BLOCK_SZ, ftr, FD_VINYL_BSTREAM_BLOCK_SZ );
-
- FD_ALERT( !fd_vinyl_bstream_pair_test_fast( io_seed, seq, block, ftr ), "corruption detected" );
-# endif
-
- /* At this point, we appear to have a valid pair. Query the
- vinyl's meta to determine if this is the version of the pair at
- bstream seq_present. Since this implementation is doing single
- threaded recovery, we can use the single threaded optimized
- meta APIs. */
-
- ulong pair_memo = fd_vinyl_key_memo( meta_seed, pair_key );
-
- ulong _ele_idx; /* avoid pointer escape */
- int err = fd_vinyl_meta_query_fast( ele0, ele_max, pair_key, pair_memo, &_ele_idx );
- ulong ele_idx = _ele_idx;
-
- if( FD_LIKELY( !err ) ) {
-
- /* At this point, a version of pair key is mapped */
-
- if( FD_LIKELY( fd_vinyl_meta_ele_in_bstream( &ele0[ ele_idx ] ) ) ) {
-
- /* At this point, a version of pair key exists at bstream
- seq_present (i.e. is not in the process of being created by
- a client). */
-
- ulong pair_seq = ele0[ ele_idx ].seq;
-
- if( FD_LIKELY( fd_vinyl_seq_eq( pair_seq, seq ) ) ) {
-
- /* At this point, the version of pair key at seq is the
- version of pair key that exists at bstream seq_present.
- Validate the metadata. */
-
- FD_CRIT( !memcmp( &ele0[ ele_idx ].phdr, &block->phdr, sizeof(fd_vinyl_bstream_phdr_t) ), "corruption detected" );
-
- /* If the pair is cached and not acquired for modify, append
- the cached copy in the target style. Otherwise, append a
- (possibly recoded) copy from the bstream. */
-
- int pair_style_new;
- ulong pair_val_esz_new;
- ulong pair_seq_new;
-
- int do_copy = 1;
-
- ulong line_idx = ele0[ ele_idx ].line_idx;
-
- if( FD_LIKELY( line_idx!=ULONG_MAX ) ) { /* Pair is in cache */
-
- FD_CRIT( line_idxline_idx==line_idx, "corruption detected" );
- FD_CRIT ( !obj->rd_active, "corruption detected" );
-
- ulong line_ctl = line[ line_idx ].ctl;
-
- if( FD_LIKELY( fd_vinyl_line_ctl_ref( line_ctl )>=0L ) ) { /* Pair cached and not acquired for modify */
-
- fd_vinyl_bstream_phdr_t * phdr = fd_vinyl_data_obj_phdr( obj );
-
- FD_ALERT( !memcmp( phdr, &block->phdr, sizeof(fd_vinyl_bstream_phdr_t) ), "corruption detected" );
-
- pair_seq_new = fd_vinyl_io_append_pair_inplace( io, style, phdr, &pair_style_new, &pair_val_esz_new );
-
- do_copy = 0;
-
- }
-
- }
-
- if( do_copy ) { /* Pair is either in cache or acquired for modify, append from the bstream */
-
- if( FD_LIKELY( (pair_style!=FD_VINYL_BSTREAM_CTL_STYLE_RAW) |
- (style ==FD_VINYL_BSTREAM_CTL_STYLE_RAW) |
- (pair_sz ==FD_VINYL_BSTREAM_BLOCK_SZ ) ) ) {
-
- /* At this point, the pair is already stored in an
- encoded format, the preferred format for storing
- encoded pairs is raw and/or encoding the pair will
- not make it any smaller in the bstream. Copy the
- pair as is from seq to seq_future. The reason we
- don't reencode the pair in the second case is that
- this pair has likely not been touched since it last
- got to the bstream's seq_past. It would be waste to
- compute and bstream storage to uncompress it as we
- copy it. */
-
- pair_style_new = pair_style;
- pair_val_esz_new = fd_vinyl_bstream_ctl_sz( ele0[ ele_idx ].phdr.ctl );
- pair_seq_new = fd_vinyl_io_copy( io, pair_seq, pair_sz );
-
- } else {
-
- /* At this point, the pair is stored in a raw encoded
- format, the preferred format is an encoded format and
- there is a possibility that encoding it will make it
- smaller. Encode the pair as we copy it from seq to
- seq_future.
-
- To do this, we allocate enough scratch from the io
- append spad to cover the worst case encoded pair and
- the raw pair (this sets the lower bound on how large
- the io append spad must be). Then we read the raw
- pair into the trailing part of the scratch and encode
- from that into the leading part of the scratch.
-
- We play some games with the spad_used so that the
- append_pair_inplace will not invalidate the read and
- so that we use scratch as efficiently as possible
- when there is lots of stuff to compress. */
-
- ulong cpair_max = fd_vinyl_bstream_pair_sz( (ulong)LZ4_COMPRESSBOUND( (int)pair_val_sz ) );
- ulong scratch_max = cpair_max + pair_sz;
-
- fd_vinyl_bstream_phdr_t * cphdr = (fd_vinyl_bstream_phdr_t *)
- fd_vinyl_io_alloc( io, scratch_max, FD_VINYL_IO_FLAG_BLOCKING );
-
- fd_vinyl_bstream_phdr_t * phdr = (fd_vinyl_bstream_phdr_t *)((ulong)cphdr + cpair_max);
-
- fd_vinyl_io_read_imm( io, seq, phdr, pair_sz );
-
- fd_vinyl_io_trim( io, scratch_max );
-
- pair_seq_new = fd_vinyl_io_append_pair_inplace( io, style, phdr, &pair_style_new, &pair_val_esz_new );
-
- /* At this point, we either are appending the encoded
- pair from the leading part of the scratch and
- spad_used is correct or we are appending the pair
- from the trailing part and spad_used does not include
- it. Adjust the spad used for the later case. In
- this second case, we end up with a temporary hole in
- the scratch when we decided not to copy into an
- encoded form. This just scratch is used less
- efficiently in the unlikely case in order to use it
- more efficiently in the likely case (the correct
- tradeoff). */
-
- if( FD_UNLIKELY( pair_style_new==FD_VINYL_BSTREAM_CTL_STYLE_RAW ) ) io->spad_used += scratch_max;
-
- }
- }
-
- /* Note: we don't need to prepare here because we aren't
- modifying shared fields. */
-
- ele0[ ele_idx ].phdr.ctl = fd_vinyl_bstream_ctl( FD_VINYL_BSTREAM_CTL_TYPE_PAIR, pair_style_new, pair_val_esz_new );
- ele0[ ele_idx ].seq = pair_seq_new;
-
- } else {
-
- /* The version of the pair at bstream seq was replaced. The
- most recent version of this pair is at pair_seq. */
-
- FD_CRIT( fd_vinyl_seq_gt( pair_seq, seq ), "corruption detected" );
-
- garbage_sz -= pair_sz;
-
- }
-
- } else {
-
- /* The pair at bstream seq does not exist in the bstream at
- bstream seq_present. It is in the vinyl meta because it is
- being created. We wouldn't be in the process of creating
- it unless this pair (or a subsequent version of it) was
- erased or moved before seq_present. So this pair is
- garbage. */
-
- garbage_sz -= pair_sz;
-
- }
-
- } else {
-
- /* The pair at bstream seq does not exist in the bstream at
- bstream seq_present. This pair (or a subsequent version of
- it) was erased or moved before seq_present. So this pair
- is garbage. */
-
- garbage_sz -= pair_sz;
-
- }
-
- seq += pair_sz;
- break;
-
- }
-
- case FD_VINYL_BSTREAM_CTL_TYPE_DEAD:
- case FD_VINYL_BSTREAM_CTL_TYPE_MOVE:
- case FD_VINYL_BSTREAM_CTL_TYPE_PART: {
-
- /* DEAD blocks can always be compacted out because the version of
- the pair they reference is not in the current view of the
- bstream (because that version was unmapped when the DEAD was
- written), that version was located at an earlier location than
- the DEAD (because blocks are appended sequentially) and thus
- that version has already been compacted out (because a previous
- iteration of this would have encountered it before getting this
- DEAD block, would have detecting that version was no longer
- needed and compacted it at that time instead of moving it to a
- higher sequence number).
-
- MOVE blocks can always be compacted out for the same reasons as
- the above with the twist that, compacting the move block makes
- the pair following look like a create from the point of view of
- a recovery starting at the pair. This is immaterial though
- because doesn't change the recovered view if recovery starts
- on the block after the move.
-
- PART blocks can always be compacted because they are just
- informational (to help partition the bstream past in parallel
- recovery) and this partition ends bstream blocks that have
- already been compacted out.
-
- We validate the block because we already have the data anyway. */
-
- FD_ALERT( !fd_vinyl_bstream_block_test( io_seed, block ), "corruption detected" );
-
- garbage_sz -= FD_VINYL_BSTREAM_BLOCK_SZ;
- seq += FD_VINYL_BSTREAM_BLOCK_SZ;
- break;
-
- }
-
- case FD_VINYL_BSTREAM_CTL_TYPE_ZPAD: {
-
- /* ZPAD blocks can always be compacted out because they are no-ops
- from the point of view of bstream processing (the underlying
- I/O layer can insert these so that, for example, a multi-block
- pair is never split across two different physical volumes).
- Note that zpad blocks aren't included in garbage_sz because we
- don't control when they get created (and thus can't easily
- update garbage_sz to account for them when they are created). */
-
- FD_ALERT( !fd_vinyl_bstream_zpad_test( io_seed, seq, block ), "corruption detected" );
-
- seq += FD_VINYL_BSTREAM_BLOCK_SZ;
- break;
-
- }
-
- default: FD_LOG_CRIT(( "%016lx: unknown type (%x)", seq, (uint)type ));
-
- }
-
- }
-
- /* At this point, we've made copies of all info in [seq_past,seq)
- (cyclic) to [seq_present,seq_future) (cyclic) needed to recover the
- bstream's state at seq_present. We commit the new, forget the old
- and update the garbage size to finish this compaction. */
-
- fd_vinyl_io_commit( io, FD_VINYL_IO_FLAG_BLOCKING );
- fd_vinyl_io_forget( io, seq );
-
- vinyl->garbage_sz = garbage_sz;
-}
diff --git a/src/vinyl/fd_vinyl_ctl.c b/src/vinyl/fd_vinyl_ctl.c
deleted file mode 100644
index 3a5ade25bf8..00000000000
--- a/src/vinyl/fd_vinyl_ctl.c
+++ /dev/null
@@ -1,712 +0,0 @@
-/* For O_DIRECT and O_NOATIME */
-#define _GNU_SOURCE
-
-#include "fd_vinyl.h"
-#include "../util/pod/fd_pod.h"
-
-#include
-#include
-#include
-#include
-#include
-
-FD_IMPORT_CSTR( fd_vinyl_ctl_help, "src/vinyl/fd_vinyl_ctl_help" );
-
-static int
-fd_vinyl_main( int argc,
- char ** argv ) {
-
- ulong seed_default = fd_cstr_hash_append( (ulong)fd_log_wallclock(), fd_log_host() );
-
- char const * _pod = fd_env_strip_cmdline_cstr ( &argc, &argv, "--pod", NULL, NULL );
- char const * _cfg = fd_env_strip_cmdline_cstr ( &argc, &argv, "--cfg", NULL, NULL );
- ulong seed = fd_env_strip_cmdline_ulong( &argc, &argv, "--seed", NULL, seed_default );
- char const * type = fd_env_strip_cmdline_cstr ( &argc, &argv, "--type", NULL, "mm" );
- char const * path = fd_env_strip_cmdline_cstr ( &argc, &argv, "--path", NULL, NULL );
- int dsync = fd_env_strip_cmdline_int ( &argc, &argv, "--dsync", NULL, 0 );
- int direct = fd_env_strip_cmdline_int ( &argc, &argv, "--direct", NULL, 0 );
- int noatime = fd_env_strip_cmdline_int ( &argc, &argv, "--noatime", NULL, 0 );
- char const * _page_sz = fd_env_strip_cmdline_cstr ( &argc, &argv, "--page-sz", NULL, "gigantic" );
- ulong page_cnt = fd_env_strip_cmdline_ulong( &argc, &argv, "--page-cnt", NULL, 1UL );
- ulong near_cpu = fd_env_strip_cmdline_ulong( &argc, &argv, "--near-cpu", NULL, fd_log_cpu_id() );
- int reset = fd_env_strip_cmdline_int ( &argc, &argv, "--reset", NULL, 0 );
- char const * info = fd_env_strip_cmdline_cstr ( &argc, &argv, "--info", NULL, NULL );
- ulong io_seed = fd_env_strip_cmdline_ulong( &argc, &argv, "--io-seed", NULL, 0UL );
-
- int open_flags = O_RDWR | (dsync ? O_DSYNC : 0 ) | (direct ? O_DIRECT : 0) | (noatime ? O_NOATIME : 0);
- ulong page_sz = fd_cstr_to_shmem_page_sz( _page_sz );
- ulong info_sz = info ? (strlen( info )+1UL) : 0UL;
-
- if( FD_UNLIKELY( !_pod ) ) FD_LOG_ERR(( "--pod not specified" ));
- if( FD_UNLIKELY( !page_sz ) ) FD_LOG_ERR(( "bad --page-sz" ));
-
- FD_LOG_NOTICE(( "Attaching to --pod %s", _pod ));
-
- uchar const * pod = fd_wksp_pod_attach( _pod ); /* logs details, guaranteed to succeed */
- uchar const * cfg;
- if( FD_UNLIKELY( !_cfg ) ) {
- FD_LOG_NOTICE(( "--cfg not specified (using pod root for config)" ));
- cfg = pod;
- } else {
- FD_LOG_NOTICE(( "Finding config --cfg %s", _cfg ));
- cfg = fd_pod_query_subpod( pod, _cfg );
- if( FD_UNLIKELY( !cfg ) ) FD_LOG_ERR(( "config not found" ));
- }
-
- FD_LOG_NOTICE(( "Extracting pod configuration" ));
-
- /* See below for explanation of defaults */
- ulong spad_max = fd_pod_query_ulong( cfg, "spad_max", fd_vinyl_io_spad_est() );
- ulong async_min = fd_pod_query_ulong( cfg, "async_min", 2UL );
- ulong async_max = fd_pod_query_ulong( cfg, "async_max", 2UL*async_min );
- ulong part_thresh = fd_pod_query_ulong( cfg, "part_thresh", 1UL<<30 );
- ulong gc_thresh = fd_pod_query_ulong( cfg, "gc_thresh", 8UL<<30 );
- int gc_eager = fd_pod_query_int ( cfg, "gc_eager", 2 );
- int style = fd_pod_query_int ( cfg, "style", FD_VINYL_BSTREAM_CTL_STYLE_LZ4 );
- int level = fd_pod_query_int ( cfg, "level", 1 );
-
- FD_LOG_NOTICE(( "Processing command line configuration overrides" ));
-
- char const * _style = fd_env_strip_cmdline_cstr( &argc, &argv, "--style", NULL, NULL );
- if( _style ) style = fd_cstr_to_vinyl_bstream_ctl_style( _style );
-
- spad_max = fd_env_strip_cmdline_ulong( &argc, &argv, "--spad-max", NULL, spad_max );
- async_min = fd_env_strip_cmdline_ulong( &argc, &argv, "--async-min", NULL, async_min );
- async_max = fd_env_strip_cmdline_ulong( &argc, &argv, "--async-max", NULL, async_max );
- part_thresh = fd_env_strip_cmdline_ulong( &argc, &argv, "--part-thresh", NULL, part_thresh );
- gc_thresh = fd_env_strip_cmdline_ulong( &argc, &argv, "--gc-thresh", NULL, gc_thresh );
- gc_eager = fd_env_strip_cmdline_int ( &argc, &argv, "--gc-eager", NULL, gc_eager );
- level = fd_env_strip_cmdline_int ( &argc, &argv, "--level", NULL, level );
-
- FD_LOG_NOTICE(( "Mapping vinyl memory regions" ));
-
- void * _vinyl = fd_wksp_pod_map( cfg, "vinyl" ); ulong vinyl_footprint = fd_pod_query_ulong( cfg, "vinyl_footprint", 0UL );
- void * _cnc = fd_wksp_pod_map( cfg, "cnc" ); ulong cnc_footprint = fd_pod_query_ulong( cfg, "cnc_footprint", 0UL );
- void * _meta = fd_wksp_pod_map( cfg, "meta" ); ulong meta_footprint = fd_pod_query_ulong( cfg, "meta_footprint", 0UL );
- void * _line = fd_wksp_pod_map( cfg, "line" ); ulong line_footprint = fd_pod_query_ulong( cfg, "line_footprint", 0UL );
- void * _io = fd_wksp_pod_map( cfg, "io" ); ulong io_footprint = fd_pod_query_ulong( cfg, "io_footprint", 0UL );
- void * _ele = fd_wksp_pod_map( cfg, "ele" ); ulong ele_footprint = fd_pod_query_ulong( cfg, "ele_footprint", 0UL );
- void * _obj = fd_wksp_pod_map( cfg, "obj" ); ulong obj_footprint = fd_pod_query_ulong( cfg, "obj_footprint", 0UL );
-
-# define TEST( c, msg ) do { \
- if( FD_UNLIKELY( !(c) ) ) FD_LOG_ERR(( "FAIL: %s (%s)", #c, (msg) )); \
- } while(0)
-
- fd_wksp_t * wksp = fd_wksp_containing( _obj );
- TEST( wksp, "fd_wksp_containing failed" );
-
- TEST( fd_ulong_is_aligned( (ulong)_vinyl, fd_vinyl_io_mm_align() ), "bad alloc" );
- TEST( vinyl_footprint >= fd_vinyl_footprint(), "bad alloc" );
-
- int is_mmio = !strcmp( type, "mm" );
-
- FD_LOG_NOTICE(( "io config"
- "\n\t--type \"%s\""
- "\n\t--spad-max %lu bytes"
- "\n\t--path \"%s\""
- "\n\t--dsync %i"
- "\n\t--direct %i"
- "\n\t--noatime %i"
- "\n\t--page-sz \"%s\"%s"
- "\n\t--page-cnt %lu pages%s"
- "\n\t--near-cpu %lu%s"
- "\n\t--reset %i"
- "\n\t--info \"%s\" (info_sz %lu bytes%s)"
- "\n\t--io-seed 0x%016lx%s",
- type, spad_max, path ? path : "(null)", dsync, direct, noatime,
- _page_sz, is_mmio && !path ? "" : " (ignored)",
- page_cnt, is_mmio && !path ? "" : " (ignored)",
- near_cpu, is_mmio && !path ? "" : " (ignored)",
- reset, info ? info : "(null)", info_sz, reset ? "" : ", ignored", io_seed, reset ? "" : " (ignored)" ));
-
- FD_LOG_NOTICE(( "Joining bstream" ));
-
- int bstream_type;
- int fd = -1;
- void * mmio;
- ulong mmio_sz;
-
- fd_vinyl_io_t * io;
-
- if( FD_LIKELY( is_mmio ) ) {
-
- if( FD_LIKELY( path ) ) {
-
- fd = open( path, open_flags, (mode_t)0 );
-
- if( FD_LIKELY( fd!=-1 ) ) { /* --path seems to be file (e.g. testing or basic I/O with weak persistence) */
-
- TEST( !direct, "--direct 1 not supported with --type mm and file --path" );
- /* FIXME: is dsync valid for mmio? (unclear) noatime? (probably) */
-
- FD_LOG_NOTICE(( "Using file at --path as a memory mapped bstream" ));
-
- bstream_type = 0;
-
- int err = fd_io_mmio_init( fd, FD_IO_MMIO_MODE_READ_WRITE, &mmio, &mmio_sz );
- if( FD_UNLIKELY( err ) ) FD_LOG_ERR(( "fd_io_mmio_init failed (%i-%s)", err, fd_io_strerror( err ) ));
-
- } else { /* --path doesn't seem to be a file, use shmem (e.g. testing or ultra HPC with weak persistence) */
-
- FD_LOG_NOTICE(( "Using shmem region at --path as a memory mapped bstream (ignoring --dsync, --direct and --noatime)" ));
-
- bstream_type = 1;
-
- fd_shmem_join_info_t info[1];
- mmio = fd_shmem_join( path, FD_SHMEM_JOIN_MODE_READ_WRITE, 0, NULL, NULL, info );
- TEST( mmio, "fd_shmem_join failed" );
- mmio_sz = info->page_sz * info->page_cnt;
-
- }
-
- } else { /* No --path, use an anonymous region (e.g. testing or ultra HPC with no persistence) */
-
- FD_LOG_NOTICE(( "Using an anonymous shmem region as a memory mapped bstream "
- "(ignoring --dsync, --direct and --noatime, setting --reset to 1)" ));
-
- bstream_type = 2;
- reset = 1;
-
- mmio = fd_shmem_acquire( page_sz, page_cnt, near_cpu );
- TEST( mmio, "fd_shmem_acquire failed" );
- mmio_sz = page_sz*page_cnt;
-
- }
-
- TEST( fd_ulong_is_aligned( (ulong)_io, fd_vinyl_io_mm_align() ), "bad alloc" );
- TEST( io_footprint >= fd_vinyl_io_mm_footprint( spad_max ), "bad alloc" );
-
- io = fd_vinyl_io_mm_init( _io, spad_max, mmio, mmio_sz, reset, info, info_sz, io_seed );
- TEST( io, "fd_vinyl_io mm_init failed" );
-
- } else if( !strcmp( type, "bd" ) ) {
-
- if( FD_VINYL_BSTREAM_BLOCK_SZ<512UL ) TEST( !direct, "--direct 1 not supported with --type bd and BLOCK_SZ<512" );
-
- TEST( path, "--path not specified for --type bd" );
-
- FD_LOG_NOTICE(( "Using --path as a block device bstream" ));
-
- bstream_type = 3;
-
- fd = open( path, open_flags, 0 );
- if( FD_UNLIKELY( fd==-1 ) ) FD_LOG_ERR(( "open failed (%i-%s)", errno, fd_io_strerror( errno ) ));
-
- TEST( fd_ulong_is_aligned( (ulong)_io, fd_vinyl_io_bd_align() ), "bad wksp alloc" );
- TEST( io_footprint >= fd_vinyl_io_bd_footprint( spad_max ), "bad wksp alloc" );
-
- io = fd_vinyl_io_bd_init( _io, spad_max, fd, reset, info, info_sz, io_seed );
- TEST( io, "fd_vinyl_io bd_init failed" );
-
- } else {
-
- FD_LOG_ERR(( "Unsupported io type" ));
-
- }
-
- FD_LOG_NOTICE(( "Creating vinyl" ));
-
- fd_tpool_t * tpool = NULL;
-
- ulong thread_cnt = fd_tile_cnt();
-
- if( thread_cnt>1UL ) {
- FD_LOG_NOTICE(( "Creating temporary tpool from all %lu tiles for thread parallel init", thread_cnt ));
-
- static uchar _tpool[ FD_TPOOL_FOOTPRINT( FD_TILE_MAX ) ] __attribute__((aligned(FD_TPOOL_ALIGN)));
-
- tpool = fd_tpool_init( _tpool, thread_cnt, 0UL ); /* logs details */
- if( FD_UNLIKELY( !tpool ) ) FD_LOG_ERR(( "fd_tpool_init failed" ));
-
- for( ulong thread_idx=1UL; thread_idx~ 8 GiB used */
- int gc_eager = 2; /* target <~25% garbage items */
- int style = FD_VINYL_BSTREAM_CTL_STYLE_LZ4; /* enable data compression */
- int level = 1; /* do a hard reset by default */
- ulong obj_footprint_avg = 2UL*FD_VINYL_BSTREAM_BLOCK_SZ + 8UL + 1UL; /* see note above */
-
- int err = 0;
- int cnt = 0;
-
- while( argc ) {
- char const * cmd = argv[0];
- SHIFT(1);
-
- if( !strcmp( cmd, "help" ) ) {
-
- fflush( stdout ); fflush( stderr );
- fputs( fd_vinyl_ctl_help, stdout );
- fflush( stdout ); fflush( stderr );
-
- FD_LOG_NOTICE(( "%i: %s: success", cnt, cmd ));
-
- } else if( !strcmp( cmd, "set" ) ) {
-
- if( FD_UNLIKELY( argc<2 ) ) FD_LOG_ERR(( "%i: %s: too few arguments\n\tDo %s help for help", cnt, cmd, bin ));
-
- char const * key = argv[0];
- char const * val = argv[1];
-
- /**/ if( !strcmp( key, "wksp_tag" ) ) wksp_tag = fd_cstr_to_ulong ( val );
- else if( !strcmp( key, "pod_max" ) ) pod_max = fd_cstr_to_ulong ( val );
- else if( !strcmp( key, "cfg_path" ) ) cfg_path = val;
- else if( !strcmp( key, "cnc_app_sz" ) ) cnc_app_sz = fd_cstr_to_ulong ( val );
- else if( !strcmp( key, "spad_max" ) ) spad_max = fd_cstr_to_ulong ( val );
- else if( !strcmp( key, "async_min" ) ) async_min = fd_cstr_to_ulong ( val );
- else if( !strcmp( key, "async_max" ) ) async_max = fd_cstr_to_ulong ( val );
- else if( !strcmp( key, "part_thresh" ) ) part_thresh = fd_cstr_to_ulong ( val );
- else if( !strcmp( key, "gc_thresh" ) ) gc_thresh = fd_cstr_to_ulong ( val );
- else if( !strcmp( key, "gc_eager" ) ) gc_eager = fd_cstr_to_int ( val );
- else if( !strcmp( key, "style" ) ) style = fd_cstr_to_vinyl_bstream_ctl_style( val );
- else if( !strcmp( key, "level" ) ) level = fd_cstr_to_int ( val );
- else if( !strcmp( key, "obj_footprint_avg" ) ) obj_footprint_avg = fd_cstr_to_ulong ( val );
- else FD_LOG_ERR(( "%i: %s %s %s: unknown key", cnt, cmd, key, val));
-
- FD_LOG_NOTICE(( "%i: %s %s %s: success", cnt, cmd, key, val ));
- SHIFT(2);
-
- } else if( !strcmp( cmd, "alloc-memory" ) ) {
-
- if( FD_UNLIKELY( argc<5 ) ) FD_LOG_ERR(( "%i: %s: too few arguments\n\tDo %s help for help", cnt, cmd, bin ));
-
- char const * mem = argv[0];
- ulong page_cnt = fd_cstr_to_ulong ( argv[1] );
- ulong page_sz = fd_cstr_to_shmem_page_sz( argv[2] );
- char const * seq = argv[3];
- ulong mode = fd_cstr_to_ulong_octal ( argv[4] );
-
- if( FD_UNLIKELY( !page_cnt ) )
- FD_LOG_ERR(( "%i: %s %s %lu %s %s 0%03lo: bad page count\n\t"
- "Do %s help for help", cnt, cmd, mem, page_cnt, argv[2], seq, mode, bin ));
-
- if( FD_UNLIKELY( !page_sz ) )
- FD_LOG_ERR(( "%i: %s %s %lu %s %s 0%03lo: bad page size\n\t"
- "Do %s help for help", cnt, cmd, mem, page_cnt, argv[2], seq, mode, bin ));
-
- /* Partition the pages over the seq */
-
- ulong sub_page_cnt[ 512UL ];
- ulong sub_cpu_idx [ 512UL ];
- ulong sub_cnt = fd_cstr_to_ulong_seq( seq, sub_cpu_idx, 512UL );
-
- if( FD_UNLIKELY( !sub_cnt ) )
- FD_LOG_ERR(( "%i: %s %s %lu %s %s 0%03lo: empty or invalid cpu sequence\n\t"
- "Do %s help for help", cnt, cmd, mem, page_cnt, argv[2], seq, mode, bin ));
-
- if( FD_UNLIKELY( sub_cnt>512UL ) )
- FD_LOG_ERR(( "%i: %s %s %lu %s %s 0%03lo: sequence too long, increase limit in fd_vinyl_ctl.c\n\t"
- "Do %s help for help", cnt, cmd, mem, page_cnt, argv[2], seq, mode, bin ));
-
- /* TODO: consider striping instead of blocking */
-
- ulong sub_page_min = page_cnt / sub_cnt;
- ulong sub_page_rem = page_cnt % sub_cnt;
- for( ulong sub_idx=0UL; sub_idx(1UL<<32)) ) )
- FD_LOG_ERR(( "%i: %s %s %lu 0%03lo: bad number of gigabytes\n\t"
- "Do %s help for help", cnt, cmd, path, GiB_cnt, mode, bin ));
-
- ulong sz = GiB_cnt << 30;
-
- int fd = open( path, O_RDWR | O_CREAT | O_EXCL, (mode_t)mode );
- if( FD_UNLIKELY( fd==-1 ) )
- FD_LOG_ERR(( "%i: %s %s %lu 0%03lo: open failed (%i-%s)\n\tDo %s help for help",
- cnt, cmd, path, GiB_cnt, mode, errno, fd_io_strerror( errno ), bin ));
-
- int err = fd_io_truncate( fd, sz );
- if( FD_UNLIKELY( err ) )
- FD_LOG_ERR(( "%i: %s %s %lu 0%03lo: fd_io_truncate failed (%i-%s)\n\tDo %s help for help",
- cnt, cmd, path, GiB_cnt, mode, err, fd_io_strerror( err ), bin ));
-
- if( FD_UNLIKELY( close( fd ) ) )
- FD_LOG_WARNING(( "%i: %s %s %lu 0%03lo: close failed (%i-%s); attempting to continue",
- cnt, cmd, path, GiB_cnt, mode, errno, fd_io_strerror( errno ) ));
-
- FD_LOG_NOTICE(( "%i: %s %s %lu 0%03lo: success", cnt, cmd, path, GiB_cnt, mode ));
- SHIFT(3);
-
- } else if( !strcmp( cmd, "free-storage" ) ) {
-
- if( FD_UNLIKELY( argc<1 ) )
- FD_LOG_ERR(( "%i: %s: too few arguments\n\tDo %s help for help", cnt, cmd, bin ));
-
- char const * store = argv[0];
-
- if( FD_UNLIKELY( unlink( store ) ) )
- FD_LOG_ERR(( "%i: %s %s: unlink failed (%i-%s)\n\tDo %s help for help",
- cnt, cmd, store, errno, fd_io_strerror( errno ), bin ));
-
- FD_LOG_NOTICE(( "%i: %s %s: success", cnt, cmd, store ));
- SHIFT(1);
-
- } else if( !strcmp( cmd, "new" ) ) {
-
- if( FD_UNLIKELY( argc<3 ) )
- FD_LOG_ERR(( "%i: %s: too few arguments\n\tDo %s help for help", cnt, cmd, bin ));
-
- char const * mem = argv[0];
- ulong pair_max = fd_cstr_to_ulong( argv[1] );
- ulong GiB_max = fd_cstr_to_ulong( argv[2] );
-
-# define TEST( c, msg ) do { \
- if( FD_UNLIKELY( !(c) ) ) \
- FD_LOG_ERR(( "%i: %s %s %lu %lu: FAIL %s (%s)\n\tDo %s help for help", \
- cnt, cmd, mem, pair_max, GiB_max, #c, (msg), bin )); \
- } while(0)
-
- ulong ele_max = fd_ulong_pow2_up( pair_max + 1UL );
- ulong lock_cnt = fd_vinyl_meta_lock_cnt_est( ele_max );
- ulong probe_max = ele_max;
-
- TEST( (0UL=3UL, "increase maximum GiB allowed and/or decrease pair_max / spad_max / pod_max / cnc_app_sz" );
-
- ulong line_footprint = sizeof(fd_vinyl_line_t)*line_max;
-
- mem_req += line_footprint;
-
- ulong obj_footprint = fd_ulong_align_dn( mem_max - mem_req, alignof(fd_vinyl_data_obj_t) );
-
- mem_req += obj_footprint;
-
- TEST( mem_req<=mem_max, "internal error" );
-
- /* Attach to the memory that will contain this vinyl instance */
-
- fd_wksp_t * wksp = fd_wksp_attach( mem );
- TEST( wksp, "fd_wksp_attach failed" );
-
- /* Allocate all the needed regions. Note that, even though the
- vinyl io tile state is neither shared nor persistent, we
- allocate it here so the vinyl tile itself doesn't have to
- allocate it (it is dynamically sized and rather large). Since
- we want the vinyl tile to be able to pick the type of io
- interface and bstream store at startup without creating a new
- vinyl instance, we allocated an upper bound for all supported
- io types above (they are all roughly the same size anyway).
-
- Alternatively, we could have the vinyl tile do this allocation
- at tile startup. But this would create some additional
- complexity: the vinyl tile would need an allocator (and then
- one potentially has allocations left over from previous runs
- that did not terminate cleanly).
-
- Similar considerations apply for the data cache state, vinyl
- tile state, lines and data objects.
-
- Note also that, though meta is shared and persistent,
- persistence should only be used for post mortem debugging (the
- meta cache is recreated from scratch on vinyl tile startup). */
-
- void * _pod = fd_wksp_alloc_laddr( wksp, pod_align, pod_footprint, wksp_tag );
- void * _vinyl = fd_wksp_alloc_laddr( wksp, vinyl_align, vinyl_footprint, wksp_tag );
- void * _cnc = fd_wksp_alloc_laddr( wksp, cnc_align, cnc_footprint, wksp_tag );
- void * _meta = fd_wksp_alloc_laddr( wksp, meta_align, meta_footprint, wksp_tag );
- void * _io = fd_wksp_alloc_laddr( wksp, io_align, io_footprint, wksp_tag );
- void * _line = fd_wksp_alloc_laddr( wksp, line_align, line_footprint, wksp_tag ); /* This is kinda big */
- void * _ele = fd_wksp_alloc_laddr( wksp, ele_align, ele_footprint, wksp_tag ); /* This is really big */
- void * _obj = fd_wksp_alloc_laddr( wksp, obj_align, obj_footprint, wksp_tag );
-
- /* Note: the bigger obj gets, the better the performance (until it
- is large enough pairs always fit in cache but that would dwarf
- ele). In typical use cases, this is probably smaller to
- comparable to ele (resulting in much cheaper hardware at
- comparable speeds for typical usage patterns but less robust
- performance for extreme usage patterns). */
-
- TEST( (!!_pod) & (!!_vinyl) & (!!_cnc) & (!!_io) & (!!_line) & (!!_ele) & (!!_obj),
- "fd_wksp_alloc_laddr failed (free unneeded allocs or increase wksp size or partitions)" );
-
- /* Format and the join the pod and create the cfg subpod as
- necessary. */
-
- uchar * pod = fd_pod_join( fd_pod_new( _pod, pod_max ) );
- TEST( pod, "internal error" );
-
- uchar * cfg;
- if( !cfg_path ) cfg = pod;
- else {
- ulong off = fd_pod_alloc_subpod( pod, cfg_path, 1024UL );
- TEST( off, "use shorter cfg_path or increase pod_max?" );
- cfg = pod + off;
- }
-
- /* Populate the pod */
-
- char tmp[ FD_WKSP_CSTR_MAX ];
-
- TEST( fd_pod_insert_cstr( cfg, "vinyl", fd_wksp_cstr_laddr( _vinyl, tmp ) ), "increase pod_max?" );
- TEST( fd_pod_insert_cstr( cfg, "cnc", fd_wksp_cstr_laddr( _cnc, tmp ) ), "increase pod_max?" );
- TEST( fd_pod_insert_cstr( cfg, "meta", fd_wksp_cstr_laddr( _meta, tmp ) ), "increase pod_max?" );
- TEST( fd_pod_insert_cstr( cfg, "io", fd_wksp_cstr_laddr( _io, tmp ) ), "increase pod_max?" );
- TEST( fd_pod_insert_cstr( cfg, "line", fd_wksp_cstr_laddr( _line, tmp ) ), "increase pod_max?" );
- TEST( fd_pod_insert_cstr( cfg, "ele", fd_wksp_cstr_laddr( _ele, tmp ) ), "increase pod_max?" );
- TEST( fd_pod_insert_cstr( cfg, "obj", fd_wksp_cstr_laddr( _obj, tmp ) ), "increase pod_max?" );
-
- TEST( fd_pod_insert_ulong( cfg, "vinyl_footprint", vinyl_footprint ), "increase pod_max?" );
- TEST( fd_pod_insert_ulong( cfg, "cnc_footprint", cnc_footprint ), "increase pod_max?" );
- TEST( fd_pod_insert_ulong( cfg, "meta_footprint", meta_footprint ), "increase pod_max?" );
- TEST( fd_pod_insert_ulong( cfg, "io_footprint", io_footprint ), "increase pod_max?" );
- TEST( fd_pod_insert_ulong( cfg, "line_footprint", line_footprint ), "increase pod_max?" );
- TEST( fd_pod_insert_ulong( cfg, "ele_footprint", ele_footprint ), "increase pod_max?" );
- TEST( fd_pod_insert_ulong( cfg, "obj_footprint", obj_footprint ), "increase pod_max?" );
-
- TEST( fd_pod_insert_ulong( cfg, "spad_max", spad_max ), "increase pod_max?" );
- TEST( fd_pod_insert_ulong( cfg, "pair_max", pair_max ), "increase pod_max?" );
- TEST( fd_pod_insert_ulong( cfg, "line_max", line_max ), "increase pod_max?" );
- TEST( fd_pod_insert_ulong( cfg, "async_min", async_min ), "increase pod_max?" );
- TEST( fd_pod_insert_ulong( cfg, "async_max", async_max ), "increase pod_max?" );
- TEST( fd_pod_insert_ulong( cfg, "part_thresh", part_thresh ), "increase pod_max?" );
- TEST( fd_pod_insert_ulong( cfg, "gc_thresh", gc_thresh ), "increase pod_max?" );
- TEST( fd_pod_insert_int ( cfg, "gc_eager", gc_eager ), "increase pod_max?" );
- TEST( fd_pod_insert_int ( cfg, "style", style ), "increase pod_max?" );
- TEST( fd_pod_insert_int ( cfg, "level", level ), "increase pod_max?" );
-
- /* Tell the operator where the pod is */
- /* FIXME: consider putting the config pod in a normal page named
- shmem region or a flat file instead? Probably easier to pass
- between applications than a wksp gaddr. */
-
- printf( "%s\n", fd_wksp_cstr_laddr( _pod, tmp ) );
-
- /* Clean up */
-
- if( cfg!=pod ) TEST( fd_pod_compact( cfg, 1 ), "internal error" );
-
- TEST( fd_pod_leave( pod )==_pod, "internal error" );
-
- TEST( !fd_wksp_detach( wksp ), "internal error" );
-
-# undef TEST
-
- FD_LOG_NOTICE(( "%i: %s %s %lu %lu: success", cnt, cmd, mem, pair_max, GiB_max ));
- SHIFT(3);
-
- } else if( !strcmp( cmd, "delete" ) ) {
-
- if( FD_UNLIKELY( argc<1 ) )
- FD_LOG_ERR(( "%i: %s: too few arguments\n\tDo %s help for help", cnt, cmd, bin ));
-
- char const * cstr = argv[0];
-
-# define TEST( c, msg ) do { \
- if( FD_UNLIKELY( !(c) ) ) \
- FD_LOG_ERR(( "%i: %s %s: FAIL %s (%s)\n\tDo %s help for help", \
- cnt, cmd, cstr, #c, (msg), bin )); \
- } while(0)
-
- uchar const * pod = fd_pod_join( fd_wksp_map( cstr ) ); /* logs details */
- TEST( pod, "unable to join pod" );
-
- uchar const * cfg;
- if( !cfg_path ) cfg = pod;
- else {
- cfg = fd_pod_query_subpod( pod, cfg_path );
- TEST( cfg, "cfg not found at cfg_path" );
- }
-
- fd_wksp_cstr_free( fd_pod_query_cstr( cfg, "obj", NULL ) );
- fd_wksp_cstr_free( fd_pod_query_cstr( cfg, "ele", NULL ) );
- fd_wksp_cstr_free( fd_pod_query_cstr( cfg, "line", NULL ) );
- fd_wksp_cstr_free( fd_pod_query_cstr( cfg, "io", NULL ) );
- fd_wksp_cstr_free( fd_pod_query_cstr( cfg, "meta", NULL ) );
- fd_wksp_cstr_free( fd_pod_query_cstr( cfg, "cnc", NULL ) );
- fd_wksp_cstr_free( fd_pod_query_cstr( cfg, "vinyl", NULL ) );
-
- fd_wksp_unmap( fd_pod_leave( pod ) );
-
- fd_wksp_cstr_free( cstr );
-
- FD_LOG_NOTICE(( "%i: %s %s: success", cnt, cmd, cstr ));
- SHIFT(1);
-
- } else if( !strcmp( cmd, "exec" ) ) {
-
- err = fd_vinyl_main( argc, argv );
- break;
-
- } else {
-
- FD_LOG_ERR(( "%i: %s: unknown command\n\t"
- "Do %s help for help", cnt, cmd, bin ));
-
- }
- cnt++;
- }
-
- if( FD_UNLIKELY( cnt<1 ) ) FD_LOG_NOTICE(( "processed %i commands\n\tDo %s help for help", cnt, bin ));
- else FD_LOG_NOTICE(( "processed %i commands", cnt ));
-
-# undef SHIFT
-
- fd_halt();
- return err;
-}
diff --git a/src/vinyl/fd_vinyl_ctl_help b/src/vinyl/fd_vinyl_ctl_help
deleted file mode 100644
index e8a821cfd13..00000000000
--- a/src/vinyl/fd_vinyl_ctl_help
+++ /dev/null
@@ -1,91 +0,0 @@
-
-Usage: fd_vinyl_ctl [cmd] [cmd args] [cmd] [cmd args] ...
-
-Commands are:
-
-help
-- Prints this message.
-
-set [key] [val]
-- Set the [key] to [val]. keys include:
-
- key | val type | default | notes
- ------------------+----------+--------------------+-----------------------------------------------------------
- wksp_tag | ulong | 0xfdc12113c597a600 | allocation tag used for vinyl wksp allocs
- pod_max | ulong | 4KiB | byte size for the vinyl pod
- cfg_path | cstr | NULL | path to the vinyl cfg in the vinyl pod (NULL use pod root)
- cnc_app_sz | ulong | vinyl tile minimum | -
- spad_max | ulong | 32MiB | io append scratch pad byte size
- exec_max | ulong | 2UL | vinyl tile max request executed per run loop iteration
- async_min | ulong | 2UL | min run loop iterations per async handling
- async_max | ulong | 4UL | max run loop iterations per async handling
- part_thresh | ulong | 1GiB | target partition size
- gc_thresh | ulong | 8GiB | min bstream past size to consider compaction
- gc_eager | int | 2 | target less than <~2^-gc_eager garbage items
- style | cstr | "lz4" | preferred bstream pair encoding ("raw" and "lz4" are supported)
- level | int | 1 | preferred reset level at startup (0 soft, 1 hard)
- obj_footprint_avg | ulong | 1KiB + 9B | marginal byte cost of object in the data cache
-
-alloc-memory wksp page_cnt page_sz cpu_idx_seq mode
-
-- Create a workspace named wksp from page_cnt page_sz pages distributed
- over numa nodes near cpu_idx_seq. The region will have the unix
- permissions specified by mode (assumed octal). See fd_shmem_ctl help
- for more details how the pages will be distributed over numa nodes.
-
-- The purpose of this is to create a workspace which can be used to hold
- the memory regions needed by a vinyl tile. Since this is a normal
- workspace though, it can be used for other application allocations.
- (Or vice versa.)
-
-- This is identical to fd_wksp_ctl new and provided here as a
- convenience.
-
-free-memory wksp
-
-- Delete a workspace named wksp. If multiple shmem regions exist with
- same name, try to use the shmem region backed by the largest page size
-
-- This is identical to fd_wksp_ctl delete and provided here as a
- convenience and for symmetry with alloc-memory.
-
-alloc-storage path sz_in_GiB mode
-
-- Create a file at path with the given sz filled with zeros. The file
- will have the permissions specified by mode (assumed octal). Fails
- if the file already exists.
-
-- The purpose of this is create a file which can be used to hold
- the bstream needed by a vinyl tile. There is no requirement to use
- this though.
-
-- This is provided as a convenience. The user is free to create bstream
- storage via whatever means suits them. This includes using
- pre-existing block devices directly.
-
-free-storage path
-
-- Destroys the file at path. Fails if the file could not be destroyed.
-
-- This is provided here as a convenience and for symmetry with
- alloc-storage.
-
-new wksp pair_max sz_in_GiB
-
-- Allocate memory resources needed for a vinyl tile in the workspace
- wksp. The vinyl tile will be able to track up to pair_max key-val
- pairs total. The target amount of wksp memory to devote to the vinyl
- tile should be sz_in_GiB.
-
-- One of these memory resources will be a pod. The locations of these
- memory resources will be stashed in in this pod as wksp cstr gaddrs
- (along with other vinyl tile configurations).
-
-- The wksp cstr of this pod will be printed to stdout.
-
-delete pod
-
-- Frees all memory resources allocated for a vinyl tile. The location
- of the resources are given by pod. This includes freeing the pod
- itself. The tile should not be running when this is done.
-
diff --git a/src/vinyl/fd_vinyl_exec.c b/src/vinyl/fd_vinyl_exec.c
deleted file mode 100644
index 6ef4dda20fd..00000000000
--- a/src/vinyl/fd_vinyl_exec.c
+++ /dev/null
@@ -1,696 +0,0 @@
-#include "fd_vinyl.h"
-#include "../util/pod/fd_pod.h"
-#include
-#include
-#include
-#include
-
-/* fd_vinyl_client holds the per-client state: the request / completion
- channels, the receive cursor into the rq, the window of local
- addresses that client global addresses may map into, and the
- acquisition quota accounting. */
-
-struct fd_vinyl_client {
- fd_vinyl_rq_t * rq; /* Channel for requests from this client (could be shared by multiple vinyl instances) */
- fd_vinyl_cq_t * cq; /* Channel for completions from this vinyl instance to this client
- (could be shared by multiple receivers of completions from this vinyl instance). */
- ulong burst_max; /* Max requests to receive from this client at a time */
- ulong seq; /* Sequence number of the next request to receive in the rq */
- ulong link_id; /* Identifies requests from this client to this vinyl instance in the rq */
- ulong laddr0; /* A valid non-zero gaddr from this client maps to the vinyl instance's laddr laddr0 + gaddr ... */
- ulong laddr1; /* ... and thus is in (laddr0,laddr1). A zero gaddr maps to laddr NULL. */
- ulong quota_rem; /* Num of remaining acquisitions this client is allowed on this vinyl instance, in [0,quota_max] */
- ulong quota_max; /* Max quota */
-};
-
-typedef struct fd_vinyl_client fd_vinyl_client_t;
-
-/* MAP_REQ_GADDR maps a request global address gaddr to an array of
- n T's in the local address space as a T * pointer. If the result
- is not properly aligned or the entire range does not completely fall
- within the shared region with the client, returns NULL. Likewise,
- gaddr 0 maps to NULL. Assumes sizeof(T)*(n) does not overflow (which
- is true as n is at most batch_cnt, which is at most 2^32, and
- sizeof(T) is at most 40).
-
- This macro is not self-contained: it expects ulong variables named
- client_laddr0 and client_laddr1 (the bounds of the client's shared
- region) to be in scope where it is expanded. */
-
-#define MAP_REQ_GADDR( gaddr, T, n ) ((T *)fd_vinyl_laddr( (gaddr), alignof(T), sizeof(T)*(n), client_laddr0, client_laddr1 ))
-
-/* fd_vinyl_laddr translates the client global address req_gaddr into a
-   local address. The candidate region is footprint bytes starting at
-   client_laddr0 + req_gaddr. Returns the start of that region if
-   req_gaddr is non-zero, the start is a multiple of align and the
-   region lies within [client_laddr0,client_laddr1] without wrapping.
-   Returns NULL otherwise. Branchless. */
-
-FD_FN_CONST static inline void *
-fd_vinyl_laddr( ulong req_gaddr,
-                ulong align,
-                ulong footprint,
-                ulong client_laddr0,
-                ulong client_laddr1 ) {
-  ulong lo = client_laddr0 + req_gaddr; /* first byte of the mapped region */
-  ulong hi = lo + footprint;            /* one past the last byte of the mapped region */
-
-  int is_nonzero = !!req_gaddr;
-  int is_aligned = fd_ulong_is_aligned( lo, align );
-
-  /* The ordered chain below also rejects ranges where either addition
-     above wrapped around. */
-
-  int is_inside = (client_laddr0<=lo) & (lo<=hi) & (hi<=client_laddr1);
-
-  return (void *)fd_ulong_if( is_nonzero & is_aligned & is_inside, lo, 0UL );
-}
-
-/* FIXME: STASH THESE IN THE VINYL TOO? */
-#define FD_VINYL_CLIENT_MAX (1024UL)
-#define FD_VINYL_REQ_MAX (1024UL)
-
-void
-fd_vinyl_exec( fd_vinyl_t * vinyl ) {
-
- /* Unpack shared objects */
-
- fd_cnc_t * cnc = vinyl->cnc;
- fd_vinyl_io_t * io = vinyl->io;
- fd_vinyl_line_t * line = vinyl->line;
- fd_vinyl_meta_t * meta = vinyl->meta;
- fd_vinyl_data_t * data = vinyl->data;
-
- /* Unpack config */
-
- ulong line_cnt = vinyl->line_cnt;
- ulong pair_max = vinyl->pair_max;
- ulong async_min = vinyl->async_min;
- ulong async_max = vinyl->async_max;
-
- /* Unpack cnc */
-
- if( FD_UNLIKELY( fd_cnc_signal_query( cnc )!=FD_VINYL_CNC_SIGNAL_BOOT ) ) {
- FD_LOG_WARNING(( "cnc not booting (restarting after an unclean termination?); forcing to boot and attempting to continue" ));
- fd_cnc_signal( cnc, FD_VINYL_CNC_SIGNAL_BOOT );
- }
-
- fd_vinyl_cmd_t * cmd = (fd_vinyl_cmd_t *)fd_cnc_app_laddr( cnc );
- ulong * diag = (ulong *)(cmd+1);
-
- /* Unpack io */
-
- ulong io_seed = fd_vinyl_io_seed( io );
-
- /* Unpack meta */
-
- fd_vinyl_meta_ele_t * ele0 = meta->ele;
- ulong ele_max = meta->ele_max;
- ulong meta_seed = meta->seed;
- ulong * lock = meta->lock;
- int lock_shift = meta->lock_shift;
-
- /* Unpack data */
-
- ulong data_laddr0 = (ulong)data->laddr0;
- fd_vinyl_data_vol_t const * vol = data->vol;
- ulong vol_cnt = data->vol_cnt;
-
- /* Connected clients */
-
- fd_vinyl_client_t _client[ FD_VINYL_CLIENT_MAX ];
- ulong client_cnt = 0UL; /* In [0,client_max) */
- ulong client_idx = 0UL; /* If client_cnt>0, next client to poll for requests, d/c otherwise */
-
- ulong quota_free = line_cnt - 1UL;
-
- /* Received requests */
-
- fd_vinyl_req_t _req[ FD_VINYL_REQ_MAX ];
- ulong req_head = 0UL; /* Requests [0,req_head) have been processed */
- ulong req_tail = 0UL; /* Requests [req_head,req_tail) are pending */
- /* Requests [req_tail,ULONG_MAX) have not been received */
- ulong burst_free = FD_VINYL_REQ_MAX;
- ulong exec_max = 0UL;
-
- /* accum_dead_cnt is the number of dead blocks that have been
- written since the last partition block.
-
- accum_move_cnt is the number of move blocks that have been
- written since this last partition block.
-
- accum_garbage_cnt / sz is the number of items / bytes garbage in
- the bstream that have accumulated since the last time we compacted
- the bstream. We use this to estimate the number of rounds of
- compaction to do in async handling.
-
- accum_drop_link is the number of requests that were silently
- dropped because the request link_id did not match the client's
- link_id.
-
- accum_drop_comp is the number of requests that were silently
- dropped because an out-of-band completion was requested to be sent
- to an unmappable client address.
-
- accumt_req_full is the number of times we detected the pending
- request queue being completely full. */
-
- ulong accum_dead_cnt = 0UL;
- ulong accum_move_cnt = 0UL;
- ulong accum_garbage_cnt = 0UL;
- ulong accum_garbage_sz = 0UL;
- ulong accum_drop_link = 0UL;
- ulong accum_drop_comp = 0UL;
- ulong accum_cache_hit = 0UL;
-
- ulong seq_part = fd_vinyl_io_seq_present( io );
-
- /* Run */
-
- fd_cnc_signal( cnc, FD_VINYL_CNC_SIGNAL_RUN );
-
- ulong async_rem = 1UL;
-
- for(;;) {
-
- /* Process background tasks this iteration if necessary */
-
- if( FD_UNLIKELY( !(--async_rem) ) ) {
- long now = fd_log_wallclock();
- async_rem = async_min + (fd_ulong_hash( (ulong)now ) % (async_max-async_min+1UL)); /* FIXME: FASTER ALGO */
-
- fd_cnc_heartbeat( cnc, now );
-
- /* If we've written enough to justify appending a parallel
- recovery partition, append one. */
-
- ulong seq_future = fd_vinyl_io_seq_future( io );
- if( FD_UNLIKELY( (seq_future - seq_part) > vinyl->part_thresh ) ) {
-
- ulong seq = fd_vinyl_io_append_part( io, seq_part, accum_dead_cnt, accum_move_cnt, NULL, 0UL );
- FD_CRIT( fd_vinyl_seq_eq( seq, seq_future ), "corruption detected" );
- seq_part = seq + FD_VINYL_BSTREAM_BLOCK_SZ;
-
- accum_dead_cnt = 0UL;
- accum_move_cnt = 0UL;
-
- accum_garbage_cnt++;
- accum_garbage_sz += FD_VINYL_BSTREAM_BLOCK_SZ;
-
- fd_vinyl_io_commit( io, FD_VINYL_IO_FLAG_BLOCKING );
-
- }
-
- diag[ FD_VINYL_DIAG_DROP_LINK ] += accum_drop_link; accum_drop_link = 0UL;
- diag[ FD_VINYL_DIAG_DROP_COMP ] += accum_drop_comp; accum_drop_comp = 0UL;
- diag[ FD_VINYL_DIAG_CACHE_HIT ] += accum_cache_hit; accum_cache_hit = 0UL;
-
- /* Let the number of items of garbage generated since the last
- compaction be accum_garbage_cnt and let the steady state
- average number of live / garbage items in the bstream's past be
- L / G (i.e. L is the average value of pair_cnt). The average
- number of pieces of garbage collected per garbage collection round
- is thus G / (L + G). If we do compact_max rounds of garbage
- collection this async handling, we expect to collect
-
- compact_max G / (L + G)
-
- items of garbage on average. To make sure we collect garbage
- faster than we generate it on average, we then require:
-
- accum_garbage_cnt <~ compact_max G / (L + G)
- -> compact_max >~ (L + G) accum_garbage_cnt / G
-
- Let 2^-gc_eager be the maximum fraction of items in the
- bstream's past we are willing to tolerate as garbage on average.
- We then have G = 2^-gc_eager (L + G). This implies:
-
- -> compact_max >~ accum_garbage_cnt 2^gc_eager
-
- When accum_garbage_cnt is 0, we use a compact_max of 1 to do
- compaction rounds at a minimum rate all the time. This allows
- transients (e.g. a sudden change to new steady state
- equilibrium, temporary disabling of garbage collection at key
- times for highest performance, etc) and unaccounted zero
- padding garbage to be absorbed when nothing else is going on. */
-
- int gc_eager = vinyl->gc_eager;
- if( FD_LIKELY( gc_eager>=0 ) ) {
-
- /* Saturating wide left shift */
- ulong overflow = (accum_garbage_cnt >> (63-gc_eager) >> 1); /* sigh ... avoid wide shift UB */
- ulong compact_max = fd_ulong_max( fd_ulong_if( !overflow, accum_garbage_cnt << gc_eager, ULONG_MAX ), 1UL );
-
- /**/ accum_garbage_cnt = 0UL;
- vinyl->garbage_sz += accum_garbage_sz; accum_garbage_sz = 0UL;
-
- fd_vinyl_compact( vinyl, compact_max );
-
- }
-
- ulong signal = fd_cnc_signal_query( cnc );
- if( FD_UNLIKELY( signal!=FD_VINYL_CNC_SIGNAL_RUN ) ) {
- if( FD_UNLIKELY( signal==FD_VINYL_CNC_SIGNAL_HALT ) ) break;
-
- switch( signal ) {
-
- case FD_VINYL_CNC_SIGNAL_SYNC: {
- fd_vinyl_io_sync( io, FD_VINYL_IO_FLAG_BLOCKING );
- break;
- }
-
- case FD_VINYL_CNC_SIGNAL_GET: {
- ulong old;
- int err = FD_VINYL_SUCCESS;
- switch( cmd->get.opt ) {
- case FD_VINYL_OPT_PART_THRESH: old = vinyl->part_thresh; break;
- case FD_VINYL_OPT_GC_THRESH: old = vinyl->gc_thresh; break;
- case FD_VINYL_OPT_GC_EAGER: old = (ulong)(long)vinyl->gc_eager; break;
- case FD_VINYL_OPT_STYLE: old = (ulong)(uint)vinyl->style; break;
- default: old = 0UL; err = FD_VINYL_ERR_INVAL; break;
- }
- cmd->get.val = old;
- cmd->get.err = err;
- break;
- }
-
- case FD_VINYL_CNC_SIGNAL_SET: { /* FIXME: ADD VALIDATION TO SET VALUES FOR OPT_GC_EAGER AND OPT_STYLE */
- ulong new = cmd->set.val;
- ulong old;
- int err = FD_VINYL_SUCCESS;
- switch( cmd->set.opt ) {
- case FD_VINYL_OPT_PART_THRESH: old = vinyl->part_thresh; vinyl->part_thresh = new; break;
- case FD_VINYL_OPT_GC_THRESH: old = vinyl->gc_thresh; vinyl->gc_thresh = new; break;
- case FD_VINYL_OPT_GC_EAGER: old = (ulong)(long)vinyl->gc_eager; vinyl->gc_eager = (int)new; break;
- case FD_VINYL_OPT_STYLE: old = (ulong)(uint)vinyl->style; vinyl->style = (int)new; break;
- default: old = 0UL; err = FD_VINYL_ERR_INVAL; break;
- }
- cmd->set.val = old;
- cmd->set.err = err;
- break;
- }
-
- case FD_VINYL_CNC_SIGNAL_CLIENT_JOIN: {
- int err;
-
- ulong link_id = cmd->join.link_id;
- ulong burst_max = cmd->join.burst_max;
- ulong quota_max = cmd->join.quota_max;
- char const * _rq = cmd->join.rq;
- char const * _cq = cmd->join.cq;
- char const * _wksp = cmd->join.wksp;
-
- if( FD_UNLIKELY( client_cnt>=FD_VINYL_CLIENT_MAX ) ) {
- FD_LOG_WARNING(( "Too many clients (increase FD_VINYL_CLIENT_MAX)" ));
- err = FD_VINYL_ERR_FULL;
- goto join_done;
- }
-
- if( FD_UNLIKELY( burst_max > burst_free ) ) {
- FD_LOG_WARNING(( "Too large burst_max (increase FD_VINYL_REQ_MAX or decrease burst_max)" ));
- err = FD_VINYL_ERR_FULL;
- goto join_done;
- }
-
- if( FD_UNLIKELY( quota_max > fd_ulong_min( quota_free, FD_VINYL_COMP_QUOTA_MAX ) ) ) {
- FD_LOG_WARNING(( "Too large quota_max (increase line_cnt or decrease quota_max)" ));
- err = FD_VINYL_ERR_FULL;
- goto join_done;
- }
-
- for( ulong client_idx=0UL; client_idxgaddr_hi; /* FIXME: HOW TO GET THIS CLEANLY */
- _client[ client_cnt ].quota_rem = quota_max;
- _client[ client_cnt ].quota_max = quota_max;
- client_cnt++;
-
- quota_free -= quota_max;
- burst_free -= burst_max;
-
- /* Every client_cnt run loop iterations we receive at most:
-
- sum_clients recv_max = FD_VINYL_REQ_MAX - burst_free
-
- requests. To guarantee we process requests fast enough
- that we never overrun our receive queue, under maximum
- client load, we need to process:
-
- sum_clients recv_max / client_cnt
-
- requests per run loop iteration. We thus set exec_max
- to the ceil sum_clients recv_max / client_cnt. */
-
- exec_max = (FD_VINYL_REQ_MAX - burst_free + client_cnt - 1UL) / client_cnt;
-
- err = FD_VINYL_SUCCESS;
-
- join_done:
- cmd->join.err = err;
- break;
- }
-
- case FD_VINYL_CNC_SIGNAL_CLIENT_LEAVE: {
- int err;
-
- ulong link_id = cmd->leave.link_id;
-
- for( ulong client_idx=0UL; client_idxleave.err = err;
- break;
- }
-
- default: {
- FD_LOG_WARNING(( "unknown signal received (%lu); ignoring", signal ));
- break;
- }
-
- }
-
- fd_cnc_signal( cnc, FD_VINYL_CNC_SIGNAL_RUN );
- }
- }
-
- /* Receive requests from clients */
-
- if( FD_LIKELY( client_cnt ) ) {
-
- /* Select client to poll this run loop iteration */
-
- client_idx = fd_ulong_if( client_idx+1ULrq;
- ulong seq = client->seq;
- ulong burst_max = client->burst_max;
- ulong link_id = client->link_id;
-
- /* Enqueue up to burst_max requests from this client into the
- local request queue. Using burst_max << FD_VINYL_REQ_MAX
- allows applications to prevent a bursty client from starving
- other clients of resources while preserving the spatial and
- temporal locality of reasonably sized O(burst_max) bursts from
- an individual client in processing below. Each run loop
- iteration can enqueue up to burst_max requests per iterations. */
-
- for( ulong recv_rem=fd_ulong_min( FD_VINYL_REQ_MAX-(req_tail-req_head), burst_max ); recv_rem; recv_rem-- ) {
- fd_vinyl_req_t * req = _req + (req_tail & (FD_VINYL_REQ_MAX-1UL));
-
- long diff = fd_vinyl_rq_recv( rq, seq, req );
-
- if( FD_LIKELY( diff>0L ) ) break; /* No requests waiting in rq at this time */
-
- if( FD_UNLIKELY( diff ) ) FD_LOG_CRIT(( "client overran request queue" ));
-
- seq++;
-
- /* We got the next request. Decide if we should accept it.
-
- Specifically, we ignore requests whose link_id doesn't match
- link_id (e.g. an unknown link_id or one that matches a different
- client's link_id ... we don't know where or even if it
- is safe to send the completion). Even if the request provided an
- out-of-band location to send the completion (comp_gaddr!=0),
- we have no reason to trust it given the mismatch.
-
- This also gives a mechanism for a client use a single rq to
- send requests to multiple vinyl instances ... the client
- should use a different link_id for each vinyl instance. Each
- vinyl instance will quickly filter out the requests not
- addressed to it.
-
- Since we know the client_idx at this point, given a matching
- link_id, we stash the client_idx in the pending req link_id
- to eliminate the need to maintain a link_id<>client_idx map
- in the execution loop below. */
-
- if( FD_UNLIKELY( req->link_id!=link_id ) ) {
- accum_drop_link++;
- continue;
- }
-
- req->link_id = client_idx;
-
- req_tail++;
- }
-
- client->seq = seq;
- }
-
- /* Execute received requests */
-
- for( ulong exec_rem=fd_ulong_min( req_tail-req_head, exec_max ); exec_rem; exec_rem-- ) {
- fd_vinyl_req_t * req = _req + ((req_head++) & (FD_VINYL_REQ_MAX-1UL));
-
- /* Determine the client that sent this request and unpack the
- completion fields. We ignore requests with non-NULL but
- unmappable out-of-band completion because we can't send the
- completion in the expected manner and, in lieu of that, the
- receivers aren't expecting any completion to come via the cq
- (if any). Note that this implies requests that don't produce a
- completion (e.g. FETCH and FLUSH) need to either provide NULL
- or a valid non-NULL location for comp_gaddr to pass this
- validation (this is not a burden practically). */
-
- ulong req_id = req->req_id;
- ulong client_idx = req->link_id; /* See note above about link_id / client_idx conversion */
- ulong batch_cnt = (ulong)req->batch_cnt;
- ulong comp_gaddr = req->comp_gaddr;
-
- fd_vinyl_client_t * client = _client + client_idx;
-
- fd_vinyl_cq_t * cq = client->cq;
- ulong link_id = client->link_id;
- ulong client_laddr0 = client->laddr0;
- ulong client_laddr1 = client->laddr1;
- ulong quota_rem = client->quota_rem;
-
- FD_CRIT( quota_rem<=client->quota_max, "corruption detected" );
-
- fd_vinyl_comp_t * comp = MAP_REQ_GADDR( comp_gaddr, fd_vinyl_comp_t, 1UL );
- if( FD_UNLIKELY( (!comp) & (!!comp_gaddr) ) ) {
- accum_drop_comp++;
- continue;
- }
-
- int comp_err = 1;
- ulong fail_cnt = 0UL;
-
- ulong read_cnt = 0UL;
- ulong append_cnt = 0UL;
-
- switch( req->type ) {
-
-# include "fd_vinyl_case_acquire.c"
-# include "fd_vinyl_case_release.c"
-# include "fd_vinyl_case_erase.c"
-# include "fd_vinyl_case_move.c"
-# include "fd_vinyl_case_fetch.c"
-# include "fd_vinyl_case_flush.c"
-# include "fd_vinyl_case_try.c"
-# include "fd_vinyl_case_test.c"
-
- default:
- comp_err = FD_VINYL_ERR_INVAL;
- break;
- }
-
- for( ; read_cnt; read_cnt-- ) {
- fd_vinyl_io_rd_t * _rd; /* avoid pointer escape */
- fd_vinyl_io_poll( io, &_rd, FD_VINYL_IO_FLAG_BLOCKING );
- fd_vinyl_io_rd_t * rd = _rd;
-
- fd_vinyl_data_obj_t * obj = (fd_vinyl_data_obj_t *) rd->ctx;
- ulong seq = rd->seq; (void)seq;
- fd_vinyl_bstream_phdr_t * cphdr = (fd_vinyl_bstream_phdr_t *)rd->dst;
- ulong cpair_sz = rd->sz; (void)cpair_sz;
-
- fd_vinyl_data_obj_t * cobj = (fd_vinyl_data_obj_t *)fd_ulong_align_dn( (ulong)rd, FD_VINYL_BSTREAM_BLOCK_SZ );
-
- FD_CRIT( cphdr==fd_vinyl_data_obj_phdr( cobj ), "corruption detected" );
-
- ulong cpair_ctl = cphdr->ctl;
-
- int cpair_type = fd_vinyl_bstream_ctl_type ( cpair_ctl );
- int cpair_style = fd_vinyl_bstream_ctl_style( cpair_ctl );
- ulong cpair_val_esz = fd_vinyl_bstream_ctl_sz ( cpair_ctl );
-
- FD_CRIT( cpair_type==FD_VINYL_BSTREAM_CTL_TYPE_PAIR, "corruption detected" );
- FD_CRIT( cpair_sz ==fd_vinyl_bstream_pair_sz( cpair_val_esz ), "corruption detected" );
-
- schar * rd_err = cobj->rd_err;
-
- FD_CRIT ( rd_err, "corruption detected" );
- FD_ALERT( fd_vinyl_data_is_valid_obj( obj, vol, vol_cnt ), "corruption detected" );
-
- ulong line_idx = obj->line_idx;
-
- FD_CRIT( line_idxinfo.val_sz;
-
- FD_CRIT( val_sz <= FD_VINYL_VAL_MAX, "corruption detected" );
- FD_CRIT( fd_vinyl_data_obj_val_max( obj ) >= val_sz, "corruption detected" );
-
- if( FD_LIKELY( cpair_style==FD_VINYL_BSTREAM_CTL_STYLE_RAW ) ) {
-
- FD_CRIT( obj==cobj, "corruption detected" );
- FD_CRIT( cpair_val_esz==val_sz, "corruption detected" );
-
- } else {
-
- char const * cval = (char const *)fd_vinyl_data_obj_val( cobj );
- ulong cval_sz = fd_vinyl_bstream_ctl_sz( cpair_ctl );
-
- ulong _val_sz = (ulong)LZ4_decompress_safe( cval, val, (int)cval_sz, (int)val_sz );
- if( FD_UNLIKELY( _val_sz!=val_sz ) ) FD_LOG_CRIT(( "LZ4_decompress_safe failed" ));
-
- fd_vinyl_data_free( data, cobj );
-
- fd_vinyl_bstream_phdr_t * phdr = fd_vinyl_data_obj_phdr( obj );
-
- phdr->ctl = fd_vinyl_bstream_ctl( FD_VINYL_BSTREAM_CTL_TYPE_PAIR, FD_VINYL_BSTREAM_CTL_STYLE_RAW, val_sz );
- phdr->key = cphdr->key;
- phdr->info = cphdr->info;
-
- }
-
- obj->rd_active = (short)0;
-
- /* Fill any trailing region with zeros (there is at least
- FD_VINYL_BSTREAM_FTR_SZ) and tell the client the item was
- successfully processed. */
-
- memset( val + val_sz, 0, fd_vinyl_data_szc_obj_footprint( (ulong)obj->szc )
- - (sizeof(fd_vinyl_data_obj_t) + sizeof(fd_vinyl_bstream_phdr_t) + val_sz) );
-
- FD_COMPILER_MFENCE();
- *rd_err = (schar)FD_VINYL_SUCCESS;
- FD_COMPILER_MFENCE();
-
- }
-
- if( FD_UNLIKELY( append_cnt ) ) fd_vinyl_io_commit( io, FD_VINYL_IO_FLAG_BLOCKING );
-
- if( FD_LIKELY( comp_err<=0 ) ) fd_vinyl_cq_send( cq, comp, req_id, link_id, comp_err, batch_cnt, fail_cnt, quota_rem );
-
- client->quota_rem = quota_rem;
-
- }
-
- } /* run loop */
-
- ulong discard_cnt = req_tail - req_head;
-
- /* Append the final partition and sync so we can resume with a fast
- parallel recovery */
-
- fd_vinyl_io_append_part( io, seq_part, accum_dead_cnt, accum_move_cnt, NULL, 0UL );
-
- accum_dead_cnt = 0UL;
- accum_move_cnt = 0UL;
-
- accum_garbage_cnt++;
- accum_garbage_sz += FD_VINYL_BSTREAM_BLOCK_SZ;
-
- fd_vinyl_io_commit( io, FD_VINYL_IO_FLAG_BLOCKING );
-
- fd_vinyl_io_sync( io, FD_VINYL_IO_FLAG_BLOCKING );
-
- /* Drain outstanding accumulators */
-
- /**/ accum_garbage_cnt = 0UL;
- vinyl->garbage_sz += accum_garbage_sz; accum_garbage_sz = 0UL;
-
- diag[ FD_VINYL_DIAG_DROP_LINK ] += accum_drop_link; accum_drop_link = 0UL;
- diag[ FD_VINYL_DIAG_DROP_COMP ] += accum_drop_comp; accum_drop_comp = 0UL;
- diag[ FD_VINYL_DIAG_CACHE_HIT ] += accum_cache_hit; accum_cache_hit = 0UL;
-
- /* Disconnect from the clients */
-
- ulong released_cnt = 0UL;
- for( ulong client_idx=0UL; client_idxpart.seq0;
- seq0 = fd_vinyl_seq_gt( seq0, seq_past ) ? seq0 : seq_past;
-
-# if 0
- /* Compute the maximum number of deads the portion of this partition
- in the bstream's past that could produce as the lesser the number
- of deads reported in the partition and the number of blocks in
- the partition. Similarly for move (note that each move makes two
- tombstone but also requires at least two blocks ... so moves also
- make, at most, 1 tombstone per block on average). */
-
- ulong part_sz = seq1 - seq0 - FD_VINYL_BSTREAM_BLOCK_SZ; /* exclude trailing part block for below */
-
- ulong dead_max = fd_ulong_min( block->part.dead_cnt, part_sz );
- ulong move_max = fd_ulong_min( block->part.move_cnt, part_sz );
-
- tstone_req += fd_ulong_min( dead_max + 2UL*move_max, part_sz );
-# endif
-
- /* Move to the previous partition */
-
- seq1 = seq0;
- }
-
- /* We seem to have a valid partitioning for parallel recovery */
-
-# if 0
- if( FD_UNLIKELY( tstone_req > tstone_max ) ) {
- FD_LOG_WARNING(( "insufficient scratch space for parallel recovery"
- "\n\tincrease data cache size"
- "\n\tfalling back to serial recovery" ));
- return FD_VINYL_ERR_FULL;
- }
-# endif
-
- return FD_VINYL_SUCCESS;
-}
-
-/* fd_vinyl_recover_line_task flushes all vinyl lines in parallel
- and resets the eviction priority sequence. */
-
-static FD_FOR_ALL_BEGIN( fd_vinyl_recover_line_task, 1L ) {
- fd_vinyl_t * vinyl = (fd_vinyl_t *)arg[0];
-
- fd_vinyl_line_t * line = vinyl->line;
- ulong line_cnt = vinyl->line_cnt;
-
- ulong line0 = (ulong)block_i0;
- ulong line1 = (ulong)block_i1;
-
- for( ulong line_idx=line0; line_idxmeta->lock;
-
- ulong reclaim_cnt = 0UL;
-
- for( long lock_idx=block_i0; lock_idxmeta->ele;
-
- fd_vinyl_meta_ele_t init_ele[1];
- memset( init_ele, 0, sizeof(fd_vinyl_meta_ele_t) );
- init_ele->line_idx = ULONG_MAX;
-
- for( long ele_idx=block_i0; ele_idxmeta->lock;
-
- for( long lock_idx=block_i0; lock_idxphdr.ctl ) ) {
-
- /* There is no version or tstone for pair key in the meta currently.
- Insert a tstone at seq for key so any versions or tstone for pair
- key encountered later in parallel recovery can tell if they are
- before or after this tstone. Because we don't know if there will
- version of key after this, we need to append key to the tstone
- array. */
-
- //pair_cnt unchanged
- //garbage_sz unchanged
- (*_tstone_cnt)++;
-
- ele->memo = fd_vinyl_meta_query_memo( query );
- ele->phdr.ctl = 1UL;
- ele->phdr.key = *key;
- //ele->phdr.info = d/c
- ele->line_idx = ULONG_MAX - 1UL; // tstone
- ele->seq = seq;
-
- fd_vinyl_meta_publish( query );
-
- } else if( FD_LIKELY( fd_vinyl_seq_lt( ele->seq, seq ) ) ) {
-
- /* The version (or tstone) for pair key in the meta is older than
- seq. We append a key to the tstone array if we haven't already. */
-
- int old_ele_is_pair = (ele->line_idx==ULONG_MAX);
-
- (*_pair_cnt) -= (ulong)old_ele_is_pair;
- (*_garbage_sz) += old_ele_is_pair ? fd_vinyl_bstream_pair_sz( fd_vinyl_bstream_ctl_sz( ele->phdr.ctl ) ) : 0UL;
- (*_tstone_cnt) += (ulong)old_ele_is_pair;
-
- //ele->memo = already init
- //ele->phdr.ctl = already init
- //ele->phdr.key = already init
- //ele->phdr.info = d/c
- ele->line_idx = ULONG_MAX - 1UL; // tstone
- ele->seq = seq;
-
- fd_vinyl_meta_publish( query );
-
- } else {
-
- /* The meta entry (pair or tstone) for pair key in the meta is newer
- than seq. We can skip this tstone. */
-
- //pair_cnt unchanged
- //garbage_sz unchanged
- //tstone_cnt unchanged
-
- int corrupt = fd_vinyl_seq_eq( ele->seq, seq );
-
- fd_vinyl_meta_cancel( query );
-
- if( FD_UNLIKELY( corrupt ) ) {
- FD_LOG_WARNING(( "%016lx: probable corruption detected", seq ));
- return FD_VINYL_ERR_CORRUPT;
- }
-
- }
-
- return FD_VINYL_SUCCESS;
-}
-
-/* fd_vinyl_recover_part_task dynamically assigns the partitions of the
- bstream's past to threads for recovery and then recovers them in
- parallel. The bstream past partition iteration is near identical
- to bstream past iteration in serial recovery. See
- fd_vinyl_recover_serial.c for more details.
-
- arg[0] points to this thread's 4 ulong reduction result:
- [0] fail (non-zero if recovery failed), [1] pair_cnt, [2] garbage_sz
- and [3] tstone_cnt. arg[1] is the vinyl instance being recovered.
- arg[2] points to the shared task assignment state: _lock[0] is a
- lock word (0 unlocked, 1 locked) and _lock[1] is the task assignment
- cursor (the bstream range [seq_past,_lock[1]) has not been assigned
- to any thread yet). */
-
-/* FIXME: ADD MORE EXTENSIVE DATA INTEGRITY CHECKING LIKE SERIAL IMPL */
-
-static FD_FN_UNUSED FD_MAP_REDUCE_BEGIN( fd_vinyl_recover_part_task, 1UL, alignof(ulong), sizeof(ulong), 4UL ) {
- ulong * _rlocal = (ulong *) arg[0];
- fd_vinyl_t * vinyl = (fd_vinyl_t *) arg[1];
- ulong * _lock = (ulong *) arg[2];
-
- fd_vinyl_io_t * io = vinyl->io;
- fd_vinyl_meta_t * meta = vinyl->meta;
-
- /* NOTE(review): mmio and mmio_sz are not referenced directly below;
- presumably they are consumed by the PEEK macro (defined outside
- this view) -- confirm. */
-
- ulong io_seed = fd_vinyl_io_seed ( io );
- ulong seq_past = fd_vinyl_io_seq_past( io );
- uchar * mmio = (uchar *)fd_vinyl_mmio ( io );
- ulong mmio_sz = fd_vinyl_mmio_sz ( io );
-
- /* fail stays set until every range claimed by this thread has been
- recovered cleanly (it is only cleared after the loop below). */
-
- ulong fail = 1UL;
- ulong pair_cnt = 0UL;
- ulong garbage_sz = 0UL;
- ulong tstone_cnt = 0UL;
-
- for(;;) {
-
- /* Determine the range of the bstream past we should process next. */
-
- ulong seq0;
- ulong seq1;
-
- /* Lock and fetch the task assignment cursor */
-
- /* NOTE(review): without FD_HAS_ATOMIC the lock word is set without
- testing it first; presumably such targets only run this single
- threaded -- confirm. */
-
- FD_COMPILER_MFENCE();
-# if FD_HAS_ATOMIC
- while( FD_ATOMIC_CAS( _lock, 0UL, 1UL ) ) FD_SPIN_PAUSE();
-# else
- *_lock = 1UL;
-# endif
- FD_COMPILER_MFENCE();
-
- seq1 = _lock[1];
-
- /* At this point, the bstream range [seq_past,seq1) has not been
- assigned. If seq1 is at seq_past, everything has been assigned
- already. Otherwise, the block before cursor is a valid partition
- block (as per the test above) and we claim the range:
-
- [ the older of part_seq0 and seq_past, seq1 )
-
- to process. */
-
- if( FD_UNLIKELY( fd_vinyl_seq_le( seq1, seq_past ) ) ) seq0 = seq_past;
- else {
- fd_vinyl_bstream_block_t const * block = PEEK( seq1 - FD_VINYL_BSTREAM_BLOCK_SZ );
- seq0 = block->part.seq0;
- if( fd_vinyl_seq_lt( seq0, seq_past ) ) seq0 = seq_past;
- }
-
- /* Update and unlock the task assignment cursor */
-
- _lock[1] = seq0;
- FD_COMPILER_MFENCE();
- _lock[0] = 0UL;
- FD_COMPILER_MFENCE();
-
- /* Nothing left to assign (or another thread failed and reset the
- cursor to seq_past, see done below). */
-
- if( FD_UNLIKELY( fd_vinyl_seq_le( seq1, seq_past ) ) ) break;
-
- /* At this point, we need to recover the range [seq0,seq1). */
-
- ulong seq = seq0;
- while( fd_vinyl_seq_lt( seq, seq1 ) ) {
-
- fd_vinyl_bstream_block_t block[1];
-
- block[0] = *(fd_vinyl_bstream_block_t *)PEEK( seq ); /* testing is destructive */
-
- ulong ctl = block->ctl;
-
- int type = fd_vinyl_bstream_ctl_type( ctl );
-
- switch( type ) {
-
- case FD_VINYL_BSTREAM_CTL_TYPE_PAIR: {
-
- ulong pair_val_esz = fd_vinyl_bstream_ctl_sz( ctl );
-
- ulong pair_sz = fd_vinyl_bstream_pair_sz( pair_val_esz );
-
- if( FD_UNLIKELY( pair_sz > (seq1-seq) ) ) { /* Wrapping safe */
- FD_LOG_WARNING(( "%016lx: truncated", seq ));
- goto done;
- }
-
- /* The footer is the last block of the pair; copy it locally
- before testing, like the header block above. */
-
- fd_vinyl_bstream_block_t ftr[1];
-
- ftr[0] = *PEEK( seq + pair_sz - FD_VINYL_BSTREAM_BLOCK_SZ );
-
- char const * _err = fd_vinyl_bstream_pair_test_fast( io_seed, seq, block, ftr );
- if( FD_UNLIKELY( _err ) ) {
- FD_LOG_WARNING(( "%016lx: %s", seq, _err ));
- goto done;
- }
-
- /* At this point, we appear to have valid completely written
- pair. Prepare the meta to do an update for this key. */
-
- fd_vinyl_meta_query_t query[1];
-
- fd_vinyl_meta_prepare( meta, &block->phdr.key, NULL, query, FD_MAP_FLAG_BLOCKING );
-
- fd_vinyl_meta_ele_t * ele = fd_vinyl_meta_query_ele( query );
-
- if( FD_UNLIKELY( !ele ) ) {
- FD_LOG_WARNING(( "%016lx: corruption detected or meta cache too small for parallel recovery", seq ));
- goto done;
- }
-
- if( FD_LIKELY( (!ele->phdr.ctl) | fd_vinyl_seq_gt( seq, ele->seq ) ) ) {
-
- pair_cnt++;
-
- /* At this point, this is the first time any thread has seen
- pair key or this version of pair key is newer than the
- version (or tstone) of pair key that has been seen so far. */
-
- ele->memo = fd_vinyl_meta_query_memo( query );
- ele->phdr = block->phdr;
- ele->line_idx = ULONG_MAX; // pair
- ele->seq = seq;
-
- fd_vinyl_meta_publish( query );
-
- } else {
-
- /* At this point, this version of pair key is older than the
- version (or tstone) for pair key seen by all threads so
- far. */
-
- fd_vinyl_meta_cancel( query );
-
- garbage_sz += pair_sz;
-
- }
-
- seq += pair_sz;
- break;
- }
-
- case FD_VINYL_BSTREAM_CTL_TYPE_DEAD: {
-
- char const * _err = fd_vinyl_bstream_dead_test( io_seed, seq, block );
- if( FD_UNLIKELY( _err ) ) {
- FD_LOG_WARNING(( "%016lx: %s", seq, _err ));
- goto done;
- }
-
- int err = fd_vinyl_recover_tstone( meta, &block->dead.phdr.key, seq, &pair_cnt, &garbage_sz, &tstone_cnt );
- if( FD_UNLIKELY( err ) ) goto done; /* logs details */
-
- garbage_sz += FD_VINYL_BSTREAM_BLOCK_SZ;
- seq += FD_VINYL_BSTREAM_BLOCK_SZ;
- break;
- }
-
- case FD_VINYL_BSTREAM_CTL_TYPE_MOVE: {
-
- if( FD_UNLIKELY( 2UL*FD_VINYL_BSTREAM_BLOCK_SZ > (seq1-seq) ) ) { /* Wrapping safe */
- FD_LOG_WARNING(( "%016lx: truncated", seq ));
- goto done;
- }
-
- fd_vinyl_bstream_block_t dst[1];
-
- dst[0] = *PEEK( seq + FD_VINYL_BSTREAM_BLOCK_SZ );
-
- char const * _err = fd_vinyl_bstream_move_test( io_seed, seq, block, dst );
- if( FD_UNLIKELY( _err ) ) {
- FD_LOG_WARNING(( "%016lx: %s", seq, _err ));
- goto done;
- }
-
- /* Both the src key and the dst key get a tstone at the move
- block's seq. */
-
- int err = fd_vinyl_recover_tstone( meta, &block->move.src.key, seq, &pair_cnt, &garbage_sz, &tstone_cnt );
- if( FD_UNLIKELY( err ) ) goto done; /* logs details */
-
- /**/ err = fd_vinyl_recover_tstone( meta, &block->move.dst, seq, &pair_cnt, &garbage_sz, &tstone_cnt );
- if( FD_UNLIKELY( err ) ) goto done; /* logs details */
-
- /* Only the move block itself is skipped here. NOTE(review):
- presumably the block that follows (peeked as dst above) is
- then handled by the next iteration of this loop -- confirm
- against the serial implementation. */
-
- garbage_sz += FD_VINYL_BSTREAM_BLOCK_SZ;
- seq += FD_VINYL_BSTREAM_BLOCK_SZ;
- break;
- }
-
- case FD_VINYL_BSTREAM_CTL_TYPE_PART: {
-
- char const * _err = fd_vinyl_bstream_part_test( io_seed, seq, block );
- if( FD_UNLIKELY( _err ) ) {
- FD_LOG_WARNING(( "%016lx: %s", seq, _err ));
- goto done;
- }
-
- garbage_sz += FD_VINYL_BSTREAM_BLOCK_SZ;
- seq += FD_VINYL_BSTREAM_BLOCK_SZ;
- break;
- }
-
- case FD_VINYL_BSTREAM_CTL_TYPE_ZPAD: {
-
- char const * _err = fd_vinyl_bstream_zpad_test( io_seed, seq, block );
- if( FD_UNLIKELY( _err ) ) {
- FD_LOG_WARNING(( "%016lx: %s", seq, _err ));
- goto done;
- }
-
- /* Unlike the other single block types, zero padding is not
- added to garbage_sz here. */
-
- seq += FD_VINYL_BSTREAM_BLOCK_SZ;
- break;
- }
-
- default:
- FD_LOG_WARNING(( "%016lx: unknown type (%x)", seq, (uint)type ));
- goto done;
-
- }
- }
-
- /* The objects in the range must end exactly at seq1. */
-
- if( FD_UNLIKELY( fd_vinyl_seq_ne( seq, seq1 ) ) ) {
- FD_LOG_WARNING(( "%016lx: bad partitioning", seq ));
- goto done;
- }
-
- }
-
- fail = 0UL;
-
-done:
-
- /* If we failed, tell all the other threads to not continue by
- setting the task assignment cursor to seq_past. */
-
- if( fail ) {
- FD_COMPILER_MFENCE();
-# if FD_HAS_ATOMIC
- while( FD_ATOMIC_CAS( _lock, 0UL, 1UL ) ) FD_SPIN_PAUSE();
-# else
- *_lock = 1UL;
-# endif
- FD_COMPILER_MFENCE();
- _lock[1]= seq_past;
- FD_COMPILER_MFENCE();
- _lock[0]= 0UL;
- }
-
- /* Publish this thread's result for the reduction below. */
-
- _rlocal[0] = fail;
- _rlocal[1] = pair_cnt;
- _rlocal[2] = garbage_sz;
- _rlocal[3] = tstone_cnt;
-
-} FD_MAP_END {
-
- /* Fold the result at _r1 into the local result: the fail flags are
- OR'd together and the counters are summed. */
-
- ulong * _rlocal = (ulong *) arg[0];
- ulong const * _rremote = (ulong const *)_r1;
-
- _rlocal[0] |= _rremote[0];
- _rlocal[1] += _rremote[1];
- _rlocal[2] += _rremote[2];
- _rlocal[3] += _rremote[3];
-
-} FD_REDUCE_END
-
-/* fd_vinyl_recover_meta_cleanup_task scans this thread's share of the
- meta element slots and tries to remove every entry it observes to
- be a tstone (non-zero phdr.ctl and line_idx==ULONG_MAX-1). arg[0]
- points to the ulong reduction result, which ends up holding the
- total number of removes for which fd_vinyl_meta_remove returned
- zero. arg[1] is the vinyl instance. */
-
-static FD_FN_UNUSED FD_MAP_REDUCE_BEGIN( fd_vinyl_recover_meta_cleanup_task, 1L, alignof(ulong), sizeof(ulong), 1UL ) {
- ulong * _rlocal = (ulong *)arg[0];
-
- fd_vinyl_t * vinyl = (fd_vinyl_t *)arg[1];
-
- fd_vinyl_meta_t * meta = vinyl->meta;
-
- fd_vinyl_meta_ele_t * ele0 = meta->ele;
- ulong const * lock = meta->lock;
- int lock_shift = meta->lock_shift;
-
- ulong remove_cnt = 0UL;
-
- /* NOTE(review): the loop header below looks truncated in this copy
- (the loop bound / increment and the lock_idx declaration appear to
- be missing) -- confirm against the original file. */
-
- for( long ele_idx=block_i0; ele_idx> lock_shift;
-
- fd_vinyl_key_t key;
- int try_remove;
-
- /* Do a non-blocking query by ele_idx (not by key). We have to do
- this directly because there is no standard API for this. This is
- highly unlikely to ever block (but theoretically could if the
- remove in a different thread has locked a probe chain that
- touches elements in this thread). */
-
- /* The lock version is read before and after the element is
- sampled. The sample is retried if the first version had its
- low bit set or if the two versions differ. */
-
- for(;;) {
- FD_COMPILER_MFENCE();
- ulong ver0 = lock[ lock_idx ];
- FD_COMPILER_MFENCE();
- if( FD_LIKELY( !(ver0 & 1UL) ) ) {
-
- try_remove = (!!ele0[ ele_idx ].phdr.ctl) & (ele0[ ele_idx ].line_idx==(ULONG_MAX-1UL));
- key = ele0[ ele_idx ].phdr.key;
-
- FD_COMPILER_MFENCE();
- ulong ver1 = lock[ lock_idx ];
- FD_COMPILER_MFENCE();
- if( FD_LIKELY( ver0==ver1 ) ) break;
- }
- FD_SPIN_PAUSE();
- }
-
- /* If try_remove is not set, ele_idx either had no key in it or
- had a pair entry. So we continue to the next slot. */
-
- if( FD_LIKELY( !try_remove ) ) continue;
-
- /* At this point, we observed key had a tstone in the meta above.
- So we try to remove it. It is possible (though extremely
- unlikely for big sparse maps and the vanilla thread partitioning
- here) that a remove on another thread got key first. So it is
- okay if this fails. We have to use the parallel version of this
- (even if it is highly unlikely to interfere with other threads)
- for the same reason we had to use a non-blocking query above. */
-
- fd_vinyl_meta_query_t query[1];
- remove_cnt += (ulong)!fd_vinyl_meta_remove( meta, &key, query, FD_MAP_FLAG_BLOCKING );
- }
-
- *_rlocal = remove_cnt;
-
-} FD_MAP_END {
-
- /* Sum the remove count at _r1 into the local count. */
-
- ulong * _rlocal = (ulong *) arg[0];
- ulong const * _rremote = (ulong const *)_r1;
-
- *_rlocal += *_rremote;
-
-} FD_REDUCE_END
-
-ulong
-fd_vinyl_recover( fd_tpool_t * tpool, ulong t0, ulong t1, int level,
- fd_vinyl_t * vinyl ) {
-
- fd_vinyl_meta_t * meta = vinyl->meta;
- ulong line_cnt = vinyl->line_cnt;
-
- ulong ele_max = meta->ele_max;
- ulong lock_cnt = meta->lock_cnt;
-
- /* Using all avaialble threads, flush the lines and meta cache. We do
- the meta flush locked so we don't confuse any concurrent meta
- readers. This will claim any existing locks (e.g. the previous
- meta writer died while holding a lock and the user didn't clean it
- up before calling this). */
-
- ulong reclaim_cnt;
-
- FD_FOR_ALL ( fd_vinyl_recover_line_task, tpool,t0,t1, 0L,(long)line_cnt, vinyl );
- FD_MAP_REDUCE( fd_vinyl_recover_reclaim_task, tpool,t0,t1, 0L,(long)lock_cnt, &reclaim_cnt, vinyl );
- FD_FOR_ALL ( fd_vinyl_recover_meta_flush_task, tpool,t0,t1, 0L,(long)ele_max, vinyl );
- FD_FOR_ALL ( fd_vinyl_recover_unlock_task, tpool,t0,t1, 0L,(long)lock_cnt, vinyl );
-
- if( FD_UNLIKELY( reclaim_cnt ) ) FD_LOG_WARNING(( "reclaimed %lu locks (dead writer?); attempting to continue", reclaim_cnt ));
-
- /* FIXME: should this fail if it detects in progress io? */
-
- /* If there is only 1 thread provided or the bstream past doesn't
- have a valid partitioning, use the serial recovery algorithm */
-
-t1 = t0 + 1UL; /* Turn off parallel recovery while it is untested */
-
- if( FD_UNLIKELY( (t1-t0)<=1UL ) ||
- FD_UNLIKELY( fd_vinyl_recover_test( vinyl->io ) ||
- !FD_HAS_ATOMIC ) ) {
- fd_vinyl_data_reset( tpool,t0,t1, level, vinyl->data );
- return fd_vinyl_recover_serial( vinyl );
- }
-
-# if FD_HAS_ATOMIC
-
- /* The parallel recovery of bstream partition may leave tstones in the
- meta elements. To clean this up, we have two options.
-
- Option 1 (simplest and most robust): we parallel scan all the meta
- elements in parallel for tstones and remove them. We might have to
- do more than one pass because the removal of elements could mean
- some elements are not placed well. This requires no scratch (and
- thus is more robust against arbitrary erase / move patterns in the
- recovery region). While it isn't any less algo inefficient
- (because we paralllel scan all the elements already to clear them),
- it is pracitcally less efficient for applications access patterns
- that don't generate many tombstones and/or have pair_cnt<io );
-
- ulong rtmp[4];
- ulong lock[2];
-
- lock[0] = 0UL;
- lock[1] = seq;
-
- FD_MAP_REDUCE( fd_vinyl_recover_part_task, tpool,t0,t1, 0L,(long)(t1-t0), rtmp, vinyl, lock );
-
- ulong fail = rtmp[0];
- if( FD_UNLIKELY( fail ) ) {
- FD_LOG_WARNING(( "parallel recovery failed; attempting serial recovery" ));
-
- /* Reset the meta from whatever messy state failed parallel recovery
- left it */
-
- FD_MAP_REDUCE( fd_vinyl_recover_reclaim_task, tpool,t0,t1, 0L,(long)lock_cnt, &reclaim_cnt, vinyl );
- FD_FOR_ALL ( fd_vinyl_recover_meta_flush_task, tpool,t0,t1, 0L,(long)ele_max, vinyl );
- FD_FOR_ALL ( fd_vinyl_recover_unlock_task, tpool,t0,t1, 0L,(long)lock_cnt, vinyl );
-
- fd_vinyl_data_reset( tpool,t0,t1, level, vinyl->data );
-
- return fd_vinyl_recover_serial( vinyl );
- }
-
- vinyl->pair_cnt = rtmp[1];
- vinyl->garbage_sz = rtmp[2];
-
- ulong tstone_rem = rtmp[3];
-
- while( tstone_rem ) {
- FD_FOR_ALL( fd_vinyl_recover_meta_cleanup_task, tpool,t0,t1, 0L,(long)ele_max, rtmp, vinyl );
- tstone_rem -= rtmp[0];
- }
-
- /* Reset the data cache to clean up any scratch usage (currently none
- but no reason to do earlier) */
-
- fd_vinyl_data_reset( tpool,t0,t1, level, vinyl->data );
-
- return seq;
-
-# endif /* FD_HAS_ATOMIC */
-}
diff --git a/src/vinyl/fd_vinyl_recover_serial.c b/src/vinyl/fd_vinyl_recover_serial.c
deleted file mode 100644
index 39fc6056893..00000000000
--- a/src/vinyl/fd_vinyl_recover_serial.c
+++ /dev/null
@@ -1,351 +0,0 @@
-/* This is included directly by fd_vinyl_recover.c */
-
-ulong
-fd_vinyl_recover_serial( fd_vinyl_t * vinyl ) {
-
- /* Iterate over the bstream's past to populate the meta. Note that
- our caller flushed the meta cache, data cache and reset the cache
- line eviction priorities to their default. */
-
- fd_vinyl_meta_t * meta = vinyl->meta;
- fd_vinyl_line_t * line = vinyl->line;
- fd_vinyl_io_t * io = vinyl->io;
-
- ulong line_cnt = vinyl->line_cnt;
- ulong pair_max = vinyl->pair_max;
-
- ulong io_seed = fd_vinyl_io_seed ( io );
- ulong seq_past = fd_vinyl_io_seq_past ( io );
- ulong seq_present = fd_vinyl_io_seq_present( io );
-
- fd_vinyl_meta_ele_t * ele0 = meta->ele;
- ulong ele_max = meta->ele_max;
- ulong meta_seed = meta->seed;
- ulong * lock = meta->lock;
- int lock_shift = meta->lock_shift;
-
- ulong seq = seq_past;
- ulong pair_cnt = 0UL;
- ulong garbage_sz = 0UL;
-
- while( fd_vinyl_seq_lt( seq, seq_present ) ) {
-
- /* At this point, we've recovered [seq_past,seq) and still need
- recover [seq,seq_present) (non-empty). Read the block at seq. */
-
- fd_vinyl_bstream_block_t block[1];
-
- fd_vinyl_io_read_imm( io, seq, block, FD_VINYL_BSTREAM_BLOCK_SZ );
-
- ulong ctl = block->ctl;
-
- int type = fd_vinyl_bstream_ctl_type( ctl );
-
- switch( type ) {
-
- case FD_VINYL_BSTREAM_CTL_TYPE_PAIR: {
-
- /* Notes:
-
- - It is okay if we are in a move (move block processing the
- previous iteration already confirmed this is the proper pair.
-
- - We could rewind the bstream to seq on truncation
- automatically but then we might have failed to recover the
- most recent pair and thus have recovered to a state that does
- not correspond to the bstream's past. We instead kick this
- to the user to decide if they want to discard an incompletely
- written pair or not. */
-
- ulong pair_val_esz = fd_vinyl_bstream_ctl_sz( ctl );
-
- ulong pair_sz = fd_vinyl_bstream_pair_sz( pair_val_esz );
-
- if( FD_UNLIKELY( pair_sz > (seq_present-seq) ) ) { /* Wrapping safe */
- FD_LOG_WARNING(( "%016lx: truncated", seq ));
- goto done;
- }
-
- fd_vinyl_bstream_block_t _ftr[1];
- fd_vinyl_bstream_block_t * ftr = _ftr;
-
- if( pair_sz <= FD_VINYL_BSTREAM_BLOCK_SZ ) ftr = block;
- else fd_vinyl_io_read_imm( io, seq + pair_sz - FD_VINYL_BSTREAM_BLOCK_SZ, ftr, FD_VINYL_BSTREAM_BLOCK_SZ );
-
- char const * _err = fd_vinyl_bstream_pair_test_fast( io_seed, seq, block, ftr );
- if( FD_UNLIKELY( _err ) ) {
- FD_LOG_WARNING(( "%016lx: %s", seq, _err ));
- goto done;
- }
-
- /* At this point, we appear to have valid completely written pair.
- Extract the pair metadata and determine if this replaces a
- version we've already seen. Since this single threaded, we can
- use the single threaded optimized meta APIs here. */
-
- fd_vinyl_key_t const * pair_key = &block->phdr.key;
-
- ulong pair_memo = fd_vinyl_key_memo( meta_seed, pair_key );
-
- ulong _ele_idx; /* avoid pointer escape */
- int err = fd_vinyl_meta_query_fast( ele0, ele_max, pair_key, pair_memo, &_ele_idx );
- ulong ele_idx = _ele_idx;
-
- if( FD_LIKELY( err==FD_VINYL_ERR_KEY ) ) {
-
- /* This is the first time we've seen pair key or pair key was
- erased in a previous iteration (e.g. we most recently
- processed an erase for pair key or we are in a move). If we
- have room for pair key, insert it into the meta at ele_idx. */
-
- if( FD_UNLIKELY( pair_cnt>=pair_max ) ) {
- FD_LOG_WARNING(( "%016lx: increase pair_max", seq ));
- goto done;
- }
-
- ele0[ ele_idx ].memo = pair_memo;
- ele0[ ele_idx ].phdr = block->phdr;
- ele0[ ele_idx ].seq = seq;
- ele0[ ele_idx ].line_idx = ULONG_MAX; /* key-val not in cache */
-
- pair_cnt++;
-
- } else if( FD_LIKELY( !err ) ) {
-
- /* This is a more recent version of a pair we saw previously and
- meta element ele_idx currently maps pair key to this previous
- version. Mark the old version as garbage to collect in the
- future and update the mapping to this version. */
-
- ulong old_pair_ctl = ele0[ ele_idx ].phdr.ctl;
-
- ulong old_pair_val_esz = fd_vinyl_bstream_ctl_sz( old_pair_ctl );
-
- garbage_sz += fd_vinyl_bstream_pair_sz( old_pair_val_esz );
-
- //ele0[ ele_idx ].memo = pair_memo; /* already current */
- ele0[ ele_idx ].phdr = block->phdr;
- ele0[ ele_idx ].seq = seq;
- //ele0[ ele_idx ].line_idx = ULONG_MAX; /* already current */
-
- } else {
-
- FD_LOG_WARNING(( "%016lx: corrupt meta", seq ));
- goto done;
-
- }
-
- seq += pair_sz;
- break;
-
- }
-
- case FD_VINYL_BSTREAM_CTL_TYPE_DEAD: {
-
- char const * _err = fd_vinyl_bstream_dead_test( io_seed, seq, block );
- if( FD_UNLIKELY( _err ) ) {
- FD_LOG_WARNING(( "%016lx: %s", seq, _err ));
- goto done;
- }
-
- /* At this point, we appear to have a valid DEAD block. Look up
- the pair it erases. */
-
- ulong pair_val_esz = fd_vinyl_bstream_ctl_sz( block->dead.phdr.ctl );
-
- fd_vinyl_key_t const * pair_key = &block->dead.phdr.key;
-
- ulong pair_memo = fd_vinyl_key_memo( meta_seed, pair_key );
-
- ulong _ele_idx; /* avoid pointer escape */
- int err = fd_vinyl_meta_query_fast( ele0, ele_max, pair_key, pair_memo, &_ele_idx );
- ulong ele_idx = _ele_idx;;
-
- if( FD_LIKELY( err==FD_VINYL_ERR_KEY ) ) {
-
- /* This erases the most recent version of pair key in the
- bstream's antiquity or is a redundant erase block (which is
- arguably an error but, as we can't tell the difference at
- this point, we assume the more likely antiquity case). In
- short, there's nothing to do but mark this block as garbage
- to collect in the future. */
-
- garbage_sz += FD_VINYL_BSTREAM_BLOCK_SZ;
-
- } else {
-
- /* This erases the most recent version of pair key we've
- processed. Validate the erasure target is correct. If so,
- mark this block and that version of pair key as garbage for
- future collection and remove pair key from the meta. */
-
- int bad_order = fd_vinyl_seq_ge( ele0[ ele_idx ].seq, seq );
- int bad_phdr = !!memcmp( &ele0[ ele_idx ].phdr, &block->dead.phdr, sizeof(fd_vinyl_bstream_phdr_t) );
-
- if( FD_UNLIKELY( bad_order | bad_phdr ) ) {
- FD_LOG_WARNING(( "%016lx: %s", seq, bad_order ? "unordered sequence" : "mismatched dead pair metadata" ));
- goto done;
- }
-
- garbage_sz += FD_VINYL_BSTREAM_BLOCK_SZ + fd_vinyl_bstream_pair_sz( pair_val_esz );
-
- fd_vinyl_meta_remove_fast( ele0, ele_max, lock, lock_shift, line, line_cnt, ele_idx );
-
- FD_CRIT( pair_cnt, "corruption detected" );
- pair_cnt--;
-
- }
-
- seq += FD_VINYL_BSTREAM_BLOCK_SZ;
- break;
-
- }
-
- case FD_VINYL_BSTREAM_CTL_TYPE_MOVE: {
-
- if( FD_UNLIKELY( 2UL*FD_VINYL_BSTREAM_BLOCK_SZ > (seq_present-seq) ) ) { /* Wrapping safe */
- FD_LOG_WARNING(( "%016lx: truncated", seq ));
- goto done;
- }
-
- fd_vinyl_bstream_block_t dst[1];
-
- fd_vinyl_io_read_imm( io, seq + FD_VINYL_BSTREAM_BLOCK_SZ, dst, FD_VINYL_BSTREAM_BLOCK_SZ );
-
- char const * _err = fd_vinyl_bstream_move_test( io_seed, seq, block, dst );
- if( FD_UNLIKELY( _err ) ) {
- FD_LOG_WARNING(( "%016lx: %s", seq, _err ));
- goto done;
- }
-
- /* At this point, we appear to have a valid move. Technically, a
- move is an atomic "erase pair src_key if any, erase pair
- dst_key if any, insert pair dst_key with the info src_info_old
- and val src_val_new" where src_val_new is typically the same as
- src_val_old, but, strictly speaking, doesn't have to be.
-
- We do the "erase pair src_key if any" part of the move here.
- The next iteration will handle rest naturally (including doing
- more extensive validation on the new pair_dst). Note that if
- the next iteration detects the new pair dst is invalid, it will
- fail recovery in the middle of the move. So applications
- should be very wary of using a partial recovery as such can
- break move atomicity. */
-
- ulong src_val_esz = fd_vinyl_bstream_ctl_sz( block->move.src.ctl );
- fd_vinyl_key_t const * src_key = &block->move.src.key;
-
- ulong src_memo = fd_vinyl_key_memo( meta_seed, src_key );
-
- ulong _ele_idx; /* avoid pointer escape */
- int err = fd_vinyl_meta_query_fast( ele0, ele_max, src_key, src_memo, &_ele_idx );
- ulong ele_idx = _ele_idx;
-
- if( FD_LIKELY( err==FD_VINYL_ERR_KEY ) ) {
-
- /* This move erases the most recent version of pair src_key in
- the bstream's antiquity or is a redundant move block (which
- is arguably an error but, as we can't tell the difference at
- this point, we assume the more likely antiquity case). In
- short, there's nothing to do but mark this block as garbage
- to collect in the future. */
-
- garbage_sz += FD_VINYL_BSTREAM_BLOCK_SZ;
-
- } else {
-
- /* This move erases the most recent version of pair src_key
- we've processed. Validate the erasure target is correct. If
- so, mark this block and this version of pair src_key as
- garbage for future collection and remove pair src_key from
- the meta. */
-
- int bad_order = fd_vinyl_seq_ge( ele0[ ele_idx ].seq, seq );
- int bad_cnt = !pair_cnt;
- int bad_phdr = !!memcmp( &ele0[ ele_idx ].phdr, &block->move.src, sizeof(fd_vinyl_bstream_phdr_t) );
-
- if( FD_UNLIKELY( bad_order | bad_cnt | bad_phdr ) ) {
- FD_LOG_WARNING(( "%016lx: %s", seq, bad_order ? "unordered sequence" :
- bad_cnt ? "corrupt meta" :
- "mismatched move src metadata" ));
- goto done;
- }
-
- garbage_sz += FD_VINYL_BSTREAM_BLOCK_SZ + fd_vinyl_bstream_pair_sz( src_val_esz );
-
- fd_vinyl_meta_remove_fast( ele0, ele_max, lock, lock_shift, line, line_cnt, ele_idx );
-
- pair_cnt--;
-
- }
-
- /* At this point, we've handled the "erase old src if any" part of
- the move. The next iteration will handle the "erase old dst if
- any" and the "insert new dst" part of the move. We know there
- will be a next iteration for a type pair object with the
- appropriate mojo because of the checks we've already done. So
- moves behave atomically from the point of view of the
- application when fully recovered. */
-
- seq += FD_VINYL_BSTREAM_BLOCK_SZ;
- break;
-
- }
-
- case FD_VINYL_BSTREAM_CTL_TYPE_PART: {
-
- if( FD_UNLIKELY( fd_vinyl_seq_ne( block->part.seq, seq ) ) ) {
- FD_LOG_WARNING(( "%016lx: unexpected part seq", seq ));
- goto done;
- }
-
- char const * _err = fd_vinyl_bstream_part_test( io_seed, seq, block );
- if( FD_UNLIKELY( _err ) ) {
- FD_LOG_WARNING(( "%016lx: %s", seq, _err ));
- goto done;
- }
-
- garbage_sz += FD_VINYL_BSTREAM_BLOCK_SZ;
- seq += FD_VINYL_BSTREAM_BLOCK_SZ;
- break;
-
- }
-
- case FD_VINYL_BSTREAM_CTL_TYPE_ZPAD: {
-
- char const * _err = fd_vinyl_bstream_zpad_test( io_seed, seq, block );
- if( FD_UNLIKELY( _err ) ) {
- FD_LOG_WARNING(( "%016lx: %s", seq, _err ));
- goto done;
- }
-
- /* Note: zpad blocks aren't included in garbage_sz because we
- don't control when they get created (and thus can't easily
- update garbage_sz to account for them when they are created). */
-
- seq += FD_VINYL_BSTREAM_BLOCK_SZ;
- break;
-
- }
-
- default:
- FD_LOG_WARNING(( "%016lx: unknown type (%x)", seq, (uint)type ));
- goto done;
- }
- }
-
-done:
-
- /* At this point, the meta is populated appropriately up to seq.
- Update the vinyl state and return. If we did not get to
- seq_present, we log a warning. */
-
- vinyl->pair_cnt = pair_cnt;
- vinyl->garbage_sz = garbage_sz;
-
- if( FD_UNLIKELY( fd_vinyl_seq_ne( seq, seq_present ) ) )
- FD_LOG_WARNING(( "recovery failed, recovered [%016lx,%016lx)/%lu, unrecovered [%016lx,%016lx)/%lu",
- seq_past, seq, seq-seq_past, seq, seq_present, seq_present-seq ));
-
- return seq;
-}
diff --git a/src/vinyl/line/fd_vinyl_line.h b/src/vinyl/line/fd_vinyl_line.h
index 6a7302ccc6f..cac1e5ab8cf 100644
--- a/src/vinyl/line/fd_vinyl_line.h
+++ b/src/vinyl/line/fd_vinyl_line.h
@@ -83,7 +83,10 @@
acquired-for-read ref times. */
struct __attribute__((aligned(32))) fd_vinyl_line {
- fd_vinyl_data_obj_t * obj; /* location in the data cache of the data_obj storing val, NULL if not caching a pair */
+ union {
+ fd_vinyl_data_obj_t * obj; /* location in the data cache of the data_obj storing val, NULL if not caching a pair */
+ ulong obj_gaddr; /* same, as a global address for cross-address-space use */
+ };
ulong ele_idx; /* map element storing key and the pair metadata (app and key), in [0,map_cnt) */
ulong ctl; /* packs the line version and line reference count */
uint line_idx_older; /* older line in eviction sequence, in [0,line_cnt) */
@@ -92,6 +95,8 @@ struct __attribute__((aligned(32))) fd_vinyl_line {
typedef struct fd_vinyl_line fd_vinyl_line_t;
+FD_STATIC_ASSERT( sizeof(fd_vinyl_line_t)==32UL, layout );
+
FD_PROTOTYPES_BEGIN
/* fd_vinyl_line_ctl returns ver and ref encoded as a line ctl. ver is
diff --git a/src/vinyl/test_vinyl_req.c b/src/vinyl/test_vinyl_req.c
deleted file mode 100644
index 5fc56dd8094..00000000000
--- a/src/vinyl/test_vinyl_req.c
+++ /dev/null
@@ -1,1101 +0,0 @@
-#include "fd_vinyl.h"
-
-#define PAIR_MAX (4UL)
-
-struct pair {
- int creating;
- long acq;
- ulong ver;
- ulong val_max;
- fd_vinyl_key_t key [ 1 ];
- fd_vinyl_info_t info [ 1 ];
- fd_vinyl_info_t backup_info[ 1 ];
- uchar val [ FD_VINYL_VAL_MAX ];
- uchar backup_val [ FD_VINYL_VAL_MAX ];
-};
-
-typedef struct pair pair_t;
-
-static struct {
- ulong quota_rem;
- ulong used;
- pair_t pair[ PAIR_MAX ];
-} ref;
-
-static int
-req( int type, /* request type */
- ulong flags, /* request flags */
- ulong val_max, /* for acquire-with-modify */
- fd_vinyl_key_t const * key, /* Key for req */
- pair_t ** _pair, /* Location of pair on successful acquire and/or successful try */
- ulong * _ver ) { /* Version of pair on try, has try version on test, ignored otherwise */
-
- switch( type ) {
-
- case FD_VINYL_REQ_TYPE_ACQUIRE: {
- if( !ref.quota_rem ) return FD_VINYL_ERR_FULL; /* (comp err) client quota exhausted */
-
- if( fd_vinyl_req_flag_modify( flags ) && (val_max>FD_VINYL_VAL_MAX) ) return FD_VINYL_ERR_INVAL; /* bad req val_max */
-
- ulong idx = 0UL;
- pair_t * pair = NULL;
- for( ; idx> idx) & 1UL) && fd_vinyl_key_eq( key, ref.pair[ idx ].key ) ) {
- pair = &ref.pair[ idx ];
- break;
- }
- }
-
- if( pair ) {
-
- if( !fd_vinyl_req_flag_modify( flags ) ) {
-
- /* start blocking read of an existing pair */
-
- if( pair->acq < 0L ) return FD_VINYL_ERR_AGAIN; /* key acquired for modify */
-
- ulong ref_szc = fd_vinyl_data_szc( (ulong)pair->info->val_sz );
- pair->val_max = fd_vinyl_data_szc_val_max( ref_szc );
-
- FD_TEST( !pair->creating );
- pair->acq++;
- //pair->ver unchanged
-
- ref.quota_rem--;
-
- *_pair = pair;
- return FD_VINYL_SUCCESS;
-
- }
-
- /* start modify of an existing pair */
-
- if( pair->acq ) return FD_VINYL_ERR_AGAIN; /* key acquired at least once */
- if( fd_vinyl_req_flag_excl( flags ) ) return FD_VINYL_ERR_INVAL; /* not allowed to modify existing */
-
- pair->backup_info[0] = pair->info[0];
- if( pair->info->val_sz ) memcpy( pair->backup_val, pair->val, (ulong)pair->info->val_sz );
-
- if( fd_vinyl_req_flag_ignore( flags ) ) pair->info->val_sz = 0U;
-
- ulong ref_szc = fd_vinyl_data_szc( fd_ulong_max( val_max, (ulong)pair->info->val_sz ) );
- pair->val_max = fd_vinyl_data_szc_val_max( ref_szc );
-
- FD_TEST( !pair->creating );
- pair->acq = -1L;
- pair->ver++;
-
- ref.quota_rem--;
-
- *_pair = pair;
- return FD_VINYL_SUCCESS;
-
- }
-
- /* start creating a pair */
-
- if( !(fd_vinyl_req_flag_modify( flags ) && fd_vinyl_req_flag_create( flags )) ) return FD_VINYL_ERR_KEY;
-
- FD_TEST( (ulong)fd_ulong_popcnt( ref.used ) < PAIR_MAX );
- idx = (ulong)fd_ulong_find_lsb( ~ref.used );
- pair = &ref.pair[ idx ];
- ref.used |= (1UL << idx);
-
- pair->key[0] = key[0];
- memset( pair->info, 0UL, sizeof(fd_vinyl_info_t) );
-
- ulong ref_szc = fd_vinyl_data_szc( val_max );
- pair->val_max = fd_vinyl_data_szc_val_max( ref_szc );
-
- pair->creating = 1;
- pair->acq = -1L;
- pair->ver++;
-
- ref.quota_rem--;
-
- *_pair = pair;
- return FD_VINYL_SUCCESS;
- }
-
- case FD_VINYL_REQ_TYPE_RELEASE: {
- ulong idx = 0UL;
- pair_t * pair = NULL;
- for( ; idx> idx) & 1UL) && fd_vinyl_key_eq( key, ref.pair[ idx ].key ) ) {
- pair = &ref.pair[ idx ];
- break;
- }
- }
-
- if( !pair ) return FD_VINYL_ERR_INVAL; /* Key does not exist and is not being created (cannot have been acquired) */
- if( !pair->acq ) return FD_VINYL_ERR_INVAL; /* Key is not acquired */
-
- if( pair->acq > 0L ) {
-
- /* finish blocking read */
-
- if( fd_vinyl_req_flag_modify( flags ) ) FD_LOG_CRIT(( "modify read only" ));
-
- FD_TEST( !pair->creating );
-
- pair->acq--;
-
- ref.quota_rem++;
- return FD_VINYL_SUCCESS;
-
- }
-
- if( pair->creating ) {
-
- if( ((!fd_vinyl_req_flag_modify( flags )) | fd_vinyl_req_flag_erase( flags )) ) {
-
- /* cancel / erase a create */
-
- pair->ver++;
-
- ref.used &= ~(1UL << idx);
-
- ref.quota_rem++;
- return FD_VINYL_SUCCESS;
-
- }
-
- /* finish a create */
-
- if( (ulong)pair->info->val_sz > pair->val_max ) FD_LOG_CRIT(( "val buffer overrun" ));
-
- ulong ref_szc = fd_vinyl_data_szc( (ulong)pair->info->val_sz );
- pair->val_max = fd_vinyl_data_szc_val_max( ref_szc );
-
- pair->creating = 0;
- pair->acq = 0L;
- pair->ver++;
-
- ref.quota_rem++;
- return FD_VINYL_SUCCESS;
-
- }
-
- if( !fd_vinyl_req_flag_modify( flags ) ) {
-
- if( !fd_vinyl_req_flag_ignore( flags ) ) {
-
- /* cancel modify existing (info/val were not clobbered during the modify attempt) */
-
- pair->acq = 0L;
- pair->ver--;
-
- ref.quota_rem++;
- return FD_VINYL_SUCCESS;
-
- }
-
- /* cancel modify existing (info/val were potentially clobbered during the modify attempt) */
-
- pair->info[0] = pair->backup_info[0];
- if( pair->backup_info->val_sz ) memcpy( pair->val, pair->backup_val, (ulong)pair->backup_info->val_sz );
-
- ulong ref_szc = fd_vinyl_data_szc( (ulong)pair->info->val_sz );
- pair->val_max = fd_vinyl_data_szc_val_max( ref_szc );
-
- pair->acq = 0L;
- pair->ver++;
-
- ref.quota_rem++;
- return FD_VINYL_SUCCESS;
-
- }
-
- if( fd_vinyl_req_flag_erase( flags ) ) {
-
- /* erase existing */
-
- pair->ver++;
-
- ref.used &= ~(1UL<info->val_sz > pair->val_max ) FD_LOG_CRIT(( "val buffer overrun" ));
-
- ulong ref_szc = fd_vinyl_data_szc( (ulong)pair->info->val_sz );
- pair->val_max = fd_vinyl_data_szc_val_max( ref_szc );
-
- pair->acq = 0L;
- pair->ver++;
-
- ref.quota_rem++;
- return FD_VINYL_SUCCESS;
- }
-
- case FD_VINYL_REQ_TYPE_ERASE: {
- ulong idx = 0UL;
- pair_t * pair = NULL;
- for( ; idx> idx) & 1UL) && fd_vinyl_key_eq( key, ref.pair[ idx ].key ) ) {
- pair = &ref.pair[ idx ];
- break;
- }
- }
-
- if( !pair ) return FD_VINYL_ERR_KEY; /* Key does not exist */
- if( pair->acq ) return FD_VINYL_ERR_AGAIN; /* Key acquired at least once */
-
- pair->ver++;
-
- ref.used &= ~(1UL<> dst_idx) & 1UL) && fd_vinyl_key_eq( dst_key, ref.pair[ dst_idx ].key ) ) {
- dst_pair = &ref.pair[ dst_idx ];
- break;
- }
- }
-
- if( dst_pair && dst_pair->acq ) return FD_VINYL_ERR_AGAIN;
-
- /* Lookup pair src_key. If it doesn't exist, fail with KEY. If it
- is acquired, fail with AGAIN. */
-
- ulong src_idx = 0UL;
- pair_t * src_pair = NULL;
- for( ; src_idx> src_idx) & 1UL) && fd_vinyl_key_eq( src_key, ref.pair[ src_idx ].key ) ) {
- src_pair = &ref.pair[ src_idx ];
- break;
- }
- }
-
- if( !src_pair ) return FD_VINYL_ERR_KEY;
- if( src_pair->acq ) return FD_VINYL_ERR_AGAIN;
-
- /* At this point:
- - pair dst_key may or may not exist. If it exists it is not
- acquired.
- - pair src_key exists and is not acquired.
- Thus we are clear to move. Erase pair dst_key if it exists. Then
- rename pair src_key to pair dst_key. */
-
- if( dst_pair ) {
- dst_pair->ver++;
- ref.used &= ~(1UL<key[0] = dst_key[0];
- src_pair->ver++;
-
- return FD_VINYL_SUCCESS;
- }
-
- case FD_VINYL_REQ_TYPE_FETCH: {
- return FD_VINYL_SUCCESS;
- }
-
- case FD_VINYL_REQ_TYPE_FLUSH: {
- return FD_VINYL_SUCCESS;
- }
-
- case FD_VINYL_REQ_TYPE_TRY: {
- ulong idx = 0UL;
- pair_t * pair = NULL;
- for( ; idx> idx) & 1UL) && fd_vinyl_key_eq( key, ref.pair[ idx ].key ) ) {
- pair = &ref.pair[ idx ];
- break;
- }
- }
-
- if( !pair ) return FD_VINYL_ERR_KEY; /* Key does not exist */
- if( pair->acq<0L ) return FD_VINYL_ERR_AGAIN; /* Key acquired-for-modify */
-
- *_pair = pair;
- *_ver = pair->ver;
-
- return FD_VINYL_SUCCESS;
- }
-
- case FD_VINYL_REQ_TYPE_TEST: {
-
- if( (*_pair)->ver != (*_ver) ) return FD_VINYL_ERR_CORRUPT; /* Key modified during the try */
-
- return FD_VINYL_SUCCESS;
- }
-
- default:
- break;
- }
-
- return FD_VINYL_ERR_INVAL;
-}
-
-static int
-fd_vinyl_tile( int argc,
- char ** argv ) {
- (void)argc;
- fd_vinyl_exec( (fd_vinyl_t *)argv );
- return 0;
-}
-
-static void
-client_tile( ulong iter_max,
- fd_cnc_t * cnc,
- ulong link_id,
- fd_vinyl_rq_t * rq,
- fd_vinyl_cq_t * cq,
- fd_wksp_t * wksp,
- void * _scratch ) {
- fd_rng_t _rng[1]; fd_rng_t * rng = fd_rng_join( fd_rng_new( _rng, 0U, 0UL ) );
-
- uchar * top = (uchar *)_scratch;
-
- fd_vinyl_key_t * _key = (fd_vinyl_key_t *) top; top += sizeof(fd_vinyl_key_t)*PAIR_MAX;
- ulong * _try_gaddr = (ulong *) top; top += sizeof(ulong)*2UL *PAIR_MAX;
-
- fd_vinyl_comp_t * comp = (fd_vinyl_comp_t *)top; top += sizeof(fd_vinyl_comp_t);
- ulong * val_gaddr = (ulong *) top; top += sizeof(ulong);
- schar * err = (schar *) top; top += sizeof(schar);
-
- ulong comp_gaddr = fd_wksp_gaddr( wksp, comp );
- ulong val_gaddr_gaddr = fd_wksp_gaddr( wksp, val_gaddr );
- ulong err_gaddr = fd_wksp_gaddr( wksp, err );
-
- ulong cq_seq = fd_vinyl_cq_seq( cq );
-
-# define WAIT do { \
- if( oob ) { \
- while( !FD_VOLATILE_CONST( comp->seq ) ) FD_SPIN_PAUSE(); \
- FD_TEST( comp->seq==1UL ); \
- } else { \
- while( fd_vinyl_cq_recv( cq, cq_seq, comp ) ) FD_SPIN_PAUSE(); \
- FD_TEST( comp->seq==cq_seq ); \
- cq_seq++; \
- } \
- FD_TEST( comp->req_id ==req_id ); \
- FD_TEST( comp->link_id==link_id ); \
- } while(0)
-
- ulong val_max_bad = FD_VINYL_VAL_MAX+1UL;
-
- long acq [ PAIR_MAX ]; for( ulong idx=0UL; idx>= 2;
- ulong dst_idx = (r & 3UL); r >>= 2;
- int op = (int)(r & 63UL); r >>= 6;
- int by_key = (int)(r & 1UL); r >>= 1;
- int do_mod = (int)(r & 1UL); r >>= 1;
- ulong oob = (int)(r & 1UL) ? comp_gaddr : 0UL; r >>= 1;
- ulong flags = (r & 255UL); r >>= 8;
- ulong val_max = (r & (ulong)UINT_MAX) % (FD_VINYL_VAL_MAX+1UL); r >>= 32;
- int pat = (int)(r & 255UL); r >>= 8;
-
- fd_vinyl_key_t * src_key = _key + src_idx;
- fd_vinyl_key_t * dst_key = _key + dst_idx;
- ulong * try_gaddr = _try_gaddr + 2UL*src_idx;
-
- ulong src_key_gaddr = fd_wksp_gaddr( wksp, src_key );
- ulong dst_key_gaddr = fd_wksp_gaddr( wksp, dst_key );
- ulong try_gaddr_gaddr = fd_wksp_gaddr( wksp, try_gaddr );
-
- comp->seq = 0UL;
-
- switch( op ) {
-
- case 0: /* mismatched link id (dropped and ticks the DROP_LINK counter) */
- val_gaddr[0] = val_max;
- fd_vinyl_rq_send( rq, req_id, ~link_id, FD_VINYL_REQ_TYPE_ACQUIRE, flags, 1UL,
- src_key_gaddr, val_gaddr_gaddr, err_gaddr, oob );
- break;
-
- case 1: /* unmappable oob completion (dropped and ticks the DROP_COMP counter) */
- val_gaddr[0] = val_max;
- fd_vinyl_rq_send( rq, req_id, link_id, FD_VINYL_REQ_TYPE_ACQUIRE, flags, 1UL,
- src_key_gaddr, val_gaddr_gaddr, err_gaddr, ULONG_MAX );
- break;
-
- case 2: /* bad request type */
- fd_vinyl_rq_send( rq, req_id, link_id, -1, flags, 1UL,
- src_key_gaddr, val_gaddr_gaddr, err_gaddr, oob ); WAIT;
- FD_TEST( comp->err ==FD_VINYL_ERR_INVAL ); FD_TEST( comp->batch_cnt==(ushort)1 );
- FD_TEST( comp->fail_cnt ==(ushort)0 ); FD_TEST( comp->quota_rem==(ushort)ref.quota_rem );
- break;
-
- /* acquire tests */
-
- case 3: /* acquire with unmappable key */
- val_gaddr[0] = val_max;
- fd_vinyl_rq_send( rq, req_id, link_id, FD_VINYL_REQ_TYPE_ACQUIRE, flags, 1UL,
- 0UL, val_gaddr_gaddr, err_gaddr, oob ); WAIT;
- FD_TEST( comp->err ==FD_VINYL_ERR_INVAL ); FD_TEST( comp->batch_cnt==(ushort)1 );
- FD_TEST( comp->fail_cnt ==(ushort)0 ); FD_TEST( comp->quota_rem==(ushort)ref.quota_rem );
- break;
-
- case 4: /* acquire with unmappable val */
- val_gaddr[0] = val_max;
- fd_vinyl_rq_send( rq, req_id, link_id, FD_VINYL_REQ_TYPE_ACQUIRE, flags, 1UL,
- src_key_gaddr, 0UL, err_gaddr, oob ); WAIT;
- FD_TEST( comp->err ==FD_VINYL_ERR_INVAL ); FD_TEST( comp->batch_cnt==(ushort)1 );
- FD_TEST( comp->fail_cnt ==(ushort)0 ); FD_TEST( comp->quota_rem==(ushort)ref.quota_rem );
- break;
-
- case 5: /* acquire with unmappable err */
- val_gaddr[0] = val_max;
- fd_vinyl_rq_send( rq, req_id, link_id, FD_VINYL_REQ_TYPE_ACQUIRE, flags, 1UL,
- src_key_gaddr, val_gaddr_gaddr, 0UL, oob ); WAIT;
- FD_TEST( comp->err ==FD_VINYL_ERR_INVAL ); FD_TEST( comp->batch_cnt==(ushort)1 );
- FD_TEST( comp->fail_cnt ==(ushort)0 ); FD_TEST( comp->quota_rem==(ushort)ref.quota_rem );
- break;
-
- case 6: /* acquire with zero batch */
- val_gaddr[0] = val_max;
- fd_vinyl_rq_send( rq, req_id, link_id, FD_VINYL_REQ_TYPE_ACQUIRE, flags, 0UL,
- 0UL, 0UL, 0UL, oob ); WAIT;
- FD_TEST( comp->err ==FD_VINYL_SUCCESS ); FD_TEST( comp->batch_cnt==(ushort)0 );
- FD_TEST( comp->fail_cnt ==(ushort)0 ); FD_TEST( comp->quota_rem==(ushort)ref.quota_rem );
- break;
-
- case 7: { /* acquire with bad val_max */
- pair_t * pair;
- int ref_err = req( FD_VINYL_REQ_TYPE_ACQUIRE, flags | FD_VINYL_REQ_FLAG_MODIFY, val_max_bad, src_key, &pair, NULL );
- val_gaddr[0] = val_max_bad;
- fd_vinyl_rq_send( rq, req_id, link_id, FD_VINYL_REQ_TYPE_ACQUIRE, flags | FD_VINYL_REQ_FLAG_MODIFY, 1UL,
- src_key_gaddr, val_gaddr_gaddr, err_gaddr, oob ); WAIT;
- if( ref_err==FD_VINYL_ERR_FULL ) {
- FD_TEST( comp->err ==FD_VINYL_ERR_FULL ); FD_TEST( comp->batch_cnt==(ushort)1 );
- FD_TEST( comp->fail_cnt ==(ushort)0 ); FD_TEST( comp->quota_rem==(ushort)ref.quota_rem );
- break;
- }
- FD_TEST( comp->err ==FD_VINYL_SUCCESS ); FD_TEST( comp->batch_cnt==(ushort)1 );
- FD_TEST( comp->fail_cnt ==(ushort)1 ); FD_TEST( comp->quota_rem==(ushort)ref.quota_rem );
- FD_TEST( err[0]==(schar)FD_VINYL_ERR_INVAL );
- break;
- }
-
- case 8: { /* acquire */
- pair_t * pair;
- int ref_err = req( FD_VINYL_REQ_TYPE_ACQUIRE, flags, val_max, src_key, &pair, NULL );
- val_gaddr[0] = val_max;
- fd_vinyl_rq_send( rq, req_id, link_id, FD_VINYL_REQ_TYPE_ACQUIRE, flags, 1UL,
- src_key_gaddr, val_gaddr_gaddr, err_gaddr, oob ); WAIT;
- if( ref_err==FD_VINYL_ERR_FULL ) {
- FD_TEST( comp->err ==FD_VINYL_ERR_FULL ); FD_TEST( comp->batch_cnt==(ushort)1 );
- FD_TEST( comp->fail_cnt ==(ushort)0 ); FD_TEST( comp->quota_rem==(ushort)ref.quota_rem );
- break;
- }
- FD_TEST( comp->err ==FD_VINYL_SUCCESS ); FD_TEST( comp->batch_cnt==(ushort)1 );
- FD_TEST( comp->fail_cnt ==(ushort)!!ref_err ); FD_TEST( comp->quota_rem==(ushort)ref.quota_rem );
- FD_TEST( err[0]==(schar)ref_err );
-
- if( !ref_err ) {
- acq_gaddr[ src_idx ] = val_gaddr[0];
-
- void * val = fd_wksp_laddr_fast( wksp, val_gaddr[0] );
- fd_vinyl_info_t * info = fd_vinyl_data_info( val );
- ulong val_sz = (ulong)info->val_sz;
- ulong val_max = fd_vinyl_data_val_max( val );
-
- FD_TEST( val_max==pair->val_max );
-
- FD_TEST( !memcmp( info, pair->info, sizeof(fd_vinyl_info_t) ) );
-
- if( val_sz ) FD_TEST( !memcmp( val, pair->val, val_sz ) );
-
- /* FIXME: TEST [VAL_SZ,VAL_MAX) ZPAD? */
-
- if( fd_vinyl_req_flag_modify( flags ) ) {
- acq [ src_idx ] = -1L;
- acq_modified[ src_idx ] = fd_vinyl_req_flag_ignore( flags );
- if( do_mod ) {
- val_sz = fd_rng_ulong_roll( rng, val_max + 1UL );
- memset( info, pat, sizeof(fd_vinyl_info_t) ); memset( pair->info, pat, sizeof(fd_vinyl_info_t) );
- memset( val, pat, val_sz ); memset( pair->val, pat, val_sz );
- info->val_sz = (uint)val_sz; pair->info->val_sz = (uint)val_sz;
- acq_modified[ src_idx ] |= 1;
- }
- } else {
- FD_TEST( !memcmp( info, pair->info, sizeof(fd_vinyl_info_t) ) );
- FD_TEST( !memcmp( val, pair->val, val_sz ) );
- acq [ src_idx ]++;
- acq_modified[ src_idx ] = 0;
- }
- }
- break;
- }
-
- /* release tests */
-
- case 9: /* release with unmappable key */
- fd_vinyl_rq_send( rq, req_id, link_id, FD_VINYL_REQ_TYPE_RELEASE, flags | FD_VINYL_REQ_FLAG_BY_KEY, 1UL,
- 0UL, val_gaddr_gaddr, err_gaddr, oob ); WAIT;
- FD_TEST( comp->err ==FD_VINYL_ERR_INVAL ); FD_TEST( comp->batch_cnt==(ushort)1 );
- FD_TEST( comp->fail_cnt ==(ushort)0 ); FD_TEST( comp->quota_rem==(ushort)ref.quota_rem );
- break;
-
- case 10: /* release with unmappable val */
- fd_vinyl_rq_send( rq, req_id, link_id, FD_VINYL_REQ_TYPE_RELEASE, flags & ~FD_VINYL_REQ_FLAG_BY_KEY, 1UL,
- src_key_gaddr, 0UL, err_gaddr, oob ); WAIT;
- FD_TEST( comp->err ==FD_VINYL_ERR_INVAL ); FD_TEST( comp->batch_cnt==(ushort)1 );
- FD_TEST( comp->fail_cnt ==(ushort)0 ); FD_TEST( comp->quota_rem==(ushort)ref.quota_rem );
- break;
-
- case 11: /* release with unmappable err */
- fd_vinyl_rq_send( rq, req_id, link_id, FD_VINYL_REQ_TYPE_RELEASE, flags, 1UL,
- src_key_gaddr, val_gaddr_gaddr, 0UL, oob ); WAIT;
- FD_TEST( comp->err ==FD_VINYL_ERR_INVAL ); FD_TEST( comp->batch_cnt==(ushort)1 );
- FD_TEST( comp->fail_cnt ==(ushort)0 ); FD_TEST( comp->quota_rem==(ushort)ref.quota_rem );
- break;
-
- case 12: /* release with zero batch */
- fd_vinyl_rq_send( rq, req_id, link_id, FD_VINYL_REQ_TYPE_RELEASE, flags, 0UL,
- 0UL, 0UL, 0UL, oob ); WAIT;
- FD_TEST( comp->err ==FD_VINYL_SUCCESS ); FD_TEST( comp->batch_cnt==(ushort)0 );
- FD_TEST( comp->fail_cnt ==(ushort)0 ); FD_TEST( comp->quota_rem==(ushort)ref.quota_rem );
- break;
-
- case 13: { /* release */
- if( acq[ src_idx ] > 0L ) flags &= ~FD_VINYL_REQ_FLAG_MODIFY; /* can't say modify on an acquire-for-read */
- if( ((acq[ src_idx ] < 0L) & (!fd_vinyl_req_flag_modify( flags )) & acq_modified[ src_idx ]) )
- flags |= FD_VINYL_REQ_FLAG_IGNORE;
- int ref_err = req( FD_VINYL_REQ_TYPE_RELEASE, flags, val_max_bad, src_key, NULL, NULL );
- if( by_key || !acq[ src_idx ] ) {
- fd_vinyl_rq_send( rq, req_id, link_id, FD_VINYL_REQ_TYPE_RELEASE, flags | FD_VINYL_REQ_FLAG_BY_KEY, 1UL,
- src_key_gaddr, 0UL, err_gaddr, oob );
- } else {
- val_gaddr[0] = acq_gaddr[ src_idx ];
- fd_vinyl_rq_send( rq, req_id, link_id, FD_VINYL_REQ_TYPE_RELEASE, flags & ~FD_VINYL_REQ_FLAG_BY_KEY, 1UL,
- 0UL, val_gaddr_gaddr, err_gaddr, oob );
- }
- WAIT;
- FD_TEST( comp->err ==FD_VINYL_SUCCESS ); FD_TEST( comp->batch_cnt==(ushort)1 );
- FD_TEST( comp->fail_cnt ==(ushort)!!ref_err ); FD_TEST( comp->quota_rem==(ushort)ref.quota_rem );
- FD_TEST( err[0]==(schar)ref_err );
- if( !ref_err ) acq[ src_idx ] = (acq[ src_idx ] > 0L) ? (acq[ src_idx ]-1L) : 0L;
- break;
- }
-
- /* erase tests */
-
- case 14: /* erase with unmappable key */
- fd_vinyl_rq_send( rq, req_id, link_id, FD_VINYL_REQ_TYPE_ERASE, flags, 1UL,
- 0UL, 0UL, err_gaddr, oob ); WAIT;
- FD_TEST( comp->err ==FD_VINYL_ERR_INVAL ); FD_TEST( comp->batch_cnt==(ushort)1 );
- FD_TEST( comp->fail_cnt ==(ushort)0 ); FD_TEST( comp->quota_rem==(ushort)ref.quota_rem );
- break;
-
- case 15: /* erase with unmappable err */
- fd_vinyl_rq_send( rq, req_id, link_id, FD_VINYL_REQ_TYPE_ERASE, flags, 1UL,
- src_key_gaddr, 0UL, 0UL, oob ); WAIT;
- FD_TEST( comp->err ==FD_VINYL_ERR_INVAL ); FD_TEST( comp->batch_cnt==(ushort)1 );
- FD_TEST( comp->fail_cnt ==(ushort)0 ); FD_TEST( comp->quota_rem==(ushort)ref.quota_rem );
- break;
-
- case 16: /* erase with zero batch */
- fd_vinyl_rq_send( rq, req_id, link_id, FD_VINYL_REQ_TYPE_ERASE, flags, 0UL,
- src_key_gaddr, 0UL, err_gaddr, oob ); WAIT;
- FD_TEST( comp->err ==FD_VINYL_SUCCESS ); FD_TEST( comp->batch_cnt==(ushort)0 );
- FD_TEST( comp->fail_cnt ==(ushort)0 ); FD_TEST( comp->quota_rem==(ushort)ref.quota_rem );
- break;
-
- case 17: { /* erase */
- int ref_err = req( FD_VINYL_REQ_TYPE_ERASE, flags, val_max_bad, src_key, NULL, NULL );
- fd_vinyl_rq_send( rq, req_id, link_id, FD_VINYL_REQ_TYPE_ERASE, flags, 1UL,
- src_key_gaddr, 1UL, err_gaddr, oob ); WAIT;
- FD_TEST( comp->err ==FD_VINYL_SUCCESS ); FD_TEST( comp->batch_cnt==(ushort)1 );
- FD_TEST( comp->fail_cnt ==(ushort)!!ref_err ); FD_TEST( comp->quota_rem==(ushort)ref.quota_rem );
- FD_TEST( err[0]==(schar)ref_err );
- break;
- }
-
- /* move tests */
-
- case 18: /* move with unmappable src */
- fd_vinyl_rq_send( rq, req_id, link_id, FD_VINYL_REQ_TYPE_MOVE, flags, 1UL,
- 0UL, src_key_gaddr, err_gaddr, oob ); WAIT;
- FD_TEST( comp->err ==FD_VINYL_ERR_INVAL ); FD_TEST( comp->batch_cnt==(ushort)1 );
- FD_TEST( comp->fail_cnt ==(ushort)0 ); FD_TEST( comp->quota_rem==(ushort)ref.quota_rem );
- break;
-
- case 19: /* move with unmappable dst */
- fd_vinyl_rq_send( rq, req_id, link_id, FD_VINYL_REQ_TYPE_MOVE, flags, 1UL,
- src_key_gaddr, 0UL, err_gaddr, oob ); WAIT;
- FD_TEST( comp->err ==FD_VINYL_ERR_INVAL ); FD_TEST( comp->batch_cnt==(ushort)1 );
- FD_TEST( comp->fail_cnt ==(ushort)0 ); FD_TEST( comp->quota_rem==(ushort)ref.quota_rem );
- break;
-
- case 20: /* move with unmappable err */
- fd_vinyl_rq_send( rq, req_id, link_id, FD_VINYL_REQ_TYPE_MOVE, flags, 1UL,
- src_key_gaddr, dst_key_gaddr, 0UL, oob ); WAIT;
- FD_TEST( comp->err ==FD_VINYL_ERR_INVAL ); FD_TEST( comp->batch_cnt==(ushort)1 );
- FD_TEST( comp->fail_cnt ==(ushort)0 ); FD_TEST( comp->quota_rem==(ushort)ref.quota_rem );
- break;
-
- case 21: /* move with zero batch */
- fd_vinyl_rq_send( rq, req_id, link_id, FD_VINYL_REQ_TYPE_MOVE, flags, 0UL,
- src_key_gaddr, dst_key_gaddr, err_gaddr, oob ); WAIT;
- FD_TEST( comp->err ==FD_VINYL_SUCCESS ); FD_TEST( comp->batch_cnt==(ushort)0 );
- FD_TEST( comp->fail_cnt ==(ushort)0 ); FD_TEST( comp->quota_rem==(ushort)ref.quota_rem );
- break;
-
- case 22: { /* move */
- int ref_err = req( FD_VINYL_REQ_TYPE_MOVE, flags, val_max_bad, src_key, (pair_t **)dst_key, NULL );
- fd_vinyl_rq_send( rq, req_id, link_id, FD_VINYL_REQ_TYPE_MOVE, flags, 1UL,
- src_key_gaddr, dst_key_gaddr, err_gaddr, oob ); WAIT;
- FD_TEST( comp->err ==FD_VINYL_SUCCESS ); FD_TEST( comp->batch_cnt==(ushort)1 );
- FD_TEST( comp->fail_cnt ==(ushort)!!ref_err ); FD_TEST( comp->quota_rem==(ushort)ref.quota_rem );
- FD_TEST( err[0]==(schar)ref_err );
- break;
- }
-
- /* fetch tests (these are logical no-op / hints and don't generate completions) */
-
- case 23: /* fetch with unmappable key */
- fd_vinyl_rq_send( rq, req_id, link_id, FD_VINYL_REQ_TYPE_FETCH, flags, 1UL, 0UL, 0UL, 0UL, oob );
- break;
-
- case 24: /* fetch with zero batch cnt */
- fd_vinyl_rq_send( rq, req_id, link_id, FD_VINYL_REQ_TYPE_FETCH, flags, 0UL, 0UL, 0UL, 0UL, oob );
- break;
-
- case 25: /* fetch */
- FD_TEST( !req( FD_VINYL_REQ_TYPE_FETCH, 0UL, 0UL, src_key, NULL, NULL ) );
- fd_vinyl_rq_send( rq, req_id, link_id, FD_VINYL_REQ_TYPE_FETCH, flags, 1UL, src_key_gaddr, 0UL, 0UL, oob );
- break;
-
- /* flush tests (these are logical no-ops / hints and don't generate completions) */
-
- case 26: /* flush with unmappable key */
- fd_vinyl_rq_send( rq, req_id, link_id, FD_VINYL_REQ_TYPE_FLUSH, flags, 1UL, 0UL, 0UL, 0UL, oob );
- break;
-
- case 27: /* flush with zero batch cnt */
- fd_vinyl_rq_send( rq, req_id, link_id, FD_VINYL_REQ_TYPE_FLUSH, flags, 0UL, 0UL, 0UL, 0UL, oob );
- break;
-
- case 28: /* flush */
- FD_TEST( !req( FD_VINYL_REQ_TYPE_FLUSH, 0UL, 0UL, src_key, NULL, NULL ) );
- fd_vinyl_rq_send( rq, req_id, link_id, FD_VINYL_REQ_TYPE_FLUSH, flags, 1UL, src_key_gaddr, 0UL, 0UL, oob );
- break;
-
- /* try tests */
-
- case 29: /* try with unmappable key */
- fd_vinyl_rq_send( rq, req_id, link_id, FD_VINYL_REQ_TYPE_TRY, flags, 1UL,
- 0UL, try_gaddr_gaddr, err_gaddr, oob ); WAIT;
- FD_TEST( comp->err ==FD_VINYL_ERR_INVAL ); FD_TEST( comp->batch_cnt==(ushort)1 );
- FD_TEST( comp->fail_cnt ==(ushort)0 ); FD_TEST( comp->quota_rem==(ushort)ref.quota_rem );
- break;
-
- case 30: /* try with unmappable try */
- fd_vinyl_rq_send( rq, req_id, link_id, FD_VINYL_REQ_TYPE_TRY, flags, 1UL,
- src_key_gaddr, 0UL, err_gaddr, oob ); WAIT;
- FD_TEST( comp->err ==FD_VINYL_ERR_INVAL ); FD_TEST( comp->batch_cnt==(ushort)1 );
- FD_TEST( comp->fail_cnt ==(ushort)0 ); FD_TEST( comp->quota_rem==(ushort)ref.quota_rem );
- break;
-
- case 31: /* try with unmappable err */
- fd_vinyl_rq_send( rq, req_id, link_id, FD_VINYL_REQ_TYPE_TRY, flags, 1UL,
- src_key_gaddr, try_gaddr_gaddr, 0UL, oob ); WAIT;
- FD_TEST( comp->err ==FD_VINYL_ERR_INVAL ); FD_TEST( comp->batch_cnt==(ushort)1 );
- FD_TEST( comp->fail_cnt ==(ushort)0 ); FD_TEST( comp->quota_rem==(ushort)ref.quota_rem );
- break;
-
- case 32: /* try with zero batch */
- fd_vinyl_rq_send( rq, req_id, link_id, FD_VINYL_REQ_TYPE_TRY, flags, 0UL,
- src_key_gaddr, try_gaddr_gaddr, err_gaddr, oob ); WAIT;
- FD_TEST( comp->err ==FD_VINYL_SUCCESS ); FD_TEST( comp->batch_cnt==(ushort)0 );
- FD_TEST( comp->fail_cnt ==(ushort)0 ); FD_TEST( comp->quota_rem==(ushort)ref.quota_rem );
- break;
-
- case 33: { /* try */
- int ref_err = req( FD_VINYL_REQ_TYPE_TRY, flags, val_max_bad, src_key, &try_pair[ src_idx ], &try_ver[ src_idx ] );
- fd_vinyl_rq_send( rq, req_id, link_id, FD_VINYL_REQ_TYPE_TRY, flags, 1UL,
- src_key_gaddr, try_gaddr_gaddr, err_gaddr, oob ); WAIT;
- FD_TEST( comp->err ==FD_VINYL_SUCCESS ); FD_TEST( comp->batch_cnt==(ushort)1 );
- FD_TEST( comp->fail_cnt ==(ushort)!!ref_err ); FD_TEST( comp->quota_rem==(ushort)ref.quota_rem );
- FD_TEST( err[0]==(schar)ref_err );
- if( !ref_err ) try_live[ src_idx ] = 1;
- break;
- }
-
- /* test tests */
-
- case 34: /* test with unmappable try */
- fd_vinyl_rq_send( rq, req_id, link_id, FD_VINYL_REQ_TYPE_TEST, flags, 1UL,
- 0UL, 0UL, err_gaddr, oob ); WAIT;
- FD_TEST( comp->err ==FD_VINYL_ERR_INVAL ); FD_TEST( comp->batch_cnt==(ushort)1 );
- FD_TEST( comp->fail_cnt ==(ushort)0 ); FD_TEST( comp->quota_rem==(ushort)ref.quota_rem );
- break;
-
- case 35: /* test with unmappable err */
- fd_vinyl_rq_send( rq, req_id, link_id, FD_VINYL_REQ_TYPE_TEST, flags, 1UL,
- 0UL, try_gaddr_gaddr, 0UL, oob ); WAIT;
- FD_TEST( comp->err ==FD_VINYL_ERR_INVAL ); FD_TEST( comp->batch_cnt==(ushort)1 );
- FD_TEST( comp->fail_cnt ==(ushort)0 ); FD_TEST( comp->quota_rem==(ushort)ref.quota_rem );
- break;
-
- case 36: /* test with zero batch */
- fd_vinyl_rq_send( rq, req_id, link_id, FD_VINYL_REQ_TYPE_TEST, flags, 0UL,
- 0UL, try_gaddr_gaddr, err_gaddr, oob ); WAIT;
- FD_TEST( comp->err ==FD_VINYL_SUCCESS ); FD_TEST( comp->batch_cnt==(ushort)0 );
- FD_TEST( comp->fail_cnt ==(ushort)0 ); FD_TEST( comp->quota_rem==(ushort)ref.quota_rem );
- break;
-
- case 37: { /* test */
- if( !try_live[ src_idx ] ) break;
- void * try_val = fd_wksp_laddr_fast( wksp, try_gaddr[0] );
- fd_vinyl_info_t * try_info = fd_vinyl_data_info( try_val );
- ulong try_val_sz = fd_ulong_min( (ulong)try_info->val_sz, FD_VINYL_VAL_MAX );
- int try_cmp = (!memcmp( try_info, try_pair[ src_idx ]->info, sizeof(fd_vinyl_info_t) )) &&
- (!memcmp( try_val, try_pair[ src_idx ]->val, try_val_sz ));
- int ref_err = req( FD_VINYL_REQ_TYPE_TEST, flags, val_max_bad, NULL, &try_pair[ src_idx ], &try_ver[ src_idx ] );
- fd_vinyl_rq_send( rq, req_id, link_id, FD_VINYL_REQ_TYPE_TEST, flags, 1UL,
- 0UL, try_gaddr_gaddr, err_gaddr, oob ); WAIT;
- try_live[ src_idx ] = 0;
- FD_TEST( comp->err ==FD_VINYL_SUCCESS ); FD_TEST( comp->batch_cnt==(ushort)1 );
- /**/ FD_TEST( comp->quota_rem==(ushort)ref.quota_rem );
- /* Note that it is possible for a try to work in the ref version
- and fail in the full version (e.g. cache line flushing, val
- resizing in the background, etc). */
- if( ((!ref_err) & (!!err[0])) ) break;
- FD_TEST( comp->fail_cnt ==(ushort)!!ref_err );
- FD_TEST( (!!err[0]) | try_cmp );
- break;
- }
-
- case 38: { /* sync */
- FD_TEST( !fd_vinyl_sync( cnc ) );
- break;
- }
-
- case 39: { /* randomly toggle data compression on and off */
- int new_style = (r & 1UL) ? (ulong)FD_VINYL_BSTREAM_CTL_STYLE_RAW : (ulong)FD_VINYL_BSTREAM_CTL_STYLE_LZ4;
- FD_TEST( !fd_vinyl_set( cnc, FD_VINYL_OPT_STYLE, (ulong)new_style, NULL ) );
- break;
- }
-
- default: break;
- }
- }
-
- /* Clean up */
-
- for( ulong src_idx=0UL; src_idxerr ==FD_VINYL_SUCCESS ); FD_TEST( comp->batch_cnt==(ushort)1 );
- FD_TEST( comp->fail_cnt ==(ushort)!!ref_err ); FD_TEST( comp->quota_rem==(ushort)ref.quota_rem );
- FD_TEST( err[0]==(schar)ref_err );
- }
-
- }
-
- fd_rng_delete( fd_rng_leave( rng ) );
-}
-
-int
-main( int argc,
- char ** argv ) {
- fd_boot( &argc, &argv );
-
- if( FD_UNLIKELY( fd_tile_cnt() < 2UL ) ) FD_LOG_ERR(( "This test requires at least 2 tiles" ));
-
- char const * _wksp = fd_env_strip_cmdline_cstr ( &argc, &argv, "--wksp", NULL, NULL );
- char const * _page_sz = fd_env_strip_cmdline_cstr ( &argc, &argv, "--page-sz", NULL, "gigantic" );
- ulong page_cnt = fd_env_strip_cmdline_ulong( &argc, &argv, "--page-cnt", NULL, 8UL );
- ulong near_cpu = fd_env_strip_cmdline_ulong( &argc, &argv, "--near-cpu", NULL, fd_log_cpu_id() );
- ulong tag = fd_env_strip_cmdline_ulong( &argc, &argv, "--tag", NULL, 1UL );
-
- ulong spad_max = fd_env_strip_cmdline_ulong( &argc, &argv, "--spad-max", NULL, fd_vinyl_io_spad_est() );
- ulong dev_sz = fd_env_strip_cmdline_ulong( &argc, &argv, "--dev-sz", NULL, 1UL << 30 );
- ulong io_seed = fd_env_strip_cmdline_ulong( &argc, &argv, "--io-seed", NULL, 1234UL );
-
- ulong line_cnt = fd_env_strip_cmdline_ulong( &argc, &argv, "--line-cnt", NULL, 7UL );
-
- ulong ele_max = fd_env_strip_cmdline_ulong( &argc, &argv, "--ele-max", NULL, 8UL );
- ulong lock_cnt = fd_env_strip_cmdline_ulong( &argc, &argv, "--lock_cnt", NULL, 8UL );
- ulong probe_max = ele_max;
- ulong seed = fd_env_strip_cmdline_ulong( &argc, &argv, "--seed", NULL, 5678UL );
-
- ulong obj_sz = fd_env_strip_cmdline_ulong( &argc, &argv, "--obj-sz", NULL, 6UL << 30 );
-
- ulong async_min = fd_env_strip_cmdline_ulong( &argc, &argv, "--async-min", NULL, 5UL );
- ulong async_max = fd_env_strip_cmdline_ulong( &argc, &argv, "--async-max", NULL, 2UL*async_min );
- ulong part_thresh = fd_env_strip_cmdline_ulong( &argc, &argv, "--part-thresh", NULL, 64UL << 20 );
- ulong gc_thresh = fd_env_strip_cmdline_ulong( &argc, &argv, "--gc-thresh", NULL, 128UL << 20 );
- int gc_eager = fd_env_strip_cmdline_int ( &argc, &argv, "--gc-eager", NULL, 2 );
- char const * _style = fd_env_strip_cmdline_cstr ( &argc, &argv, "--style", NULL, "lz4" );
- int level = fd_env_strip_cmdline_int ( &argc, &argv, "--level", NULL, 0 );
-
- ulong rq_max = fd_env_strip_cmdline_ulong( &argc, &argv, "--rq-max", NULL, 32UL );
- ulong cq_max = fd_env_strip_cmdline_ulong( &argc, &argv, "--cq-max", NULL, 32UL );
- ulong link_id = fd_env_strip_cmdline_ulong( &argc, &argv, "--link-id", NULL, 2345UL );
- ulong burst_max = fd_env_strip_cmdline_ulong( &argc, &argv, "--burst-max", NULL, 1UL );
- ulong quota_max = fd_env_strip_cmdline_ulong( &argc, &argv, "--quota-max", NULL, 2UL );
- ulong scratch_sz = fd_env_strip_cmdline_ulong( &argc, &argv, "--scratch-sz", NULL, 4096UL );
-
- ulong iter_max = fd_env_strip_cmdline_ulong( &argc, &argv, "--iter-max", NULL, (ulong)1e7 );
-
- int style = fd_cstr_to_vinyl_bstream_ctl_style( _style );
-
- fd_wksp_t * wksp;
- if( _wksp ) {
- FD_LOG_NOTICE(( "Attaching to --wksp %s", _wksp ));
- wksp = fd_wksp_attach( _wksp );
- } else {
- FD_LOG_NOTICE(( "--wksp not specified, using an anonymous local workspace (--page-sz %s --page-cnt %lu --near-cpu %lu)",
- _page_sz, page_cnt, near_cpu ));
- wksp = fd_wksp_new_anonymous( fd_cstr_to_shmem_page_sz( _page_sz ), page_cnt, near_cpu, "wksp", 0UL );
- }
- FD_TEST( wksp );
-
- FD_LOG_NOTICE(( "Creating vinyl tile" ));
-
- ulong io_footprint = fd_vinyl_io_mm_footprint( spad_max ); FD_TEST( io_footprint );
- ulong dev_footprint = fd_ulong_align_dn( dev_sz, FD_VINYL_BSTREAM_BLOCK_SZ ); FD_TEST( dev_footprint );
- ulong vinyl_footprint = fd_vinyl_footprint(); FD_TEST( vinyl_footprint );
- ulong cnc_footprint = fd_cnc_footprint( FD_VINYL_CNC_APP_SZ ); FD_TEST( cnc_footprint );
- ulong meta_footprint = fd_vinyl_meta_footprint( ele_max, lock_cnt, probe_max ); FD_TEST( meta_footprint );
- ulong line_footprint = sizeof(fd_vinyl_line_t) * line_cnt; FD_TEST( line_footprint );
- ulong ele_footprint = sizeof(fd_vinyl_meta_ele_t) * ele_max; FD_TEST( ele_footprint );
- ulong obj_footprint = fd_ulong_align_dn( obj_sz, alignof(fd_vinyl_data_obj_t) ); FD_TEST( obj_footprint );
- ulong rq_footprint = fd_vinyl_rq_footprint( rq_max ); FD_TEST( rq_footprint );
- ulong cq_footprint = fd_vinyl_cq_footprint( cq_max ); FD_TEST( cq_footprint );
-
- void * _io = fd_wksp_alloc_laddr( wksp, fd_vinyl_io_mm_align(), io_footprint, tag ); FD_TEST( _io );
- void * _dev = fd_wksp_alloc_laddr( wksp, FD_VINYL_BSTREAM_BLOCK_SZ, dev_footprint, tag ); FD_TEST( _dev );
- void * _vinyl = fd_wksp_alloc_laddr( wksp, fd_vinyl_align(), vinyl_footprint, tag ); FD_TEST( _vinyl );
- void * _cnc = fd_wksp_alloc_laddr( wksp, fd_cnc_align(), cnc_footprint, tag ); FD_TEST( _cnc );
- void * _meta = fd_wksp_alloc_laddr( wksp, fd_vinyl_meta_align(), meta_footprint, tag ); FD_TEST( _meta );
- void * _line = fd_wksp_alloc_laddr( wksp, alignof(fd_vinyl_line_t), line_footprint, tag ); FD_TEST( _line );
- void * _ele = fd_wksp_alloc_laddr( wksp, alignof(fd_vinyl_meta_ele_t), ele_footprint, tag ); FD_TEST( _ele );
- void * _obj = fd_wksp_alloc_laddr( wksp, alignof(fd_vinyl_data_obj_t), obj_footprint, tag ); FD_TEST( _obj );
- void * _rq = fd_wksp_alloc_laddr( wksp, fd_vinyl_rq_align(), rq_footprint, tag ); FD_TEST( _rq );
- void * _cq = fd_wksp_alloc_laddr( wksp, fd_vinyl_cq_align(), cq_footprint, tag ); FD_TEST( _cq );
- void * _scratch = fd_wksp_alloc_laddr( wksp, 4096UL, scratch_sz, tag ); FD_TEST( _scratch );
-
- fd_vinyl_io_t * io = fd_vinyl_io_mm_init( _io, spad_max, _dev, dev_footprint, 1, "test", 5UL, io_seed ); FD_TEST( io );
-
- fd_tpool_t * tpool = NULL;
-
- ulong thread_cnt = fd_tile_cnt();
-
- if( FD_LIKELY( thread_cnt>1UL ) ) {
- FD_LOG_NOTICE(( "Creating temporary tpool from all %lu tiles for thread parallel init", thread_cnt ));
-
- static uchar _tpool[ FD_TPOOL_FOOTPRINT( FD_TILE_MAX ) ] __attribute__((aligned(FD_TPOOL_ALIGN)));
-
- tpool = fd_tpool_init( _tpool, thread_cnt, 0UL ); /* logs details */
- if( FD_UNLIKELY( !tpool ) ) FD_LOG_ERR(( "fd_tpool_init failed" ));
-
- for( ulong thread_idx=1UL; thread_idx