Skip to content

Commit 91b4c75

Browse files
authored
perf[gpu]: reduce register pressure in dyn dispatch (#7489)
We decrease the number of values per tile in the output stage each GPU thread uses, as well as limit the register count to 32 in the launch bounds. This brings the dynamic dispatch kernel into a reasonably close range compared to the standalone kernel for now. Type | Dynamic dispatch | Standalone | Ratio | |---|---|---|---| | u8 bw6 | 172 µs | 79 µs | 2.17× | | u16 bw6 | 140 µs | 88 µs | 1.59× | | u32 bw6 | 184 µs | 148 µs | 1.24× | | u64 bw8 | 303 µs | 276 µs | 1.10×| Signed-off-by: Alexander Droste <alexander.droste@protonmail.com>
1 parent 1169d84 commit 91b4c75

File tree

1 file changed

+6
-4
lines changed

1 file changed

+6
-4
lines changed

vortex-cuda/kernels/src/dynamic_dispatch.cu

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -279,7 +279,8 @@ __device__ void execute_output_stage(T *__restrict output,
279279
char *__restrict smem,
280280
uint64_t block_start,
281281
uint32_t block_len) {
282-
constexpr uint32_t VALUES_PER_TILE = 32 / sizeof(T);
282+
// Cap at 4 values per thread per tile to minimise register pressure.
283+
constexpr uint32_t VALUES_PER_TILE = (32 / sizeof(T)) < 4 ? (32 / sizeof(T)) : 4;
283284
const uint32_t tile_size = blockDim.x * VALUES_PER_TILE;
284285
const auto &src = stage.source;
285286
const void *raw_input = reinterpret_cast<const void *>(stage.input_ptr);
@@ -472,9 +473,10 @@ dynamic_dispatch(T *__restrict output, uint64_t array_len, const uint8_t *__rest
472473
// matters is load_element(), which dispatches on the per-op PTypeTag to
473474
// sign-extend or zero-extend when widening a narrow source to T.
474475
#define GENERATE_KERNEL(suffix, Type) \
475-
extern "C" __global__ void dynamic_dispatch_##suffix(Type *__restrict output, \
476-
uint64_t array_len, \
477-
const uint8_t *__restrict packed_plan) { \
476+
extern "C" __global__ void __launch_bounds__(BLOCK_SIZE, 32) \
477+
dynamic_dispatch_##suffix(Type *__restrict output, \
478+
uint64_t array_len, \
479+
const uint8_t *__restrict packed_plan) { \
478480
dynamic_dispatch<Type>(output, array_len, packed_plan); \
479481
}
480482

0 commit comments

Comments
 (0)