|
17 | 17 | #include "dynamic_dispatch.h" |
18 | 18 | #include "types.cuh" |
19 | 19 |
|
// Number of output elements each thread block is responsible for.
// Used to derive per-block start/end offsets (see dynamic_dispatch_impl)
// and to size the per-block shared-memory staging buffer.
constexpr uint32_t ELEMENTS_PER_BLOCK = 2048;
21 | 21 |
|
22 | 22 | template <typename T> |
23 | 23 | __device__ __forceinline__ void bitunpack_lane_to_smem(const T *__restrict packed_chunk, T *__restrict smem, |
@@ -45,16 +45,23 @@ __device__ __forceinline__ void dynamic_source_op(const T *__restrict input, T * |
45 | 45 | uint64_t chunk_start, uint32_t chunk_len, |
46 | 46 | const struct SourceOp &source_op) { |
47 | 47 | constexpr uint32_t T_BITS = sizeof(T) * 8; |
48 | | - constexpr uint32_t FL_LANES = FL_CHUNK_SIZE / T_BITS; |
| 48 | + constexpr uint32_t FL_LANES = ELEMENTS_PER_BLOCK / T_BITS; |
49 | 49 |
|
50 | 50 | switch (source_op.op_code) { |
51 | 51 | case SourceOp::BITUNPACK: { |
| 52 | + constexpr uint32_t ELEMENTS_PER_FL_BLOCK = 1024; |
| 53 | + constexpr uint32_t LANES_PER_FL_BLOCK = ELEMENTS_PER_FL_BLOCK / T_BITS; |
52 | 54 | const uint32_t bit_width = source_op.params.bitunpack.bit_width; |
53 | | - const uint32_t packed_words_per_chunk = FL_LANES * bit_width; |
54 | | - const uint64_t chunk_idx = chunk_start / FL_CHUNK_SIZE; |
55 | | - const T *packed_chunk = input + chunk_idx * packed_words_per_chunk; |
56 | | - for (uint32_t lane = threadIdx.x; lane < FL_LANES; lane += blockDim.x) { |
57 | | - bitunpack_lane_to_smem<T>(packed_chunk, smem, lane, bit_width); |
| 55 | + const uint32_t packed_words_per_fl_block = LANES_PER_FL_BLOCK * bit_width; |
| 56 | + const uint64_t first_fl_block = chunk_start / ELEMENTS_PER_FL_BLOCK; |
| 57 | + |
| 58 | + #pragma unroll |
| 59 | + for (uint32_t blk = 0; blk < ELEMENTS_PER_BLOCK / ELEMENTS_PER_FL_BLOCK; ++blk) { |
| 60 | + const T *packed_fl = input + (first_fl_block + blk) * packed_words_per_fl_block; |
| 61 | + T *smem_fl = smem + blk * ELEMENTS_PER_FL_BLOCK; |
| 62 | + for (uint32_t lane = threadIdx.x; lane < LANES_PER_FL_BLOCK; lane += blockDim.x) { |
| 63 | + bitunpack_lane_to_smem<T>(packed_fl, smem_fl, lane, bit_width); |
| 64 | + } |
58 | 65 | } |
59 | 66 | break; |
60 | 67 | } |
@@ -82,66 +89,61 @@ __device__ __forceinline__ T dynamic_scalar_op(T value, const struct ScalarOp &o |
// Per-block driver for the dynamic-dispatch pipeline.
//
// Each thread block processes one ELEMENTS_PER_BLOCK-sized slice of the
// array: it materializes the slice into shared memory via the plan's source
// op, applies the plan's chain of scalar ops in registers, and writes the
// results to `output`.
//
// Expected launch layout: 1-D grid with at least
// ceil(array_len / ELEMENTS_PER_BLOCK) blocks; 1-D thread blocks (only
// threadIdx.x / blockDim.x are used).
// Shared memory: one DynamicDispatchPlan plus ELEMENTS_PER_BLOCK values of T
// (statically declared below).
//
// input     - packed/encoded source data consumed by dynamic_source_op.
// output    - destination array of decoded, transformed values.
// array_len - total number of logical elements across all blocks.
// plan      - host-prepared plan; copied into shared memory once per block.
template <typename T>
__device__ void dynamic_dispatch_impl(const T *__restrict input, T *__restrict output, uint64_t array_len,
                                      const struct DynamicDispatchPlan *__restrict plan) {
  // Each thread handles VALUES_PER_LOOP elements per tile (32 bytes of T).
  constexpr uint32_t VALUES_PER_LOOP = 32 / sizeof(T);

  __shared__ struct DynamicDispatchPlan smem_plan;
  __shared__ T smem_values[ELEMENTS_PER_BLOCK];

  // Cache the plan in shared memory.
  // Thread 0 copies; the barrier below publishes it to the whole block.
  if (threadIdx.x == 0) smem_plan = *plan;
  __syncthreads();

  // This block's half-open element range [block_start, block_end).
  // blockIdx.x is widened to 64 bits before the multiply to avoid overflow
  // on large arrays.
  const uint64_t block_start = static_cast<uint64_t>(blockIdx.x) * ELEMENTS_PER_BLOCK;
  const uint64_t block_end = min(block_start + ELEMENTS_PER_BLOCK, array_len);
  const uint32_t block_len = static_cast<uint32_t>(block_end - block_start);

  // Decode this block's slice into shared memory, then barrier before any
  // thread reads values another thread wrote.
  // NOTE(review): the BITUNPACK path of dynamic_source_op unpacks the full
  // ELEMENTS_PER_BLOCK regardless of block_len — confirm the packed input is
  // padded so the tail block's extra reads stay in bounds.
  dynamic_source_op<T>(input, smem_values, block_start, block_len, smem_plan.source);
  __syncthreads();

  // Full tiles: each tile covers blockDim.x * VALUES_PER_LOOP elements.
  const uint32_t tile_size = blockDim.x * VALUES_PER_LOOP;
  const uint32_t num_full_tiles = block_len / tile_size;

  for (uint32_t tile = 0; tile < num_full_tiles; ++tile) {
    const uint32_t tile_base = tile * tile_size;

    // Operate on values in registers. This is faster than a coalesced
    // one-element-per-thread loop as it enables better instruction-level
    // parallelism.
    T values[VALUES_PER_LOOP];

    // Strided load: lane `threadIdx.x` of each blockDim.x-wide group, so
    // consecutive threads touch consecutive shared-memory addresses.
    #pragma unroll
    for (uint32_t idx = 0; idx < VALUES_PER_LOOP; ++idx) {
      values[idx] = smem_values[tile_base + idx * blockDim.x + threadIdx.x];
    }

    // Apply the scalar-op chain in plan order to every register value.
    for (uint8_t op_idx = 0; op_idx < smem_plan.num_scalar_ops; ++op_idx) {
      const struct ScalarOp &scalar_op = smem_plan.scalar_ops[op_idx];

      #pragma unroll
      for (uint32_t idx = 0; idx < VALUES_PER_LOOP; ++idx) {
        values[idx] = dynamic_scalar_op(values[idx], scalar_op);
      }
    }

    // Store with the same stride pattern used for the load (coalesced).
    #pragma unroll
    for (uint32_t idx = 0; idx < VALUES_PER_LOOP; ++idx) {
      output[block_start + tile_base + idx * blockDim.x + threadIdx.x] = values[idx];
    }
  }

  // Handle remaining elements that were not part of a full tile.
  // One element per thread per iteration; the bounds check makes this safe
  // for the final, possibly short, block.
  const uint32_t rem_start = num_full_tiles * tile_size;
  for (uint32_t elem_idx = rem_start + threadIdx.x; elem_idx < block_len; elem_idx += blockDim.x) {
    T val = smem_values[elem_idx];
    for (uint8_t op_idx = 0; op_idx < smem_plan.num_scalar_ops; ++op_idx) {
      val = dynamic_scalar_op(val, smem_plan.scalar_ops[op_idx]);
    }
    output[block_start + elem_idx] = val;
  }
}
147 | 149 |
|
|
0 commit comments