Skip to content

Commit 53f4d08

Browse files
committed
feat: cuda alp dyn dispatch
1 parent 45030ab commit 53f4d08

File tree

4 files changed

+216
-16
lines changed

4 files changed

+216
-16
lines changed

vortex-cuda/benches/dynamic_dispatch_cuda.rs

Lines changed: 91 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ use vortex_cuda::CudaSession;
2727
use vortex_cuda::bitpacked_cuda_kernel;
2828
use vortex_cuda::bitpacked_cuda_launch_config;
2929
use vortex_cuda::dynamic_dispatch_op::DynamicOp;
30+
use vortex_cuda::dynamic_dispatch_op::DynamicOpCode_ALP;
3031
use vortex_cuda::dynamic_dispatch_op::DynamicOpCode_BITUNPACK;
3132
use vortex_cuda::dynamic_dispatch_op::DynamicOpCode_FOR;
3233
use vortex_cuda_macros::cuda_available;
@@ -45,10 +46,18 @@ const REFERENCE_VALUE: u32 = 100_000;
4546
/// Bit width used for the bitpack+FoR benchmarks.
4647
const BIT_WIDTH: u8 = 6;
4748

49+
/// ALP decode factors for the ALP benchmarks.
50+
const ALP_F: f32 = 10.0;
51+
const ALP_E: f32 = 1.0;
52+
4853
// ---------------------------------------------------------------------------
4954
// Helpers
5055
// ---------------------------------------------------------------------------
5156

57+
fn pack_alp_f32_param(f: f32, e: f32) -> u64 {
58+
(e.to_bits() as u64) << 32 | f.to_bits() as u64
59+
}
60+
5261
/// Helper: launch a single FoR kernel on a device buffer (in-place).
5362
fn launch_for_kernel(
5463
cuda_ctx: &mut CudaExecutionCtx,
@@ -269,11 +278,12 @@ fn bench_bitunpack_for_separate(c: &mut Criterion) {
269278
// Benchmark: BitUnpack + FoR — single fused dynamic scalar_decode launch
270279
// ============================================================================
271280

272-
/// Run bitunpack+FoR as a single fused dynamic_dispatch launch, returning GPU time.
273-
fn run_bitunpack_for_fused_timed(
281+
/// Run a fused dynamic_dispatch launch on a bitpacked array, returning GPU time.
282+
fn run_dynamic_dispatch_bitpacked_timed(
274283
cuda_ctx: &mut CudaExecutionCtx,
275284
bitpacked_array: &BitPackedArray,
276285
device_ops: &Arc<cudarc::driver::CudaSlice<DynamicOp>>,
286+
num_ops: u8,
277287
) -> VortexResult<Duration> {
278288
let packed = bitpacked_array.packed().clone();
279289
let len = bitpacked_array.len();
@@ -298,9 +308,6 @@ fn run_bitunpack_for_fused_timed(
298308
let output_buf = CudaDeviceBuffer::new(output_slice);
299309
let output_ptr = output_buf.as_view::<u32>().device_ptr(cuda_ctx.stream()).0;
300310

301-
// ops = [BITUNPACK(bit_width), FOR(reference)]
302-
let num_ops: u8 = 2;
303-
304311
// Ensure all previous works on the stream completed.
305312
cuda_ctx
306313
.stream()
@@ -354,9 +361,84 @@ fn bench_bitunpack_for_dynamic_dispatch(c: &mut Criterion) {
354361
let mut total_time = Duration::ZERO;
355362

356363
for _ in 0..iters {
357-
let kernel_time =
358-
run_bitunpack_for_fused_timed(&mut cuda_ctx, array, &device_ops)
359-
.vortex_expect("bitunpack+for dynamic_dispatch failed");
364+
let kernel_time = run_dynamic_dispatch_bitpacked_timed(
365+
&mut cuda_ctx,
366+
array,
367+
&device_ops,
368+
ops.len() as u8,
369+
)
370+
.vortex_expect("bitunpack+for dynamic_dispatch failed");
371+
total_time += kernel_time;
372+
}
373+
374+
total_time
375+
});
376+
},
377+
);
378+
}
379+
380+
group.finish();
381+
}
382+
383+
// ============================================================================
384+
// Benchmark: BitUnpack + FoR + ALP — single fused dynamic dispatch launch
385+
// ============================================================================
386+
387+
fn bench_bitunpack_for_alp_dynamic_dispatch(c: &mut Criterion) {
388+
let mut group = c.benchmark_group("bitunpack_for_alp");
389+
group.sample_size(10);
390+
391+
// ops = [BITUNPACK(bit_width), FOR(reference), ALP(f, e)]
392+
let ops = vec![
393+
DynamicOp {
394+
op: DynamicOpCode_BITUNPACK,
395+
param: BIT_WIDTH as u64,
396+
},
397+
DynamicOp {
398+
op: DynamicOpCode_FOR,
399+
param: REFERENCE_VALUE as u64,
400+
},
401+
DynamicOp {
402+
op: DynamicOpCode_ALP,
403+
param: pack_alp_f32_param(ALP_F, ALP_E),
404+
},
405+
];
406+
407+
for (len, len_str) in BENCH_ARGS {
408+
group.throughput(Throughput::Bytes((len * size_of::<u32>()) as u64));
409+
410+
let bitpacked = make_bitpacked_array_u32(BIT_WIDTH, *len);
411+
412+
group.bench_with_input(
413+
BenchmarkId::new("dynamic_dispatch_u32", len_str),
414+
&bitpacked,
415+
|b, array| {
416+
let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
417+
.vortex_expect("failed to create execution context");
418+
419+
// Force PTX JIT compilation before any measurement.
420+
cuda_ctx
421+
.load_function("dynamic_dispatch", &["u32"])
422+
.vortex_expect("failed to preload dynamic_dispatch kernel");
423+
424+
let device_ops = Arc::new(
425+
cuda_ctx
426+
.stream()
427+
.clone_htod(ops.as_slice())
428+
.expect("failed to copy ops to device"),
429+
);
430+
431+
b.iter_custom(|iters| {
432+
let mut total_time = Duration::ZERO;
433+
434+
for _ in 0..iters {
435+
let kernel_time = run_dynamic_dispatch_bitpacked_timed(
436+
&mut cuda_ctx,
437+
array,
438+
&device_ops,
439+
ops.len() as u8,
440+
)
441+
.vortex_expect("bitunpack+for+alp dynamic_dispatch failed");
360442
total_time += kernel_time;
361443
}
362444

@@ -372,6 +454,7 @@ fn bench_bitunpack_for_dynamic_dispatch(c: &mut Criterion) {
372454
fn benchmark_nested_decode(c: &mut Criterion) {
373455
bench_bitunpack_for_separate(c);
374456
bench_bitunpack_for_dynamic_dispatch(c);
457+
bench_bitunpack_for_alp_dynamic_dispatch(c);
375458
}
376459

377460
criterion::criterion_group!(benches, benchmark_nested_decode);

vortex-cuda/kernels/src/dynamic_dispatch.cu

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,13 @@ __device__ __forceinline__ T apply_scalar_op(T value, const DynamicOp &op) {
3333
case ZIGZAG: {
3434
return (value >> 1) ^ static_cast<T>(-(value & 1));
3535
}
36-
default:
37-
return value;
36+
case ALP: {
37+
float f_val = __uint_as_float(static_cast<uint32_t>(op.param));
38+
float e_val = __uint_as_float(static_cast<uint32_t>(op.param >> 32));
39+
float result = static_cast<float>(static_cast<int32_t>(value)) * f_val * e_val;
40+
return static_cast<T>(__float_as_uint(result));
41+
}
42+
default: __builtin_unreachable();
3843
}
3944
}
4045

vortex-cuda/kernels/src/dynamic_dispatch.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ enum DynamicOpCode {
1616
FOR,
1717
ZIGZAG,
1818
BITUNPACK,
19+
ALP,
1920
};
2021

2122
// Operation to pass to the dynamic dispatch kernel.

vortex-cuda/src/kernel/encodings/dynamic_dispatch.rs

Lines changed: 117 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -11,23 +11,33 @@ mod tests {
1111
use cudarc::driver::DevicePtr;
1212
use cudarc::driver::LaunchConfig;
1313
use cudarc::driver::PushKernelArg;
14+
use vortex_alp::ALPFloat;
15+
use vortex_alp::Exponents;
16+
use vortex_alp::alp_encode;
17+
use vortex_array::ToCanonical;
1418
use vortex_array::arrays::PrimitiveArray;
1519
use vortex_array::buffer::BufferHandle;
1620
use vortex_array::validity::Validity::NonNullable;
1721
use vortex_buffer::Buffer;
1822
use vortex_error::VortexExpect;
1923
use vortex_error::VortexResult;
2024
use vortex_fastlanes::BitPackedArray;
25+
use vortex_fastlanes::FoRArray;
2126
use vortex_session::VortexSession;
2227

2328
use crate::CudaBufferExt;
2429
use crate::CudaDeviceBuffer;
2530
use crate::CudaExecutionCtx;
2631
use crate::dynamic_dispatch_op::DynamicOp;
32+
use crate::dynamic_dispatch_op::DynamicOpCode_ALP;
2733
use crate::dynamic_dispatch_op::DynamicOpCode_BITUNPACK;
2834
use crate::dynamic_dispatch_op::DynamicOpCode_FOR;
2935
use crate::session::CudaSession;
3036

37+
fn pack_alp_f32_param(f: f32, e: f32) -> u64 {
38+
(e.to_bits() as u64) << 32 | f.to_bits() as u64
39+
}
40+
3141
fn make_bitpacked_array_u32(bit_width: u8, len: usize) -> BitPackedArray {
3242
let max_val = (1u64 << bit_width).saturating_sub(1);
3343
let values: Vec<u32> = (0..len)
@@ -90,6 +100,17 @@ mod tests {
90100
Ok(host_output[..output_len].to_vec())
91101
}
92102

103+
fn run_dynamic_dispatch_f32(
104+
cuda_ctx: &CudaExecutionCtx,
105+
input_ptr: u64,
106+
output_len: usize,
107+
ops: &[DynamicOp],
108+
) -> VortexResult<Vec<f32>> {
109+
let result = run_dynamic_dispatch_u32(cuda_ctx, input_ptr, output_len, ops)?;
110+
// SAFETY: f32 and u32 have identical size and alignment.
111+
Ok(unsafe { std::mem::transmute::<Vec<u32>, Vec<f32>>(result) })
112+
}
113+
93114
fn copy_to_device(
94115
cuda_ctx: &CudaExecutionCtx,
95116
bitpacked: &BitPackedArray,
@@ -132,17 +153,20 @@ mod tests {
132153

133154
#[test]
134155
fn test_for() -> VortexResult<()> {
135-
let reference: u32 = 42;
136156
let len = 5000;
137157

138-
let input: Vec<u32> = (0..len).map(|i| i as u32).collect();
139-
let expected: Vec<u32> = input.iter().map(|v| v + reference).collect();
158+
let original: Vec<u32> = (0..len).map(|i| i as u32 + 42).collect();
159+
let primitive = PrimitiveArray::new(Buffer::from(original.clone()), NonNullable);
160+
161+
let for_array = FoRArray::encode(primitive)?;
162+
let reference = u32::try_from(for_array.reference_scalar())?;
140163

141164
let cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?;
142165

166+
let encoded_prim = for_array.encoded().to_primitive();
143167
let device_input = cuda_ctx
144168
.stream()
145-
.clone_htod(input.as_slice())
169+
.clone_htod(encoded_prim.as_slice::<u32>())
146170
.expect("copy input to device");
147171
let input_ptr = device_input.device_ptr(cuda_ctx.stream()).0;
148172

@@ -151,13 +175,100 @@ mod tests {
151175
param: reference as u64,
152176
}];
153177

178+
// Kernel should reconstruct the original data.
154179
let result = run_dynamic_dispatch_u32(&cuda_ctx, input_ptr, len, &ops)?;
155-
assert_eq!(result, expected);
180+
assert_eq!(result, original);
181+
182+
Ok(())
183+
}
184+
185+
#[test]
186+
fn test_alp() -> VortexResult<()> {
187+
let len = 2050;
188+
189+
// Start from f32 data that ALP-encodes cleanly - no patches.
190+
let exponents = Exponents { e: 2, f: 0 };
191+
let floats: Vec<f32> = (0..len)
192+
.map(|i| <f32 as ALPFloat>::decode_single(i as i32, exponents))
193+
.collect();
194+
let float_prim = PrimitiveArray::new(Buffer::from(floats.clone()), NonNullable);
195+
196+
let alp_array = alp_encode(&float_prim, Some(exponents))?;
197+
assert!(alp_array.patches().is_none());
198+
199+
let f = <f32 as ALPFloat>::F10[alp_array.exponents().f as usize];
200+
let e = <f32 as ALPFloat>::IF10[alp_array.exponents().e as usize];
201+
202+
let cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?;
203+
204+
let encoded_prim = alp_array.encoded().to_primitive();
205+
let device_input = cuda_ctx
206+
.stream()
207+
.clone_htod(encoded_prim.as_slice::<i32>())
208+
.expect("copy input to device");
209+
let input_ptr = device_input.device_ptr(cuda_ctx.stream()).0;
210+
211+
let ops = [DynamicOp {
212+
op: DynamicOpCode_ALP,
213+
param: pack_alp_f32_param(f, e),
214+
}];
215+
216+
let result = run_dynamic_dispatch_f32(&cuda_ctx, input_ptr, len, &ops)?;
217+
assert_eq!(result, floats);
218+
219+
Ok(())
220+
}
221+
222+
#[test]
223+
fn test_alp_for_bitunpack() -> VortexResult<()> {
224+
let len = 2050;
225+
226+
let exponents = Exponents { e: 2, f: 0 };
227+
let floats: Vec<f32> = (0..len)
228+
.map(|i| <f32 as ALPFloat>::decode_single(10 + (i as i32 % 64), exponents))
229+
.collect();
230+
let float_prim = PrimitiveArray::new(Buffer::from(floats.clone()), NonNullable);
231+
232+
// ALP encode f32 → i32 encoded integers + exponents.
233+
let alp_array = alp_encode(&float_prim, Some(exponents))?;
234+
assert!(alp_array.patches().is_none());
235+
236+
// FOR encode the ALP-encoded i32 integers.
237+
let for_array = FoRArray::encode(alp_array.encoded().to_primitive())?;
238+
let reference = i32::try_from(for_array.reference_scalar())? as u32;
239+
240+
// BitPack the FOR-encoded values.
241+
let bit_width: u8 = 6;
242+
let bitpacked = BitPackedArray::encode(for_array.encoded(), bit_width)?;
243+
244+
// Derive ALP decode factors from the actual exponents.
245+
let alp_f = <f32 as ALPFloat>::F10[alp_array.exponents().f as usize];
246+
let alp_e = <f32 as ALPFloat>::IF10[alp_array.exponents().e as usize];
247+
248+
let cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?;
249+
let (input_ptr, _device_input) = copy_to_device(&cuda_ctx, &bitpacked)?;
250+
251+
let ops = [
252+
DynamicOp {
253+
op: DynamicOpCode_BITUNPACK,
254+
param: bit_width as u64,
255+
},
256+
DynamicOp {
257+
op: DynamicOpCode_FOR,
258+
param: reference as u64,
259+
},
260+
DynamicOp {
261+
op: DynamicOpCode_ALP,
262+
param: pack_alp_f32_param(alp_f, alp_e),
263+
},
264+
];
265+
266+
let result = run_dynamic_dispatch_f32(&cuda_ctx, input_ptr, len, &ops)?;
267+
assert_eq!(result, floats);
156268

157269
Ok(())
158270
}
159271

160-
/// 1 bitunpack + 7 FoR
161272
#[test]
162273
fn test_max_ops_bitunpack_7for() -> VortexResult<()> {
163274
let bit_width: u8 = 6;

0 commit comments

Comments (0)