Skip to content

Commit 53f4d08

Browse files
committed
feat: cuda alp dyn dispatch
1 parent 45030ab commit 53f4d08

File tree

4 files changed

+216
-16
lines changed

4 files changed

+216
-16
lines changed

vortex-cuda/benches/dynamic_dispatch_cuda.rs

Lines changed: 91 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ use vortex_cuda::CudaSession;
2727
use vortex_cuda::bitpacked_cuda_kernel;
2828
use vortex_cuda::bitpacked_cuda_launch_config;
2929
use vortex_cuda::dynamic_dispatch_op::DynamicOp;
30+
use vortex_cuda::dynamic_dispatch_op::DynamicOpCode_ALP;
3031
use vortex_cuda::dynamic_dispatch_op::DynamicOpCode_BITUNPACK;
3132
use vortex_cuda::dynamic_dispatch_op::DynamicOpCode_FOR;
3233
use vortex_cuda_macros::cuda_available;
@@ -45,10 +46,18 @@ const REFERENCE_VALUE: u32 = 100_000;
4546
/// Bit width used for the bitpack+FoR benchmarks.
4647
const BIT_WIDTH: u8 = 6;
4748

49+
/// ALP decode factors for the ALP benchmarks.
50+
const ALP_F: f32 = 10.0;
51+
const ALP_E: f32 = 1.0;
52+
4853
// ---------------------------------------------------------------------------
4954
// Helpers
5055
// ---------------------------------------------------------------------------
5156

57+
fn pack_alp_f32_param(f: f32, e: f32) -> u64 {
58+
(e.to_bits() as u64) << 32 | f.to_bits() as u64
59+
}
60+
5261
/// Helper: launch a single FoR kernel on a device buffer (in-place).
5362
fn launch_for_kernel(
5463
cuda_ctx: &mut CudaExecutionCtx,
@@ -269,11 +278,12 @@ fn bench_bitunpack_for_separate(c: &mut Criterion) {
269278
// Benchmark: BitUnpack + FoR — single fused dynamic scalar_decode launch
270279
// ============================================================================
271280

272-
/// Run bitunpack+FoR as a single fused dynamic_dispatch launch, returning GPU time.
273-
fn run_bitunpack_for_fused_timed(
281+
/// Run a fused dynamic_dispatch launch on a bitpacked array, returning GPU time.
282+
fn run_dynamic_dispatch_bitpacked_timed(
274283
cuda_ctx: &mut CudaExecutionCtx,
275284
bitpacked_array: &BitPackedArray,
276285
device_ops: &Arc<cudarc::driver::CudaSlice<DynamicOp>>,
286+
num_ops: u8,
277287
) -> VortexResult<Duration> {
278288
let packed = bitpacked_array.packed().clone();
279289
let len = bitpacked_array.len();
@@ -298,9 +308,6 @@ fn run_bitunpack_for_fused_timed(
298308
let output_buf = CudaDeviceBuffer::new(output_slice);
299309
let output_ptr = output_buf.as_view::<u32>().device_ptr(cuda_ctx.stream()).0;
300310

301-
// ops = [BITUNPACK(bit_width), FOR(reference)]
302-
let num_ops: u8 = 2;
303-
304311
// Ensure all previous works on the stream completed.
305312
cuda_ctx
306313
.stream()
@@ -354,9 +361,84 @@ fn bench_bitunpack_for_dynamic_dispatch(c: &mut Criterion) {
354361
let mut total_time = Duration::ZERO;
355362

356363
for _ in 0..iters {
357-
let kernel_time =
358-
run_bitunpack_for_fused_timed(&mut cuda_ctx, array, &device_ops)
359-
.vortex_expect("bitunpack+for dynamic_dispatch failed");
364+
let kernel_time = run_dynamic_dispatch_bitpacked_timed(
365+
&mut cuda_ctx,
366+
array,
367+
&device_ops,
368+
ops.len() as u8,
369+
)
370+
.vortex_expect("bitunpack+for dynamic_dispatch failed");
371+
total_time += kernel_time;
372+
}
373+
374+
total_time
375+
});
376+
},
377+
);
378+
}
379+
380+
group.finish();
381+
}
382+
383+
// ============================================================================
384+
// Benchmark: BitUnpack + FoR + ALP — single fused dynamic dispatch launch
385+
// ============================================================================
386+
387+
fn bench_bitunpack_for_alp_dynamic_dispatch(c: &mut Criterion) {
388+
let mut group = c.benchmark_group("bitunpack_for_alp");
389+
group.sample_size(10);
390+
391+
// ops = [BITUNPACK(bit_width), FOR(reference), ALP(f, e)]
392+
let ops = vec![
393+
DynamicOp {
394+
op: DynamicOpCode_BITUNPACK,
395+
param: BIT_WIDTH as u64,
396+
},
397+
DynamicOp {
398+
op: DynamicOpCode_FOR,
399+
param: REFERENCE_VALUE as u64,
400+
},
401+
DynamicOp {
402+
op: DynamicOpCode_ALP,
403+
param: pack_alp_f32_param(ALP_F, ALP_E),
404+
},
405+
];
406+
407+
for (len, len_str) in BENCH_ARGS {
408+
group.throughput(Throughput::Bytes((len * size_of::<u32>()) as u64));
409+
410+
let bitpacked = make_bitpacked_array_u32(BIT_WIDTH, *len);
411+
412+
group.bench_with_input(
413+
BenchmarkId::new("dynamic_dispatch_u32", len_str),
414+
&bitpacked,
415+
|b, array| {
416+
let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
417+
.vortex_expect("failed to create execution context");
418+
419+
// Force PTX JIT compilation before any measurement.
420+
cuda_ctx
421+
.load_function("dynamic_dispatch", &["u32"])
422+
.vortex_expect("failed to preload dynamic_dispatch kernel");
423+
424+
let device_ops = Arc::new(
425+
cuda_ctx
426+
.stream()
427+
.clone_htod(ops.as_slice())
428+
.expect("failed to copy ops to device"),
429+
);
430+
431+
b.iter_custom(|iters| {
432+
let mut total_time = Duration::ZERO;
433+
434+
for _ in 0..iters {
435+
let kernel_time = run_dynamic_dispatch_bitpacked_timed(
436+
&mut cuda_ctx,
437+
array,
438+
&device_ops,
439+
ops.len() as u8,
440+
)
441+
.vortex_expect("bitunpack+for+alp dynamic_dispatch failed");
360442
total_time += kernel_time;
361443
}
362444

@@ -372,6 +454,7 @@ fn bench_bitunpack_for_dynamic_dispatch(c: &mut Criterion) {
372454
fn benchmark_nested_decode(c: &mut Criterion) {
373455
bench_bitunpack_for_separate(c);
374456
bench_bitunpack_for_dynamic_dispatch(c);
457+
bench_bitunpack_for_alp_dynamic_dispatch(c);
375458
}
376459

377460
criterion::criterion_group!(benches, benchmark_nested_decode);

vortex-cuda/kernels/src/dynamic_dispatch.cu

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,13 @@ __device__ __forceinline__ T apply_scalar_op(T value, const DynamicOp &op) {
3333
case ZIGZAG: {
3434
return (value >> 1) ^ static_cast<T>(-(value & 1));
3535
}
36-
default:
37-
return value;
36+
case ALP: {
37+
float f_val = __uint_as_float(static_cast<uint32_t>(op.param));
38+
float e_val = __uint_as_float(static_cast<uint32_t>(op.param >> 32));
39+
float result = static_cast<float>(static_cast<int32_t>(value)) * f_val * e_val;
40+
return static_cast<T>(__float_as_uint(result));
41+
}
42+
default: __builtin_unreachable();
3843
}
3944
}
4045

vortex-cuda/kernels/src/dynamic_dispatch.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ enum DynamicOpCode {
1616
FOR,
1717
ZIGZAG,
1818
BITUNPACK,
19+
ALP,
1920
};
2021

2122
// Operation to pass to the dynamic dispatch kernel.

vortex-cuda/src/kernel/encodings/dynamic_dispatch.rs

Lines changed: 117 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -11,23 +11,33 @@ mod tests {
1111
use cudarc::driver::DevicePtr;
1212
use cudarc::driver::LaunchConfig;
1313
use cudarc::driver::PushKernelArg;
14+
use vortex_alp::ALPFloat;
15+
use vortex_alp::Exponents;
16+
use vortex_alp::alp_encode;
17+
use vortex_array::ToCanonical;
1418
use vortex_array::arrays::PrimitiveArray;
1519
use vortex_array::buffer::BufferHandle;
1620
use vortex_array::validity::Validity::NonNullable;
1721
use vortex_buffer::Buffer;
1822
use vortex_error::VortexExpect;
1923
use vortex_error::VortexResult;
2024
use vortex_fastlanes::BitPackedArray;
25+
use vortex_fastlanes::FoRArray;
2126
use vortex_session::VortexSession;
2227

2328
use crate::CudaBufferExt;
2429
use crate::CudaDeviceBuffer;
2530
use crate::CudaExecutionCtx;
2631
use crate::dynamic_dispatch_op::DynamicOp;
32+
use crate::dynamic_dispatch_op::DynamicOpCode_ALP;
2733
use crate::dynamic_dispatch_op::DynamicOpCode_BITUNPACK;
2834
use crate::dynamic_dispatch_op::DynamicOpCode_FOR;
2935
use crate::session::CudaSession;
3036

37+
fn pack_alp_f32_param(f: f32, e: f32) -> u64 {
38+
(e.to_bits() as u64) << 32 | f.to_bits() as u64
39+
}
40+
3141
fn make_bitpacked_array_u32(bit_width: u8, len: usize) -> BitPackedArray {
3242
let max_val = (1u64 << bit_width).saturating_sub(1);
3343
let values: Vec<u32> = (0..len)
@@ -90,6 +100,17 @@ mod tests {
90100
Ok(host_output[..output_len].to_vec())
91101
}
92102

103+
fn run_dynamic_dispatch_f32(
104+
cuda_ctx: &CudaExecutionCtx,
105+
input_ptr: u64,
106+
output_len: usize,
107+
ops: &[DynamicOp],
108+
) -> VortexResult<Vec<f32>> {
109+
let result = run_dynamic_dispatch_u32(cuda_ctx, input_ptr, output_len, ops)?;
110+
// SAFETY: f32 and u32 have identical size and alignment.
111+
Ok(unsafe { std::mem::transmute::<Vec<u32>, Vec<f32>>(result) })
112+
}
113+
93114
fn copy_to_device(
94115
cuda_ctx: &CudaExecutionCtx,
95116
bitpacked: &BitPackedArray,
@@ -132,17 +153,20 @@ mod tests {
132153

133154
#[test]
134155
fn test_for() -> VortexResult<()> {
135-
let reference: u32 = 42;
136156
let len = 5000;
137157

138-
let input: Vec<u32> = (0..len).map(|i| i as u32).collect();
139-
let expected: Vec<u32> = input.iter().map(|v| v + reference).collect();
158+
let original: Vec<u32> = (0..len).map(|i| i as u32 + 42).collect();
159+
let primitive = PrimitiveArray::new(Buffer::from(original.clone()), NonNullable);
160+
161+
let for_array = FoRArray::encode(primitive)?;
162+
let reference = u32::try_from(for_array.reference_scalar())?;
140163

141164
let cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?;
142165

166+
let encoded_prim = for_array.encoded().to_primitive();
143167
let device_input = cuda_ctx
144168
.stream()
145-
.clone_htod(input.as_slice())
169+
.clone_htod(encoded_prim.as_slice::<u32>())
146170
.expect("copy input to device");
147171
let input_ptr = device_input.device_ptr(cuda_ctx.stream()).0;
148172

@@ -151,13 +175,100 @@ mod tests {
151175
param: reference as u64,
152176
}];
153177

178+
// Kernel should reconstruct the original data.
154179
let result = run_dynamic_dispatch_u32(&cuda_ctx, input_ptr, len, &ops)?;
155-
assert_eq!(result, expected);
180+
assert_eq!(result, original);
181+
182+
Ok(())
183+
}
184+
185+
#[test]
186+
fn test_alp() -> VortexResult<()> {
187+
let len = 2050;
188+
189+
// Start from f32 data that ALP-encodes cleanly - no patches.
190+
let exponents = Exponents { e: 2, f: 0 };
191+
let floats: Vec<f32> = (0..len)
192+
.map(|i| <f32 as ALPFloat>::decode_single(i as i32, exponents))
193+
.collect();
194+
let float_prim = PrimitiveArray::new(Buffer::from(floats.clone()), NonNullable);
195+
196+
let alp_array = alp_encode(&float_prim, Some(exponents))?;
197+
assert!(alp_array.patches().is_none());
198+
199+
let f = <f32 as ALPFloat>::F10[alp_array.exponents().f as usize];
200+
let e = <f32 as ALPFloat>::IF10[alp_array.exponents().e as usize];
201+
202+
let cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?;
203+
204+
let encoded_prim = alp_array.encoded().to_primitive();
205+
let device_input = cuda_ctx
206+
.stream()
207+
.clone_htod(encoded_prim.as_slice::<i32>())
208+
.expect("copy input to device");
209+
let input_ptr = device_input.device_ptr(cuda_ctx.stream()).0;
210+
211+
let ops = [DynamicOp {
212+
op: DynamicOpCode_ALP,
213+
param: pack_alp_f32_param(f, e),
214+
}];
215+
216+
let result = run_dynamic_dispatch_f32(&cuda_ctx, input_ptr, len, &ops)?;
217+
assert_eq!(result, floats);
218+
219+
Ok(())
220+
}
221+
222+
#[test]
223+
fn test_alp_for_bitunpack() -> VortexResult<()> {
224+
let len = 2050;
225+
226+
let exponents = Exponents { e: 2, f: 0 };
227+
let floats: Vec<f32> = (0..len)
228+
.map(|i| <f32 as ALPFloat>::decode_single(10 + (i as i32 % 64), exponents))
229+
.collect();
230+
let float_prim = PrimitiveArray::new(Buffer::from(floats.clone()), NonNullable);
231+
232+
// ALP encode f32 → i32 encoded integers + exponents.
233+
let alp_array = alp_encode(&float_prim, Some(exponents))?;
234+
assert!(alp_array.patches().is_none());
235+
236+
// FOR encode the ALP-encoded i32 integers.
237+
let for_array = FoRArray::encode(alp_array.encoded().to_primitive())?;
238+
let reference = i32::try_from(for_array.reference_scalar())? as u32;
239+
240+
// BitPack the FOR-encoded values.
241+
let bit_width: u8 = 6;
242+
let bitpacked = BitPackedArray::encode(for_array.encoded(), bit_width)?;
243+
244+
// Derive ALP decode factors from the actual exponents.
245+
let alp_f = <f32 as ALPFloat>::F10[alp_array.exponents().f as usize];
246+
let alp_e = <f32 as ALPFloat>::IF10[alp_array.exponents().e as usize];
247+
248+
let cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?;
249+
let (input_ptr, _device_input) = copy_to_device(&cuda_ctx, &bitpacked)?;
250+
251+
let ops = [
252+
DynamicOp {
253+
op: DynamicOpCode_BITUNPACK,
254+
param: bit_width as u64,
255+
},
256+
DynamicOp {
257+
op: DynamicOpCode_FOR,
258+
param: reference as u64,
259+
},
260+
DynamicOp {
261+
op: DynamicOpCode_ALP,
262+
param: pack_alp_f32_param(alp_f, alp_e),
263+
},
264+
];
265+
266+
let result = run_dynamic_dispatch_f32(&cuda_ctx, input_ptr, len, &ops)?;
267+
assert_eq!(result, floats);
156268

157269
Ok(())
158270
}
159271

160-
/// 1 bitunpack + 7 FoR
161272
#[test]
162273
fn test_max_ops_bitunpack_7for() -> VortexResult<()> {
163274
let bit_width: u8 = 6;

0 commit comments

Comments (0)