Skip to content

Commit e666cf8

Browse files
committed
feat: cuda alp dyn dispatch
1 parent 45030ab commit e666cf8

File tree

4 files changed

+226
-16
lines changed

4 files changed

+226
-16
lines changed

vortex-cuda/benches/dynamic_dispatch_cuda.rs

Lines changed: 91 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ use vortex_cuda::CudaSession;
2727
use vortex_cuda::bitpacked_cuda_kernel;
2828
use vortex_cuda::bitpacked_cuda_launch_config;
2929
use vortex_cuda::dynamic_dispatch_op::DynamicOp;
30+
use vortex_cuda::dynamic_dispatch_op::DynamicOpCode_ALP;
3031
use vortex_cuda::dynamic_dispatch_op::DynamicOpCode_BITUNPACK;
3132
use vortex_cuda::dynamic_dispatch_op::DynamicOpCode_FOR;
3233
use vortex_cuda_macros::cuda_available;
@@ -45,10 +46,18 @@ const REFERENCE_VALUE: u32 = 100_000;
4546
/// Bit width used for the bitpack+FoR benchmarks.
4647
const BIT_WIDTH: u8 = 6;
4748

49+
/// ALP decode factors for the ALP benchmarks.
50+
const ALP_F: f32 = 10.0;
51+
const ALP_E: f32 = 1.0;
52+
4853
// ---------------------------------------------------------------------------
4954
// Helpers
5055
// ---------------------------------------------------------------------------
5156

57+
fn pack_alp_f32_param(f: f32, e: f32) -> u64 {
58+
(e.to_bits() as u64) << 32 | f.to_bits() as u64
59+
}
60+
5261
/// Helper: launch a single FoR kernel on a device buffer (in-place).
5362
fn launch_for_kernel(
5463
cuda_ctx: &mut CudaExecutionCtx,
@@ -269,11 +278,12 @@ fn bench_bitunpack_for_separate(c: &mut Criterion) {
269278
// Benchmark: BitUnpack + FoR — single fused dynamic scalar_decode launch
270279
// ============================================================================
271280

272-
/// Run bitunpack+FoR as a single fused dynamic_dispatch launch, returning GPU time.
273-
fn run_bitunpack_for_fused_timed(
281+
/// Run a fused dynamic_dispatch launch on a bitpacked array, returning GPU time.
282+
fn run_dynamic_dispatch_bitpacked_timed(
274283
cuda_ctx: &mut CudaExecutionCtx,
275284
bitpacked_array: &BitPackedArray,
276285
device_ops: &Arc<cudarc::driver::CudaSlice<DynamicOp>>,
286+
num_ops: u8,
277287
) -> VortexResult<Duration> {
278288
let packed = bitpacked_array.packed().clone();
279289
let len = bitpacked_array.len();
@@ -298,9 +308,6 @@ fn run_bitunpack_for_fused_timed(
298308
let output_buf = CudaDeviceBuffer::new(output_slice);
299309
let output_ptr = output_buf.as_view::<u32>().device_ptr(cuda_ctx.stream()).0;
300310

301-
// ops = [BITUNPACK(bit_width), FOR(reference)]
302-
let num_ops: u8 = 2;
303-
304311
// Ensure all previous work on the stream has completed.
305312
cuda_ctx
306313
.stream()
@@ -354,9 +361,84 @@ fn bench_bitunpack_for_dynamic_dispatch(c: &mut Criterion) {
354361
let mut total_time = Duration::ZERO;
355362

356363
for _ in 0..iters {
357-
let kernel_time =
358-
run_bitunpack_for_fused_timed(&mut cuda_ctx, array, &device_ops)
359-
.vortex_expect("bitunpack+for dynamic_dispatch failed");
364+
let kernel_time = run_dynamic_dispatch_bitpacked_timed(
365+
&mut cuda_ctx,
366+
array,
367+
&device_ops,
368+
ops.len() as u8,
369+
)
370+
.vortex_expect("bitunpack+for dynamic_dispatch failed");
371+
total_time += kernel_time;
372+
}
373+
374+
total_time
375+
});
376+
},
377+
);
378+
}
379+
380+
group.finish();
381+
}
382+
383+
// ============================================================================
384+
// Benchmark: BitUnpack + FoR + ALP — single fused dynamic dispatch launch
385+
// ============================================================================
386+
387+
fn bench_bitunpack_for_alp_dynamic_dispatch(c: &mut Criterion) {
388+
let mut group = c.benchmark_group("bitunpack_for_alp");
389+
group.sample_size(10);
390+
391+
// ops = [BITUNPACK(bit_width), FOR(reference), ALP(f, e)]
392+
let ops = vec![
393+
DynamicOp {
394+
op: DynamicOpCode_BITUNPACK,
395+
param: BIT_WIDTH as u64,
396+
},
397+
DynamicOp {
398+
op: DynamicOpCode_FOR,
399+
param: REFERENCE_VALUE as u64,
400+
},
401+
DynamicOp {
402+
op: DynamicOpCode_ALP,
403+
param: pack_alp_f32_param(ALP_F, ALP_E),
404+
},
405+
];
406+
407+
for (len, len_str) in BENCH_ARGS {
408+
group.throughput(Throughput::Bytes((len * size_of::<u32>()) as u64));
409+
410+
let bitpacked = make_bitpacked_array_u32(BIT_WIDTH, *len);
411+
412+
group.bench_with_input(
413+
BenchmarkId::new("dynamic_dispatch_u32", len_str),
414+
&bitpacked,
415+
|b, array| {
416+
let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
417+
.vortex_expect("failed to create execution context");
418+
419+
// Force PTX JIT compilation before any measurement.
420+
cuda_ctx
421+
.load_function("dynamic_dispatch", &["u32"])
422+
.vortex_expect("failed to preload dynamic_dispatch kernel");
423+
424+
let device_ops = Arc::new(
425+
cuda_ctx
426+
.stream()
427+
.clone_htod(ops.as_slice())
428+
.expect("failed to copy ops to device"),
429+
);
430+
431+
b.iter_custom(|iters| {
432+
let mut total_time = Duration::ZERO;
433+
434+
for _ in 0..iters {
435+
let kernel_time = run_dynamic_dispatch_bitpacked_timed(
436+
&mut cuda_ctx,
437+
array,
438+
&device_ops,
439+
ops.len() as u8,
440+
)
441+
.vortex_expect("bitunpack+for+alp dynamic_dispatch failed");
360442
total_time += kernel_time;
361443
}
362444

@@ -372,6 +454,7 @@ fn bench_bitunpack_for_dynamic_dispatch(c: &mut Criterion) {
372454
fn benchmark_nested_decode(c: &mut Criterion) {
373455
bench_bitunpack_for_separate(c);
374456
bench_bitunpack_for_dynamic_dispatch(c);
457+
bench_bitunpack_for_alp_dynamic_dispatch(c);
375458
}
376459

377460
criterion::criterion_group!(benches, benchmark_nested_decode);

vortex-cuda/kernels/src/dynamic_dispatch.cu

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,13 @@ __device__ __forceinline__ T apply_scalar_op(T value, const DynamicOp &op) {
3333
case ZIGZAG: {
3434
return (value >> 1) ^ static_cast<T>(-(value & 1));
3535
}
36-
default:
37-
return value;
36+
case ALP: {
37+
float f_val = __uint_as_float(static_cast<uint32_t>(op.param));
38+
float e_val = __uint_as_float(static_cast<uint32_t>(op.param >> 32));
39+
float result = static_cast<float>(static_cast<int32_t>(value)) * f_val * e_val;
40+
return static_cast<T>(__float_as_uint(result));
41+
}
42+
default: __builtin_unreachable();
3843
}
3944
}
4045

vortex-cuda/kernels/src/dynamic_dispatch.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ enum DynamicOpCode {
1616
FOR,
1717
ZIGZAG,
1818
BITUNPACK,
19+
ALP,
1920
};
2021

2122
// Operation to pass to the dynamic dispatch kernel.

vortex-cuda/src/kernel/encodings/dynamic_dispatch.rs

Lines changed: 127 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -11,23 +11,34 @@ mod tests {
1111
use cudarc::driver::DevicePtr;
1212
use cudarc::driver::LaunchConfig;
1313
use cudarc::driver::PushKernelArg;
14+
use vortex_alp::ALPFloat;
15+
use vortex_alp::Exponents;
16+
use vortex_alp::alp_encode;
17+
use vortex_array::ToCanonical;
1418
use vortex_array::arrays::PrimitiveArray;
1519
use vortex_array::buffer::BufferHandle;
1620
use vortex_array::validity::Validity::NonNullable;
1721
use vortex_buffer::Buffer;
22+
use vortex_dtype::PType;
1823
use vortex_error::VortexExpect;
1924
use vortex_error::VortexResult;
2025
use vortex_fastlanes::BitPackedArray;
26+
use vortex_fastlanes::FoRArray;
2127
use vortex_session::VortexSession;
2228

2329
use crate::CudaBufferExt;
2430
use crate::CudaDeviceBuffer;
2531
use crate::CudaExecutionCtx;
2632
use crate::dynamic_dispatch_op::DynamicOp;
33+
use crate::dynamic_dispatch_op::DynamicOpCode_ALP;
2734
use crate::dynamic_dispatch_op::DynamicOpCode_BITUNPACK;
2835
use crate::dynamic_dispatch_op::DynamicOpCode_FOR;
2936
use crate::session::CudaSession;
3037

38+
fn pack_alp_f32_param(f: f32, e: f32) -> u64 {
39+
(e.to_bits() as u64) << 32 | f.to_bits() as u64
40+
}
41+
3142
fn make_bitpacked_array_u32(bit_width: u8, len: usize) -> BitPackedArray {
3243
let max_val = (1u64 << bit_width).saturating_sub(1);
3344
let values: Vec<u32> = (0..len)
@@ -90,6 +101,17 @@ mod tests {
90101
Ok(host_output[..output_len].to_vec())
91102
}
92103

104+
fn run_dynamic_dispatch_f32(
105+
cuda_ctx: &CudaExecutionCtx,
106+
input_ptr: u64,
107+
output_len: usize,
108+
ops: &[DynamicOp],
109+
) -> VortexResult<Vec<f32>> {
110+
let result = run_dynamic_dispatch_u32(cuda_ctx, input_ptr, output_len, ops)?;
111+
// SAFETY: f32 and u32 have identical size and alignment.
112+
Ok(unsafe { std::mem::transmute::<Vec<u32>, Vec<f32>>(result) })
113+
}
114+
93115
fn copy_to_device(
94116
cuda_ctx: &CudaExecutionCtx,
95117
bitpacked: &BitPackedArray,
@@ -132,17 +154,23 @@ mod tests {
132154

133155
#[test]
134156
fn test_for() -> VortexResult<()> {
135-
let reference: u32 = 42;
136157
let len = 5000;
137158

138-
let input: Vec<u32> = (0..len).map(|i| i as u32).collect();
139-
let expected: Vec<u32> = input.iter().map(|v| v + reference).collect();
159+
// Create original u32 data with an offset so FOR has a meaningful reference.
160+
let original: Vec<u32> = (0..len).map(|i| i as u32 + 42).collect();
161+
let primitive = PrimitiveArray::new(Buffer::from(original.clone()), NonNullable);
162+
163+
// FOR encode to get the reference and encoded (subtracted) values.
164+
let for_array = FoRArray::encode(primitive)?;
165+
let reference = u32::try_from(for_array.reference_scalar())?;
140166

141167
let cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?;
142168

169+
// Copy the encoded values to device.
170+
let encoded_prim = for_array.encoded().to_primitive();
143171
let device_input = cuda_ctx
144172
.stream()
145-
.clone_htod(input.as_slice())
173+
.clone_htod(encoded_prim.as_slice::<u32>())
146174
.expect("copy input to device");
147175
let input_ptr = device_input.device_ptr(cuda_ctx.stream()).0;
148176

@@ -151,13 +179,106 @@ mod tests {
151179
param: reference as u64,
152180
}];
153181

182+
// Kernel should reconstruct the original data.
154183
let result = run_dynamic_dispatch_u32(&cuda_ctx, input_ptr, len, &ops)?;
155-
assert_eq!(result, expected);
184+
assert_eq!(result, original);
185+
186+
Ok(())
187+
}
188+
189+
#[test]
190+
fn test_alp() -> VortexResult<()> {
191+
let len = 2050;
192+
193+
// Start from f32 data that ALP-encodes cleanly (no patches).
194+
let exponents = Exponents { e: 2, f: 0 };
195+
let floats: Vec<f32> = (0..len)
196+
.map(|i| <f32 as ALPFloat>::decode_single(i as i32, exponents))
197+
.collect();
198+
let float_prim = PrimitiveArray::new(Buffer::from(floats.clone()), NonNullable);
199+
200+
// ALP encode the float data.
201+
let alp_array = alp_encode(&float_prim, Some(exponents))?;
202+
assert!(alp_array.patches().is_none());
203+
204+
// Derive ALP decode factors from the actual exponents.
205+
let f = <f32 as ALPFloat>::F10[alp_array.exponents().f as usize];
206+
let e = <f32 as ALPFloat>::IF10[alp_array.exponents().e as usize];
207+
208+
let cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?;
209+
210+
// Copy encoded i32 values to device (reinterpreted as u32).
211+
let encoded_as_u32 = alp_array
212+
.encoded()
213+
.to_primitive()
214+
.reinterpret_cast(PType::U32);
215+
let device_input = cuda_ctx
216+
.stream()
217+
.clone_htod(encoded_as_u32.as_slice::<u32>())
218+
.expect("copy input to device");
219+
let input_ptr = device_input.device_ptr(cuda_ctx.stream()).0;
220+
221+
let ops = [DynamicOp {
222+
op: DynamicOpCode_ALP,
223+
param: pack_alp_f32_param(f, e),
224+
}];
225+
226+
let result = run_dynamic_dispatch_f32(&cuda_ctx, input_ptr, len, &ops)?;
227+
assert_eq!(result, floats);
228+
229+
Ok(())
230+
}
231+
232+
#[test]
233+
fn test_alp_for_bitunpack() -> VortexResult<()> {
234+
let len = 2050;
235+
236+
let exponents = Exponents { e: 2, f: 0 };
237+
let floats: Vec<f32> = (0..len)
238+
.map(|i| <f32 as ALPFloat>::decode_single(10 + (i as i32 % 64), exponents))
239+
.collect();
240+
let float_prim = PrimitiveArray::new(Buffer::from(floats.clone()), NonNullable);
241+
242+
// ALP encode f32 → i32 encoded integers + exponents.
243+
let alp_array = alp_encode(&float_prim, Some(exponents))?;
244+
assert!(alp_array.patches().is_none());
245+
246+
// FOR encode the ALP-encoded i32 integers.
247+
let for_array = FoRArray::encode(alp_array.encoded().to_primitive())?;
248+
let reference = i32::try_from(for_array.reference_scalar())? as u32;
249+
250+
// BitPack the FOR-encoded values.
251+
let bit_width: u8 = 6;
252+
let bitpacked = BitPackedArray::encode(for_array.encoded(), bit_width)?;
253+
254+
// Derive ALP decode factors from the actual exponents.
255+
let alp_f = <f32 as ALPFloat>::F10[alp_array.exponents().f as usize];
256+
let alp_e = <f32 as ALPFloat>::IF10[alp_array.exponents().e as usize];
257+
258+
let cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?;
259+
let (input_ptr, _device_input) = copy_to_device(&cuda_ctx, &bitpacked)?;
260+
261+
let ops = [
262+
DynamicOp {
263+
op: DynamicOpCode_BITUNPACK,
264+
param: bit_width as u64,
265+
},
266+
DynamicOp {
267+
op: DynamicOpCode_FOR,
268+
param: reference as u64,
269+
},
270+
DynamicOp {
271+
op: DynamicOpCode_ALP,
272+
param: pack_alp_f32_param(alp_f, alp_e),
273+
},
274+
];
275+
276+
let result = run_dynamic_dispatch_f32(&cuda_ctx, input_ptr, len, &ops)?;
277+
assert_eq!(result, floats);
156278

157279
Ok(())
158280
}
159281

160-
/// 1 bitunpack + 7 FoR
161282
#[test]
162283
fn test_max_ops_bitunpack_7for() -> VortexResult<()> {
163284
let bit_width: u8 = 6;

0 commit comments

Comments
 (0)