Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
64 commits
Select commit Hold shift + click to select a range
a33c26a
Add SSE4.2 implementation
AntoinePrv Oct 27, 2025
09e9835
Add unpack uint8_t benchmark
AntoinePrv Oct 28, 2025
af24a60
Add bool unpack benchmark
AntoinePrv Oct 30, 2025
2caf2e1
Bias benchmarks toward small scale
AntoinePrv Nov 25, 2025
a9789b8
Add Kernel plan builder
AntoinePrv Oct 20, 2025
2c4f9f1
Add simd kernel
AntoinePrv Oct 24, 2025
d79d399
Handle rshifts on SSE2
AntoinePrv Oct 27, 2025
61ab1b2
Use new kernel when possible in generated 128 code
AntoinePrv Oct 27, 2025
288d744
Refactor array to xsimd::batch_constant
AntoinePrv Oct 27, 2025
b87ff0d
Refactor right shift
AntoinePrv Oct 27, 2025
df9d6ed
Add oversized plan
AntoinePrv Oct 28, 2025
b55c888
Add oversized kernel
AntoinePrv Oct 28, 2025
f91630a
Rename kernels
AntoinePrv Oct 28, 2025
b4d3281
Add simd kernel dispatch
AntoinePrv Oct 28, 2025
6b395d0
Call Simd kernel directly
AntoinePrv Oct 28, 2025
a195cd2
Fix SIMD level None
AntoinePrv Oct 29, 2025
4aa76eb
Initialize swizzles to -1
AntoinePrv Oct 29, 2025
2bd7e93
Doc
AntoinePrv Oct 29, 2025
060c434
Improve test error message
AntoinePrv Oct 29, 2025
a70e085
Use new kernel in avx2
AntoinePrv Oct 28, 2025
11fa65a
AVX2 swizzle fallback
AntoinePrv Oct 29, 2025
fa41ca7
Remove dead code
AntoinePrv Oct 30, 2025
9f24cf1
Simplify Large masks
AntoinePrv Oct 30, 2025
76b2428
Remove bpacking 256 generated file
AntoinePrv Oct 30, 2025
b38fd4d
Remove uint8_t fallback
AntoinePrv Oct 30, 2025
3cb34a6
Add boolean simd implementation
AntoinePrv Oct 30, 2025
29ffdb3
Use std::is_base_of for arch detection
AntoinePrv Oct 30, 2025
bb97c69
Improve swizzle
AntoinePrv Nov 17, 2025
329f8ea
Only use lshift hack when available
AntoinePrv Nov 17, 2025
4392751
Fix return type
AntoinePrv Nov 17, 2025
1fae638
Fix shift included size
AntoinePrv Nov 18, 2025
415f2ac
Add Avx2 uint16_t shift fallback
AntoinePrv Nov 19, 2025
557a0b0
Refactor make_mult
AntoinePrv Nov 19, 2025
9de5928
Add Avx2 lshift unint8_t fallback
AntoinePrv Nov 19, 2025
0e39313
Refactor right shift excess
AntoinePrv Nov 19, 2025
a4a2cde
Refactor make_mult
AntoinePrv Nov 20, 2025
7a10f2a
Add SSE var shift uint8_t fallback to uint16_t
AntoinePrv Nov 20, 2025
6f5bc9c
Implement size reading reduction
AntoinePrv Nov 20, 2025
5c26e2f
Add fallback Avx2 right shift
AntoinePrv Nov 24, 2025
7550527
Refactor static dispatch
AntoinePrv Nov 26, 2025
6fae07c
Forward oversized to larger uint when possible
AntoinePrv Nov 26, 2025
b761efc
Add arch detection functions
AntoinePrv Nov 26, 2025
2007d6b
Refactor traits usage
AntoinePrv Nov 26, 2025
5716616
Forward x86_64 unpack64 to unpack32
AntoinePrv Nov 26, 2025
7e291d6
Simplify template usage
AntoinePrv Nov 26, 2025
424de01
Reorganize and doc
AntoinePrv Nov 26, 2025
e6d7baa
Refactor KernelDispatch and remove Oversized dispatch
AntoinePrv Nov 26, 2025
b2bd763
Forward large unpack8 to unpack16 on SSE2
AntoinePrv Nov 26, 2025
2ab462c
Use fallback right shift on large uint8_t avx2
AntoinePrv Nov 26, 2025
bc9d27f
Fix enable_if
AntoinePrv Nov 27, 2025
c9d7278
Add missing header
AntoinePrv Nov 27, 2025
814d8b4
fmt
AntoinePrv Nov 27, 2025
8a8f9ba
Add SSE4.2 to dynamic dispatch
AntoinePrv Nov 27, 2025
62efbb0
Rename bpacking_simd_impl > bpacking_simd_kernel
AntoinePrv Nov 27, 2025
b95d89d
Restore modifications to simd_codegen
AntoinePrv Nov 27, 2025
4036ec6
Reduce reading size and declare bytes read
AntoinePrv Nov 27, 2025
867ea97
Add kBytesRead to scalar code
AntoinePrv Nov 27, 2025
9f45892
Add kBytesRead to simd 512 generated code
AntoinePrv Nov 27, 2025
9aa7c74
Prevent overreading
AntoinePrv Nov 27, 2025
c629263
Fix pessimit overeading guard
AntoinePrv Nov 28, 2025
0191392
Fix overreading guard comparison
AntoinePrv Dec 1, 2025
8e7a758
Add UnpackOptions and max_read_bytes
AntoinePrv Dec 1, 2025
d964d56
Use C++20 NTTP
AntoinePrv Jan 7, 2026
2f99ce0
xsimd 14.0 compatibility
AntoinePrv Jan 8, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 9 additions & 4 deletions cpp/src/arrow/util/bit_stream_utils_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -273,14 +273,19 @@ inline int BitReader::GetBatch(int num_bits, T* v, int batch_size) {
batch_size = static_cast<int>(remaining_bits / num_bits);
}

const ::arrow::internal::UnpackOptions opts{
/* .batch_size= */ batch_size,
/* .bit_width= */ num_bits,
/* .bit_offset= */ bit_offset_,
/* .max_read_bytes= */ max_bytes_ - byte_offset_,
};

if constexpr (std::is_same_v<T, bool>) {
::arrow::internal::unpack(buffer_ + byte_offset_, v, batch_size, num_bits,
bit_offset_);
::arrow::internal::unpack(buffer_ + byte_offset_, v, opts);

} else {
::arrow::internal::unpack(buffer_ + byte_offset_,
reinterpret_cast<std::make_unsigned_t<T>*>(v), batch_size,
num_bits, bit_offset_);
reinterpret_cast<std::make_unsigned_t<T>*>(v), opts);
}

Advance(batch_size * num_bits);
Expand Down
23 changes: 12 additions & 11 deletions cpp/src/arrow/util/bpacking.cc
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@

#include <array>

#include "arrow/util/bpacking_dispatch_internal.h"
#include "arrow/util/bpacking_internal.h"
#include "arrow/util/bpacking_scalar_internal.h"
#include "arrow/util/bpacking_simd_internal.h"
Expand All @@ -34,9 +33,11 @@ struct UnpackDynamicFunction {

static constexpr auto implementations() {
return std::array {
// Current SIMD unpack algorithm works terribly on SSE4.2 due to lack of variable
// rshift and poor xsimd fallback.
#if defined(ARROW_HAVE_SSE4_2)
Implementation{DispatchLevel::NONE, &unpack_sse4_2<Uint>},
#else
Implementation{DispatchLevel::NONE, &unpack_scalar<Uint>},
#endif
#if defined(ARROW_HAVE_RUNTIME_AVX2)
Implementation{DispatchLevel::AVX2, &unpack_avx2<Uint>},
#endif
Expand All @@ -50,19 +51,19 @@ struct UnpackDynamicFunction {
} // namespace

template <typename Uint>
void unpack(const uint8_t* in, Uint* out, int batch_size, int num_bits, int bit_offset) {
void unpack(const uint8_t* in, Uint* out, const UnpackOptions& opts) {
#if defined(ARROW_HAVE_NEON)
return unpack_neon(in, out, batch_size, num_bits, bit_offset);
return unpack_neon(in, out, opts);
#else
static DynamicDispatch<UnpackDynamicFunction<Uint> > dispatch;
return dispatch.func(in, out, batch_size, num_bits, bit_offset);
return dispatch.func(in, out, opts);
#endif
}

template void unpack<bool>(const uint8_t*, bool*, int, int, int);
template void unpack<uint8_t>(const uint8_t*, uint8_t*, int, int, int);
template void unpack<uint16_t>(const uint8_t*, uint16_t*, int, int, int);
template void unpack<uint32_t>(const uint8_t*, uint32_t*, int, int, int);
template void unpack<uint64_t>(const uint8_t*, uint64_t*, int, int, int);
template void unpack<bool>(const uint8_t*, bool*, const UnpackOptions&);
template void unpack<uint8_t>(const uint8_t*, uint8_t*, const UnpackOptions&);
template void unpack<uint16_t>(const uint8_t*, uint16_t*, const UnpackOptions&);
template void unpack<uint32_t>(const uint8_t*, uint32_t*, const UnpackOptions&);
template void unpack<uint64_t>(const uint8_t*, uint64_t*, const UnpackOptions&);

} // namespace arrow::internal
93 changes: 85 additions & 8 deletions cpp/src/arrow/util/bpacking_benchmark.cc
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
// specific language governing permissions and limitations
// under the License.

#include <memory>
#include <stdexcept>
#include <vector>

Expand All @@ -33,7 +34,7 @@ namespace arrow::internal {
namespace {

template <typename Int>
using UnpackFunc = void (*)(const uint8_t*, Int*, int, int, int);
using UnpackFunc = void (*)(const uint8_t*, Int*, const UnpackOptions&);

/// Get the number of bytes associated with a packing.
constexpr int32_t GetNumBytes(int32_t num_values, int32_t bit_width) {
Expand Down Expand Up @@ -86,33 +87,62 @@ void BM_Unpack(benchmark::State& state, bool aligned, UnpackFunc<Int> unpack, bo
const uint8_t* packed_ptr =
GetNextAlignedByte(packed.data(), sizeof(Int)) + (aligned ? 0 : 1);

std::vector<Int> unpacked(num_values, 0);
auto unpacked = std::make_unique<Int[]>(num_values);

const ::arrow::internal::UnpackOptions opts{
/* .batch_size= */ num_values,
/* .bit_width= */ bit_width,
/* .bit_offset= */ 0,
/* .max_read_bytes= */ -1,
};

for (auto _ : state) {
unpack(packed_ptr, unpacked.data(), num_values, bit_width, /* bit_offset = */ 0);
unpack(packed_ptr, unpacked.get(), opts);
benchmark::ClobberMemory();
}
state.SetItemsProcessed(num_values * state.iterations());
}

constexpr int32_t kMinRange = 64;
constexpr int32_t kMaxRange = 32768;
/// Currently, the minimum unpack SIMD kernel size is 32 and the bit packing encoder will
/// not emit runs larger than 512 (though other implementations might), so we bias the
/// benchmarks towards a rather small scale.
static const auto kNumValuesRange = benchmark::CreateRange(32, 512, 2);
constexpr std::initializer_list<int64_t> kBitWidths8 = {1, 2, 8};
constexpr std::initializer_list<int64_t> kBitWidths16 = {1, 2, 8, 13};
constexpr std::initializer_list<int64_t> kBitWidths32 = {1, 2, 8, 20};
constexpr std::initializer_list<int64_t> kBitWidths64 = {1, 2, 8, 20, 47};

static const std::vector<std::vector<int64_t>> kBitWidthsNumValuesBool = {
{0, 1},
kNumValuesRange,
};
static const std::vector<std::vector<int64_t>> kBitWidthsNumValues8 = {
kBitWidths8,
kNumValuesRange,
};
static const std::vector<std::vector<int64_t>> kBitWidthsNumValues16 = {
kBitWidths16,
benchmark::CreateRange(kMinRange, kMaxRange, /*multi=*/32),
kNumValuesRange,
};
static const std::vector<std::vector<int64_t>> kBitWidthsNumValues32 = {
kBitWidths32,
benchmark::CreateRange(kMinRange, kMaxRange, /*multi=*/32),
kNumValuesRange,
};
static const std::vector<std::vector<int64_t>> kBitWidthsNumValues64 = {
kBitWidths64,
benchmark::CreateRange(kMinRange, kMaxRange, /*multi=*/32),
kNumValuesRange,
};

/// Nudge for MSVC template inside BENCHMARK_CAPTURE macro.
///
/// Non-template entry point that forwards every argument unchanged to
/// BM_Unpack<bool>, so the benchmark can be registered under a plain function name.
/// `skip` defaults to false and `skip_msg` to an empty string; how they are
/// interpreted is decided by BM_Unpack.
void BM_UnpackBool(benchmark::State& state, bool aligned, UnpackFunc<bool> unpack,
bool skip = false, std::string skip_msg = "") {
// skip_msg is received by value, so it is moved into the callee.
return BM_Unpack<bool>(state, aligned, unpack, skip, std::move(skip_msg));
}
/// Nudge for MSVC template inside BENCHMARK_CAPTURE macro.
///
/// Non-template entry point that forwards every argument unchanged to
/// BM_Unpack<uint8_t>, so the benchmark can be registered under a plain function name.
/// `skip` defaults to false and `skip_msg` to an empty string; how they are
/// interpreted is decided by BM_Unpack.
void BM_UnpackUint8(benchmark::State& state, bool aligned, UnpackFunc<uint8_t> unpack,
bool skip = false, std::string skip_msg = "") {
// skip_msg is received by value, so it is moved into the callee.
return BM_Unpack<uint8_t>(state, aligned, unpack, skip, std::move(skip_msg));
}
/// Nudge for MSVC template inside BENCHMARK_CAPTURE macro.
void BM_UnpackUint16(benchmark::State& state, bool aligned, UnpackFunc<uint16_t> unpack,
bool skip = false, std::string skip_msg = "") {
Expand All @@ -129,14 +159,39 @@ void BM_UnpackUint64(benchmark::State& state, bool aligned, UnpackFunc<uint64_t>
return BM_Unpack<uint64_t>(state, aligned, unpack, skip, std::move(skip_msg));
}

BENCHMARK_CAPTURE(BM_UnpackBool, ScalarUnaligned, false, &unpack_scalar<bool>)
->ArgsProduct(kBitWidthsNumValuesBool);
BENCHMARK_CAPTURE(BM_UnpackUint8, ScalarUnaligned, false, &unpack_scalar<uint8_t>)
->ArgsProduct(kBitWidthsNumValues8);
BENCHMARK_CAPTURE(BM_UnpackUint16, ScalarUnaligned, false, &unpack_scalar<uint16_t>)
->ArgsProduct(kBitWidthsNumValues16);
BENCHMARK_CAPTURE(BM_UnpackUint32, ScalarUnaligned, false, &unpack_scalar<uint32_t>)
->ArgsProduct(kBitWidthsNumValues32);
BENCHMARK_CAPTURE(BM_UnpackUint64, ScalarUnaligned, false, &unpack_scalar<uint64_t>)
->ArgsProduct(kBitWidthsNumValues64);

#if defined(ARROW_HAVE_SSE4_2)
BENCHMARK_CAPTURE(BM_UnpackBool, Sse42Unaligned, false, &unpack_sse4_2<bool>)
->ArgsProduct(kBitWidthsNumValuesBool);
BENCHMARK_CAPTURE(BM_UnpackUint8, Sse42Unaligned, false, &unpack_sse4_2<uint8_t>)
->ArgsProduct(kBitWidthsNumValues8);
BENCHMARK_CAPTURE(BM_UnpackUint16, Sse42Unaligned, false, &unpack_sse4_2<uint16_t>)
->ArgsProduct(kBitWidthsNumValues16);
BENCHMARK_CAPTURE(BM_UnpackUint32, Sse42Unaligned, false, &unpack_sse4_2<uint32_t>)
->ArgsProduct(kBitWidthsNumValues32);
BENCHMARK_CAPTURE(BM_UnpackUint64, Sse42Unaligned, false, &unpack_sse4_2<uint64_t>)
->ArgsProduct(kBitWidthsNumValues64);
#endif

#if defined(ARROW_HAVE_RUNTIME_AVX2)
BENCHMARK_CAPTURE(BM_UnpackBool, Avx2Unaligned, false, &unpack_avx2<bool>,
!CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX2),
"Avx2 not available")
->ArgsProduct(kBitWidthsNumValuesBool);
BENCHMARK_CAPTURE(BM_UnpackUint8, Avx2Unaligned, false, &unpack_avx2<uint8_t>,
!CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX2),
"Avx2 not available")
->ArgsProduct(kBitWidthsNumValues8);
BENCHMARK_CAPTURE(BM_UnpackUint16, Avx2Unaligned, false, &unpack_avx2<uint16_t>,
!CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX2),
"Avx2 not available")
Expand All @@ -152,6 +207,14 @@ BENCHMARK_CAPTURE(BM_UnpackUint64, Avx2Unaligned, false, &unpack_avx2<uint64_t>,
#endif

#if defined(ARROW_HAVE_RUNTIME_AVX512)
BENCHMARK_CAPTURE(BM_UnpackBool, Avx512Unaligned, false, &unpack_avx512<bool>,
!CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX512),
"Avx512 not available")
->ArgsProduct(kBitWidthsNumValuesBool);
BENCHMARK_CAPTURE(BM_UnpackUint8, Avx512Unaligned, false, &unpack_avx512<uint8_t>,
!CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX512),
"Avx512 not available")
->ArgsProduct(kBitWidthsNumValues8);
BENCHMARK_CAPTURE(BM_UnpackUint16, Avx512Unaligned, false, &unpack_avx512<uint16_t>,
!CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX512),
"Avx512 not available")
Expand All @@ -167,6 +230,10 @@ BENCHMARK_CAPTURE(BM_UnpackUint64, Avx512Unaligned, false, &unpack_avx512<uint64
#endif

#if defined(ARROW_HAVE_NEON)
BENCHMARK_CAPTURE(BM_UnpackBool, NeonUnaligned, false, &unpack_neon<bool>)
->ArgsProduct(kBitWidthsNumValuesBool);
BENCHMARK_CAPTURE(BM_UnpackUint8, NeonUnaligned, false, &unpack_neon<uint8_t>)
->ArgsProduct(kBitWidthsNumValues8);
BENCHMARK_CAPTURE(BM_UnpackUint16, NeonUnaligned, false, &unpack_neon<uint16_t>)
->ArgsProduct(kBitWidthsNumValues16);
BENCHMARK_CAPTURE(BM_UnpackUint32, NeonUnaligned, false, &unpack_neon<uint32_t>)
Expand All @@ -175,6 +242,16 @@ BENCHMARK_CAPTURE(BM_UnpackUint64, NeonUnaligned, false, &unpack_neon<uint64_t>)
->ArgsProduct(kBitWidthsNumValues64);
#endif

BENCHMARK_CAPTURE(BM_UnpackBool, DynamicAligned, true, &unpack<bool>)
->ArgsProduct(kBitWidthsNumValuesBool);
BENCHMARK_CAPTURE(BM_UnpackBool, DynamicUnaligned, false, &unpack<bool>)
->ArgsProduct(kBitWidthsNumValuesBool);

BENCHMARK_CAPTURE(BM_UnpackUint8, DynamicAligned, true, &unpack<uint8_t>)
->ArgsProduct(kBitWidthsNumValues8);
BENCHMARK_CAPTURE(BM_UnpackUint8, DynamicUnaligned, false, &unpack<uint8_t>)
->ArgsProduct(kBitWidthsNumValues8);

BENCHMARK_CAPTURE(BM_UnpackUint16, DynamicAligned, true, &unpack<uint16_t>)
->ArgsProduct(kBitWidthsNumValues16);
BENCHMARK_CAPTURE(BM_UnpackUint16, DynamicUnaligned, false, &unpack<uint16_t>)
Expand Down
Loading
Loading