From b4fc4f1aad67a87356f9d4f9b8d1bf33d77c41a3 Mon Sep 17 00:00:00 2001 From: Johannes Hengstler Date: Sun, 19 Oct 2025 14:45:56 +0200 Subject: [PATCH 01/12] un-hide elided lifetimes because we are so conformant --- src/wavelet/mod.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/wavelet/mod.rs b/src/wavelet/mod.rs index 3d08602..e8676be 100644 --- a/src/wavelet/mod.rs +++ b/src/wavelet/mod.rs @@ -1942,7 +1942,7 @@ impl WaveletMatrix { /// assert_eq!(iter.collect::>(), vec![1, 4, 4, 1, 2, 7]); /// ``` #[must_use] - pub fn iter_u64(&self) -> Option { + pub fn iter_u64(&self) -> Option> { if self.bits_per_element() > 64 { None } else { @@ -1967,7 +1967,7 @@ impl WaveletMatrix { /// /// See also [`iter_sorted_u64`] for an iterator that yields `u64` elements. #[must_use] - pub fn iter_sorted(&self) -> WaveletSortedRefIter { + pub fn iter_sorted(&self) -> WaveletSortedRefIter<'_> { WaveletSortedRefIter::new(self) } @@ -1993,7 +1993,7 @@ impl WaveletMatrix { /// assert_eq!(iter.collect::>(), vec![1, 1, 2, 4, 4, 7]); /// ``` #[must_use] - pub fn iter_sorted_u64(&self) -> Option { + pub fn iter_sorted_u64(&self) -> Option> { if self.bits_per_element() > 64 { None } else { From 8829120e578f57ce636d22723f6e5cc06a98243a Mon Sep 17 00:00:00 2001 From: Johannes Hengstler Date: Sun, 19 Oct 2025 14:53:21 +0200 Subject: [PATCH 02/12] replace manual modulo checks with is_multiple_of --- src/bit_vec/fast_rs_vec/mod.rs | 8 ++++---- src/bit_vec/mask.rs | 2 +- src/bit_vec/mod.rs | 16 ++++++++-------- src/trees/mmt.rs | 2 +- 4 files changed, 14 insertions(+), 14 deletions(-) diff --git a/src/bit_vec/fast_rs_vec/mod.rs b/src/bit_vec/fast_rs_vec/mod.rs index 2c35643..e33b82f 100644 --- a/src/bit_vec/fast_rs_vec/mod.rs +++ b/src/bit_vec/fast_rs_vec/mod.rs @@ -144,7 +144,7 @@ impl RsVec { let mut new_zeros = word.count_zeros() as usize; // in the last block, remove remaining zeros of limb that aren't part of the vector - if idx == vec.data.len() - 1 && vec.len % WORD_SIZE > 0 { + if idx == vec.data.len() - 1 && !vec.len.is_multiple_of(WORD_SIZE) { let mask = (1 << (vec.len % WORD_SIZE)) - 1; new_zeros -= (word | mask).count_zeros() as usize; } @@ -477,9 +477,9 @@ impl RsVec { } // if last incomplete block exists, test it without junk data - if self.len % 64 > 0 - && self.data[self.len / 64] & ((1 << (self.len % 64)) - 1) - != other.data[self.len / 64] & ((1 << (other.len % 64)) - 1) + if !self.len.is_multiple_of(WORD_SIZE) + && self.data[self.len / WORD_SIZE] & ((1 << (self.len % WORD_SIZE)) - 1) + != other.data[self.len / WORD_SIZE] & ((1 << (other.len % WORD_SIZE)) - 1) { return false; } diff --git a/src/bit_vec/mask.rs b/src/bit_vec/mask.rs index a146b24..4669829 100644 --- a/src/bit_vec/mask.rs +++ b/src/bit_vec/mask.rs @@ -180,7 +180,7 @@ where .take(self.vec.len / WORD_SIZE) .map(|limb| u64::from(limb.count_ones())) .sum(); - if self.vec.len % WORD_SIZE > 0 { + if !self.vec.len.is_multiple_of(WORD_SIZE) { ones += u64::from( ((self.bin_op)( *self.vec.data.last().unwrap(), diff --git a/src/bit_vec/mod.rs b/src/bit_vec/mod.rs index 056091e..a7844b7 100644 --- a/src/bit_vec/mod.rs +++ b/src/bit_vec/mod.rs @@ -87,7 +87,7 @@ impl BitVec { #[must_use] pub fn from_zeros(len: usize) -> Self { let mut data = vec![0; len / WORD_SIZE]; - if len % WORD_SIZE != 0 { + if !len.is_multiple_of(WORD_SIZE) { data.push(0); } Self { data, len } @@ -98,7 +98,7 @@ impl BitVec { #[must_use] pub fn from_ones(len: usize) -> Self { let mut data = vec![u64::MAX; len / WORD_SIZE]; - if len % WORD_SIZE != 0 { + if !len.is_multiple_of(WORD_SIZE) { data.push((1 << (len % WORD_SIZE)) - 1); } Self { data, len } @@ -500,7 +500,7 @@ impl BitVec { /// [`append_bit_u8`]: BitVec::append_bit_u8 /// [`append_word`]: BitVec::append_word pub fn append(&mut self, bit: bool) { - if self.len % WORD_SIZE == 0 { + if self.len.is_multiple_of(WORD_SIZE) { self.data.push(0); } if bit { @@ -574,7 +574,7 @@ impl BitVec { /// [`append_bit_u8`]: BitVec::append_bit_u8 /// [`append_word`]: BitVec::append_word pub fn append_bit(&mut self, bit: u64) { - if self.len % WORD_SIZE == 0 { + if self.len.is_multiple_of(WORD_SIZE) { self.data.push(0); } if bit % 2 == 1 { @@ -653,7 +653,7 @@ impl BitVec { /// [`append_bit_u16`]: BitVec::append_bit_u16 /// [`append_bit_u8`]: BitVec::append_bit_u8 pub fn append_word(&mut self, word: u64) { - if self.len % WORD_SIZE == 0 { + if self.len.is_multiple_of(WORD_SIZE) { self.data.push(word); } else { // zero out the unused bits before or-ing the new one, to ensure no garbage data remains @@ -688,7 +688,7 @@ impl BitVec { pub fn append_bits(&mut self, bits: u64, len: usize) { assert!(len <= 64, "Cannot append more than 64 bits"); - if self.len % WORD_SIZE == 0 { + if self.len.is_multiple_of(WORD_SIZE) { self.data.push(bits); } else { // zero out the unused bits before or-ing the new one, to ensure no garbage data remains @@ -725,7 +725,7 @@ impl BitVec { /// [`append_bits`]: BitVec::append_bits /// [`drop_last`]: BitVec::drop_last pub fn append_bits_unchecked(&mut self, bits: u64, len: usize) { - if self.len % WORD_SIZE == 0 { + if self.len.is_multiple_of(WORD_SIZE) { self.data.push(bits); } else { self.data[self.len / WORD_SIZE] |= bits << (self.len % WORD_SIZE); @@ -1043,7 +1043,7 @@ impl BitVec { .iter() .map(|limb| u64::from(limb.count_ones())) .sum(); - if self.len % WORD_SIZE > 0 { + if !self.len.is_multiple_of(WORD_SIZE) { ones += u64::from( (self.data.last().unwrap() & ((1 << (self.len % WORD_SIZE)) - 1)).count_ones(), ); diff --git a/src/trees/mmt.rs b/src/trees/mmt.rs index e66aa09..d796f2b 100644 --- a/src/trees/mmt.rs +++ b/src/trees/mmt.rs @@ -170,7 +170,7 @@ impl MinMaxTree { /// Get the index of the left sibling of the node at `index` if it exists #[allow(clippy::unused_self)] // self is used for consistency with other methods pub(crate) fn left_sibling(&self, index: NonZeroUsize) -> Option { - if index.get() % 2 == 0 { + if index.get().is_multiple_of(2) { // index is at least 2 NonZeroUsize::new(index.get() - 1) } else { From f9fb40aaf96e0b1f8ec965b98508f3e715f87cb7 Mon Sep 17 00:00:00 2001 From: Johannes Hengstler Date: Sun, 19 Oct 2025 14:54:50 +0200 Subject: [PATCH 03/12] bump patch version --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 371da6e..963c846 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "vers-vecs" -version = "1.7.0" +version = "1.7.1" edition = "2021" authors = ["Johannes \"Cydhra\" Hengstler"] description = "A collection of succinct data structures supported by fast implementations of rank and select queries." From ef3b35c2001069f471ec304676c801f786aa3d6f Mon Sep 17 00:00:00 2001 From: Johannes Hengstler Date: Sun, 19 Oct 2025 15:05:45 +0200 Subject: [PATCH 04/12] avx512 is stable --- src/lib.rs | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 2e1c297..3baf4c6 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,14 +1,3 @@ -#![cfg_attr( - all( - feature = "simd", - target_arch = "x86_64", - target_feature = "avx", - target_feature = "avx2", - target_feature = "avx512f", - target_feature = "avx512bw", - ), - feature(stdarch_x86_avx512) -)] #![warn(missing_docs)] #![allow(clippy::module_name_repetitions)] #![allow(clippy::assertions_on_constants)] // for asserts warning about incompatible constant values From 4fdf71d19279cf4997bc9116f4642ea2118d1c5e Mon Sep 17 00:00:00 2001 From: Johannes Hengstler Date: Sun, 19 Oct 2025 15:07:57 +0200 Subject: [PATCH 05/12] since I very explicitly change MSRV it requires upgrading to 1.8 --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 963c846..5a6edde 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "vers-vecs" -version = "1.7.1" +version = "1.8.0" edition = "2021" authors = ["Johannes \"Cydhra\" Hengstler"] description = "A collection of succinct data structures supported by fast implementations of rank and select queries." From 8d2a525a36df9fca907c54ea1708e959e796d5cc Mon Sep 17 00:00:00 2001 From: Johannes Hengstler Date: Sun, 19 Oct 2025 15:10:36 +0200 Subject: [PATCH 06/12] update readme about simd feature stability --- readme.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/readme.md b/readme.md index ea64ed1..d21edac 100644 --- a/readme.md +++ b/readme.md @@ -31,8 +31,6 @@ since the intrinsics speed up both `rank` and `select` operations by a factor of - `simd`: Enables the use of SIMD instructions for rank and select operations. This feature requires AVX-512 support and uses unsafe code. It also enables a special iterator for the rank/select bit vector that uses vectorized operations. -The feature only works on nightly Rust. -Enabling it on stable Rust is a no-op, because the required CPU features are not available there. - `serde`: Enables serialization and deserialization of the data structures using the `serde` crate. - `u16_lookup` Enables a larger lookup table for BP tree queries. The larger table requires 128 KiB instead of 4 KiB. From 0232dcbf13319c9c0ed5a596449b7de891d24442 Mon Sep 17 00:00:00 2001 From: Johannes Hengstler Date: Sun, 19 Oct 2025 15:31:00 +0200 Subject: [PATCH 07/12] doc_auto_cfg was removed --- src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lib.rs b/src/lib.rs index 3baf4c6..0b1d729 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -2,7 +2,7 @@ #![allow(clippy::module_name_repetitions)] #![allow(clippy::assertions_on_constants)] // for asserts warning about incompatible constant values #![allow(clippy::inline_always)] // we actually measure performance increases with most of these -#![cfg_attr(docsrs, feature(doc_cfg), feature(doc_auto_cfg))] // for conditional compilation in docs +#![cfg_attr(docsrs, feature(doc_cfg))] // for conditional compilation in docs //! This crate provides a collection of data structures supported by fast implementations of //! rank and select queries. The data structures are static, meaning that they cannot be modified From 2a463177bde4b9c0dd9ddc642efc600f062dbcd6 Mon Sep 17 00:00:00 2001 From: Johannes Hengstler Date: Sun, 19 Oct 2025 16:12:58 +0200 Subject: [PATCH 08/12] bump patch for docsrs fix --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 5a6edde..7798dd4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "vers-vecs" -version = "1.8.0" +version = "1.8.1" edition = "2021" authors = ["Johannes \"Cydhra\" Hengstler"] description = "A collection of succinct data structures supported by fast implementations of rank and select queries." From ed0bbaacb5ad871f8c36d054b28ac700192fa9d3 Mon Sep 17 00:00:00 2001 From: Johannes Hengstler Date: Sun, 19 Oct 2025 16:26:53 +0200 Subject: [PATCH 09/12] fix broken doc links --- src/bit_vec/mod.rs | 6 ++++++ src/bit_vec/sparse.rs | 4 +++- src/elias_fano/mod.rs | 2 ++ src/trees/bp/builder.rs | 3 ++- src/trees/mod.rs | 7 +++++++ src/wavelet/mod.rs | 10 +++++++++- 6 files changed, 29 insertions(+), 3 deletions(-) diff --git a/src/bit_vec/mod.rs b/src/bit_vec/mod.rs index a7844b7..95779cc 100644 --- a/src/bit_vec/mod.rs +++ b/src/bit_vec/mod.rs @@ -820,6 +820,8 @@ impl BitVec { /// assert_eq!(bv.get(1), Some(0)); /// assert_eq!(bv.get(2), Some(1)); /// ``` + /// + /// [`get_unchecked`]: Self::get_unchecked #[must_use] pub fn get(&self, pos: usize) -> Option { if pos >= self.len { @@ -1226,6 +1228,8 @@ impl BitVec { /// containing the original vector. /// /// See also: [`split_at_unchecked`] + /// + /// [`split_at_unchecked`]: Self::split_at_unchecked pub fn split_at(self, at: usize) -> Result<(Self, Self), Self> { if at > self.len { Err(self) @@ -1241,6 +1245,8 @@ impl BitVec { /// If the index is larger than the length of the vector the function will panic or run /// out of memory. /// Use [`split_at`] to properly handle this case. + /// + /// [`split_at`]: Self::split_at #[must_use] pub fn split_at_unchecked(mut self, at: usize) -> (Self, Self) { let other_len = self.len - at; diff --git a/src/bit_vec/sparse.rs b/src/bit_vec/sparse.rs index bc0dbe5..9e016b5 100644 --- a/src/bit_vec/sparse.rs +++ b/src/bit_vec/sparse.rs @@ -2,7 +2,7 @@ //! The vector requires `O(n log u/n) + 2n + o(n)` bits of space, where `n` is the number of bits in the vector //! and `u` is the number of 1-bits. //! The vector is constructed from a sorted list of indices of 1-bits, or from an existing -//! [`BitVec`](crate::BitVec). +//! [`BitVec`]. use crate::{BitVec, EliasFanoVec}; @@ -170,6 +170,8 @@ impl SparseRSVec { /// # Panics /// If `i` is out of bounds the function might panic or produce incorrect results. /// Use [`get`] for a checked version. + /// + /// [`get`]: Self::get #[must_use] pub fn get_unchecked(&self, i: u64) -> u64 { self.is_set_unchecked(i).into() diff --git a/src/elias_fano/mod.rs b/src/elias_fano/mod.rs index 75b009c..0d80410 100644 --- a/src/elias_fano/mod.rs +++ b/src/elias_fano/mod.rs @@ -164,6 +164,8 @@ impl EliasFanoVec { /// /// Note, that select in bit-vectors returns an index, while select in Elias-Fano returns the /// element at the given rank. + /// + /// [`get`]: Self::get #[must_use] pub fn select(&self, rank: usize) -> Option { self.get(rank) diff --git a/src/trees/bp/builder.rs b/src/trees/bp/builder.rs index 753600c..b896d13 100644 --- a/src/trees/bp/builder.rs +++ b/src/trees/bp/builder.rs @@ -5,7 +5,8 @@ use crate::BitVec; /// A builder for [`BpTrees`] using depth-first traversal of the tree. See the documentation of /// [`TreeBuilder`]. /// -/// [`BpTree`]: BpTree +/// [`BpTrees`]: BpTree +/// [`TreeBuilder`]: TreeBuilder pub struct BpBuilder { excess: i64, bit_vec: BitVec, diff --git a/src/trees/mod.rs b/src/trees/mod.rs index 3e2f0eb..5df9bc8 100644 --- a/src/trees/mod.rs +++ b/src/trees/mod.rs @@ -122,6 +122,10 @@ pub trait LevelTree: Tree { /// /// Once the full tree has been visited, the caller must call [`build`] to create an instance of the /// implementing tree type. +/// +/// [`enter_node`]: TreeBuilder::enter_node +/// [`leave_node`]: TreeBuilder::leave_node +/// [`build`]: TreeBuilder::build pub trait TreeBuilder { /// The tree type constructed with this interface type Tree; @@ -139,5 +143,8 @@ pub trait TreeBuilder { /// (i.e. there are nodes for which [`leave_node`] has not been called, /// or there are more calls to `leave_node` than to [`enter_node`]; /// the number of extraneous calls to `enter_node` is returned in the error). + /// + /// [`leave_node`]: Self::leave_node + /// [`enter_node`]: Self::enter_node fn build(self) -> Result; } diff --git a/src/wavelet/mod.rs b/src/wavelet/mod.rs index e8676be..4795436 100644 --- a/src/wavelet/mod.rs +++ b/src/wavelet/mod.rs @@ -62,6 +62,10 @@ use std::ops::Range; /// ``` /// /// [`RsVec`]: RsVec +/// [`from_bit_vec`]: WaveletMatrix::from_bit_vec +/// [`from_slice`]: WaveletMatrix::from_slice +/// [`from_bit_vec_pc`]: WaveletMatrix::from_bit_vec_pc +/// [`from_slice_pc`]: WaveletMatrix::from_slice_pc #[derive(Clone, Debug)] #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] pub struct WaveletMatrix { @@ -1080,7 +1084,7 @@ impl WaveletMatrix { /// Get the `k`-th smallest element in the encoded sequence in the specified `range`, /// where `k = 0` returns the smallest element. /// The `range` is a half-open interval, meaning that the `end` index is exclusive. - /// The `k`-th smallest element is returned as a `BitVec`, + /// The `k`-th smallest element is returned as a [`BitVec`], /// where the least significant bit is the first element. /// /// Returns `None` if the `range` is out of bounds, or if `k` is greater than the size of the range. @@ -1114,6 +1118,8 @@ impl WaveletMatrix { /// /// # Panics /// May panic if the `i` is out of bounds, or returns an empty bit vector. + /// + /// [`get_sorted`]: Self::get_sorted #[must_use] pub fn get_sorted_unchecked(&self, i: usize) -> BitVec { self.quantile_unchecked(0..self.len(), i) @@ -1966,6 +1972,8 @@ impl WaveletMatrix { /// The iterator yields `BitVec` elements. /// /// See also [`iter_sorted_u64`] for an iterator that yields `u64` elements. + /// + /// [`iter_sorted_u64`]: Self::iter_sorted_u64 #[must_use] pub fn iter_sorted(&self) -> WaveletSortedRefIter<'_> { WaveletSortedRefIter::new(self) From e22dfbd54933bfc5be33e766c506fbede6ea045e Mon Sep 17 00:00:00 2001 From: Johannes Hengstler Date: Sun, 19 Oct 2025 16:34:24 +0200 Subject: [PATCH 10/12] add workflow for docs --- .github/workflows/rust.yml | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index c0b0208..2daeb6f 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -31,4 +31,14 @@ jobs: - name: Build run: cargo build --verbose --features serde - name: Run tests - run: cargo test --verbose --features serde \ No newline at end of file + run: cargo test --verbose --features serde + + docs: + runs-on: ubuntu-latest + env: + RUSTFLAGS: -C target-cpu=x86-64 + RUSTDOCFLAGS: -C target-cpu=x86-64 + steps: + - uses: actions/checkout@v4 + - name: Docs + run: cargo doc --verbose --all-features \ No newline at end of file From 13a356afb0245d8f51f390b63eb4804df5adc5a5 Mon Sep 17 00:00:00 2001 From: Cydhra Date: Sun, 19 Oct 2025 23:43:34 +0200 Subject: [PATCH 11/12] change `const` to `static` for lookup table to prevent expensive (and transitively applied) inlining (#37) --- benches/bp.rs | 2 -- src/trees/bp/lookup.rs | 2 +- src/trees/bp/lookup_query.rs | 4 ++-- src/trees/bp/mod.rs | 2 -- 4 files changed, 3 insertions(+), 7 deletions(-) diff --git a/benches/bp.rs b/benches/bp.rs index c278694..941f9d1 100644 --- a/benches/bp.rs +++ b/benches/bp.rs @@ -1,5 +1,3 @@ -#![allow(long_running_const_eval)] - use criterion::{black_box, criterion_group, criterion_main, BatchSize, BenchmarkId, Criterion}; use rand::rngs::StdRng; use rand::{Rng, SeedableRng}; diff --git a/src/trees/bp/lookup.rs b/src/trees/bp/lookup.rs index 2c8fc8b..0095fd9 100644 --- a/src/trees/bp/lookup.rs +++ b/src/trees/bp/lookup.rs @@ -48,7 +48,7 @@ const LOOKUP_MAX_VALUE: u32 = u8::MAX as u32; /// /// The rest of the bits are zero. #[allow(long_running_const_eval)] -const PAREN_BLOCK_LOOKUP: [EncodedTableType; 1 << LOOKUP_BLOCK_SIZE] = calculate_lookup_table(); +static PAREN_BLOCK_LOOKUP: [EncodedTableType; 1 << LOOKUP_BLOCK_SIZE] = calculate_lookup_table(); /// Offset to add to encoded excess values, so negative numbers are stored as positive integers, reducing /// encoding complexity diff --git a/src/trees/bp/lookup_query.rs b/src/trees/bp/lookup_query.rs index a16aea9..e817743 100644 --- a/src/trees/bp/lookup_query.rs +++ b/src/trees/bp/lookup_query.rs @@ -22,14 +22,14 @@ const LOOKUP_MAX_VALUE: u32 = u8::MAX as u32; /// to dual-encode negative excess), and another 51 bits for all 17 queries that may end in this block /// (-8 to 8 relative excess). #[allow(long_running_const_eval)] -const PAREN_BLOCK_LOOKUP_FWD: [u64; 1 << LOOKUP_BLOCK_SIZE] = calculate_lookup_table(true); +static PAREN_BLOCK_LOOKUP_FWD: [u64; 1 << LOOKUP_BLOCK_SIZE] = calculate_lookup_table(true); /// Encoded bwd query results for all possible 8-bit blocks. /// The encoding reserves 10 bits for minimum and maximum excess (shifted by 8 bits so we don't have /// to dual-encode negative excess), and another 51 bits for all 17 queries that may end in this block /// (-8 to 8 relative excess). #[allow(long_running_const_eval)] -const PAREN_BLOCK_LOOKUP_BWD: [u64; 1 << LOOKUP_BLOCK_SIZE] = calculate_lookup_table(false); +static PAREN_BLOCK_LOOKUP_BWD: [u64; 1 << LOOKUP_BLOCK_SIZE] = calculate_lookup_table(false); /// Bitmask for one of the lookup values. const ENCODING_MASK: u64 = 0b11111; diff --git a/src/trees/bp/mod.rs b/src/trees/bp/mod.rs index 6b9e89c..43e9e13 100644 --- a/src/trees/bp/mod.rs +++ b/src/trees/bp/mod.rs @@ -85,7 +85,6 @@ use lookup_query::{process_block_bwd, process_block_fwd, LOOKUP_BLOCK_SIZE}; /// The high-level approach to building a tree is to use the [`BpBuilder`] to construct the tree /// using depth-first traversal of all its nodes. /// ```rust -/// # #![allow(long_running_const_eval)] // for some reason this is needed for test cases /// use vers_vecs::{BitVec, BpBuilder, BpTree, TreeBuilder, Tree}; /// /// let mut builder = BpBuilder::<512>::new(); @@ -119,7 +118,6 @@ use lookup_query::{process_block_bwd, process_block_fwd, LOOKUP_BLOCK_SIZE}; /// This is also how trees with unbalanced parenthesis expressions can be constructed. /// /// ```rust -/// # #![allow(long_running_const_eval)] /// use vers_vecs::{BitVec, BpTree, Tree}; /// let bv = BitVec::pack_sequence_u8(&[0b1101_0111, 0b0010_0100], 8); /// let tree = BpTree::<4>::from_bit_vector(bv); From 365964c8278e0db09c482965bea7ae16f96a996d Mon Sep 17 00:00:00 2001 From: Johannes Hengstler Date: Sun, 26 Oct 2025 15:46:01 +0100 Subject: [PATCH 12/12] experimental port simd setup with fallbacks disabled and not compiling on stable --- src/bit_vec/fast_rs_vec/mod.rs | 1 + src/bit_vec/fast_rs_vec/select.rs | 185 ++++++++++++++++++++---------- src/lib.rs | 1 + 3 files changed, 125 insertions(+), 62 deletions(-) diff --git a/src/bit_vec/fast_rs_vec/mod.rs b/src/bit_vec/fast_rs_vec/mod.rs index e33b82f..ed7701f 100644 --- a/src/bit_vec/fast_rs_vec/mod.rs +++ b/src/bit_vec/fast_rs_vec/mod.rs @@ -42,6 +42,7 @@ const SELECT_BLOCK_SIZE: usize = 1 << 13; /// always stores the number zero, which serves as a sentinel value to avoid special-casing the /// first block in a super-block (which would be a performance hit due branch prediction failures). #[derive(Clone, Copy, Debug)] +#[repr(C)] #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] struct BlockDescriptor { zeros: u16, diff --git a/src/bit_vec/fast_rs_vec/select.rs b/src/bit_vec/fast_rs_vec/select.rs index b8721d7..4e2a6a3 100644 --- a/src/bit_vec/fast_rs_vec/select.rs +++ b/src/bit_vec/fast_rs_vec/select.rs @@ -4,6 +4,7 @@ use crate::bit_vec::fast_rs_vec::{BLOCK_SIZE, SELECT_BLOCK_SIZE, SUPER_BLOCK_SIZ use crate::bit_vec::WORD_SIZE; use crate::util::pdep::Pdep; use crate::util::unroll; +use std::slice::from_raw_parts; /// A safety constant for assertions to make sure that the block size doesn't change without /// adjusting the code. @@ -229,81 +230,141 @@ impl super::RsVec { /// It loads the entire block into a SIMD register and compares the rank to the number of ones /// in the block. The resulting mask is popcounted to find how many blocks from the block boundary /// the rank is. - #[cfg(all( - feature = "simd", - target_arch = "x86_64", - target_feature = "avx", - target_feature = "avx2", - target_feature = "avx512vl", - target_feature = "avx512bw", - ))] - #[inline(always)] - pub(super) fn search_block1( + // #[cfg(all( + // feature = "simd", + // target_arch = "x86_64", + // target_feature = "avx", + // target_feature = "avx2", + // target_feature = "avx512vl", + // target_feature = "avx512bw", + // ))] + // #[inline(always)] + // pub(super) fn search_block1( + // &self, + // rank: usize, + // block_at_super_block: usize, + // block_index: &mut usize, + // ) { + // use std::arch::x86_64::{ + // _mm256_cmpgt_epu16_mask, _mm256_loadu_epi16, _mm256_set1_epi16, _mm256_set_epi16, + // _mm256_sub_epi16, + // }; + // + // if self.blocks.len() > *block_index + BLOCKS_PER_SUPERBLOCK { + // debug_assert!( + // SUPER_BLOCK_SIZE / BLOCK_SIZE == BLOCKS_PER_SUPERBLOCK, + // "change unroll constant to {}", + // 64 - (SUPER_BLOCK_SIZE / BLOCK_SIZE).leading_zeros() - 1 + // ); + // + // unsafe { + // let bit_nums = _mm256_set_epi16( + // (15 * BLOCK_SIZE) as i16, + // (14 * BLOCK_SIZE) as i16, + // (13 * BLOCK_SIZE) as i16, + // (12 * BLOCK_SIZE) as i16, + // (11 * BLOCK_SIZE) as i16, + // (10 * BLOCK_SIZE) as i16, + // (9 * BLOCK_SIZE) as i16, + // (8 * BLOCK_SIZE) as i16, + // (7 * BLOCK_SIZE) as i16, + // (6 * BLOCK_SIZE) as i16, + // (5 * BLOCK_SIZE) as i16, + // (4 * BLOCK_SIZE) as i16, + // (3 * BLOCK_SIZE) as i16, + // (2 * BLOCK_SIZE) as i16, + // (1 * BLOCK_SIZE) as i16, + // (0 * BLOCK_SIZE) as i16, + // ); + // + // let blocks = _mm256_loadu_epi16(self.blocks[*block_index..].as_ptr() as *const i16); + // let ones = _mm256_sub_epi16(bit_nums, blocks); + // + // let ranks = _mm256_set1_epi16(rank as i16); + // let mask = _mm256_cmpgt_epu16_mask(ones, ranks); + // + // debug_assert!( + // mask.count_zeros() > 0, + // "first block should always be zero, but still claims to be greater than rank" + // ); + // *block_index += mask.count_zeros() as usize - 1; + // } + // } else { + // self.search_block1_naive(rank, block_at_super_block, block_index) + // } + // } + + #[cfg(all(feature = "simd",))] + pub(super) fn search_block1_portable( &self, rank: usize, - block_at_super_block: usize, + _block_at_super_block: usize, block_index: &mut usize, ) { - use std::arch::x86_64::{ - _mm256_cmpgt_epu16_mask, _mm256_loadu_epi16, _mm256_set1_epi16, _mm256_set_epi16, - _mm256_sub_epi16, - }; + use std::simd::cmp::SimdPartialOrd; + use std::simd::u16x16; - if self.blocks.len() > *block_index + BLOCKS_PER_SUPERBLOCK { - debug_assert!( - SUPER_BLOCK_SIZE / BLOCK_SIZE == BLOCKS_PER_SUPERBLOCK, - "change unroll constant to {}", - 64 - (SUPER_BLOCK_SIZE / BLOCK_SIZE).leading_zeros() - 1 - ); + debug_assert!( + SUPER_BLOCK_SIZE / BLOCK_SIZE == BLOCKS_PER_SUPERBLOCK, + "change unroll constant to {}", + 64 - (SUPER_BLOCK_SIZE / BLOCK_SIZE).leading_zeros() - 1 + ); - unsafe { - let bit_nums = _mm256_set_epi16( - (15 * BLOCK_SIZE) as i16, - (14 * BLOCK_SIZE) as i16, - (13 * BLOCK_SIZE) as i16, - (12 * BLOCK_SIZE) as i16, - (11 * BLOCK_SIZE) as i16, - (10 * BLOCK_SIZE) as i16, - (9 * BLOCK_SIZE) as i16, - (8 * BLOCK_SIZE) as i16, - (7 * BLOCK_SIZE) as i16, - (6 * BLOCK_SIZE) as i16, - (5 * BLOCK_SIZE) as i16, - (4 * BLOCK_SIZE) as i16, - (3 * BLOCK_SIZE) as i16, - (2 * BLOCK_SIZE) as i16, - (1 * BLOCK_SIZE) as i16, - (0 * BLOCK_SIZE) as i16, - ); + let bit_nums = u16x16::from([ + (0 * BLOCK_SIZE) as u16, + (1 * BLOCK_SIZE) as u16, + (2 * BLOCK_SIZE) as u16, + (3 * BLOCK_SIZE) as u16, + (4 * BLOCK_SIZE) as u16, + (5 * BLOCK_SIZE) as u16, + (6 * BLOCK_SIZE) as u16, + (7 * BLOCK_SIZE) as u16, + (8 * BLOCK_SIZE) as u16, + (9 * BLOCK_SIZE) as u16, + (10 * BLOCK_SIZE) as u16, + (11 * BLOCK_SIZE) as u16, + (12 * BLOCK_SIZE) as u16, + (13 * BLOCK_SIZE) as u16, + (14 * BLOCK_SIZE) as u16, + (15 * BLOCK_SIZE) as u16, + ]); + + let sentinel = u16x16::default(); + + let slice: &[u16] = unsafe { + from_raw_parts( + self.blocks[*block_index..].as_ptr() as *const u16, + self.blocks[*block_index..].len(), + ) + }; + let blocks = u16x16::load_or(slice, sentinel); + let ones = bit_nums - blocks; - let blocks = _mm256_loadu_epi16(self.blocks[*block_index..].as_ptr() as *const i16); - let ones = _mm256_sub_epi16(bit_nums, blocks); + let ranks = u16x16::splat(rank as u16); + let mask = ones.simd_gt(ranks); - let ranks = _mm256_set1_epi16(rank as i16); - let mask = _mm256_cmpgt_epu16_mask(ones, ranks); + // calculate the number of blocks where the number of ones does not exceed rank + // (subtract 48 unused bits because the bitmask has 64 bits) + let num = mask.to_bitmask().count_zeros() - 48; - debug_assert!( - mask.count_zeros() > 0, - "first block should always be zero, but still claims to be greater than rank" - ); - *block_index += mask.count_zeros() as usize - 1; - } - } else { - self.search_block1_naive(rank, block_at_super_block, block_index) - } + debug_assert!( + num > 0, + "first block should always be zero, but still claims to be greater than rank" + ); + *block_index += num as usize - 1; } /// Search for the block in a superblock that contains the rank. This function is only used /// internally and is not part of the public API. /// It compares blocks in a loop-unrolled binary search to find the block that contains the rank. - #[cfg(not(all( - feature = "simd", - target_arch = "x86_64", - target_feature = "avx", - target_feature = "avx2", - target_feature = "avx512vl", - target_feature = "avx512bw", - )))] + // #[cfg(not(all( + // feature = "simd", + // target_arch = "x86_64", + // target_feature = "avx", + // target_feature = "avx2", + // target_feature = "avx512vl", + // target_feature = "avx512bw", + // )))] #[inline(always)] pub(super) fn search_block1( &self, @@ -311,7 +372,7 @@ impl super::RsVec { block_at_super_block: usize, block_index: &mut usize, ) { - self.search_block1_naive(rank, block_at_super_block, block_index); + self.search_block1_portable(rank, block_at_super_block, block_index); } #[inline(always)] diff --git a/src/lib.rs b/src/lib.rs index 0b1d729..efa9d31 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -3,6 +3,7 @@ #![allow(clippy::assertions_on_constants)] // for asserts warning about incompatible constant values #![allow(clippy::inline_always)] // we actually measure performance increases with most of these #![cfg_attr(docsrs, feature(doc_cfg))] // for conditional compilation in docs +#![cfg_attr(feature = "simd", feature(portable_simd))] // portable_simd feature on nightly //! This crate provides a collection of data structures supported by fast implementations of //! rank and select queries. The data structures are static, meaning that they cannot be modified