12 changes: 11 additions & 1 deletion .github/workflows/rust.yml
@@ -31,4 +31,14 @@ jobs:
- name: Build
run: cargo build --verbose --features serde
- name: Run tests
run: cargo test --verbose --features serde

docs:
runs-on: ubuntu-latest
env:
RUSTFLAGS: -C target-cpu=x86-64
RUSTDOCFLAGS: -C target-cpu=x86-64
steps:
- uses: actions/checkout@v4
- name: Docs
run: cargo doc --verbose --all-features
2 changes: 1 addition & 1 deletion Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "vers-vecs"
version = "1.7.0"
version = "1.8.1"
edition = "2021"
authors = ["Johannes \"Cydhra\" Hengstler"]
description = "A collection of succinct data structures supported by fast implementations of rank and select queries."
2 changes: 0 additions & 2 deletions benches/bp.rs
@@ -1,5 +1,3 @@
#![allow(long_running_const_eval)]

use criterion::{black_box, criterion_group, criterion_main, BatchSize, BenchmarkId, Criterion};
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
2 changes: 0 additions & 2 deletions readme.md
@@ -31,8 +31,6 @@ since the intrinsics speed up both `rank` and `select` operations by a factor of
- `simd`: Enables the use of SIMD instructions for rank and select operations.
This feature requires AVX-512 support and uses unsafe code.
It also enables a special iterator for the rank/select bit vector that uses vectorized operations.
The feature only works on nightly Rust.
Enabling it on stable Rust is a no-op, because the required CPU features are not available there.
- `serde`: Enables serialization and deserialization of the data structures using the `serde` crate.
- `u16_lookup`: Enables a larger lookup table for BP tree queries. The larger table requires 128 KiB instead of 4 KiB.

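For context, here is a minimal usage sketch of the rank/select vector this crate provides. `BitVec::new`, `RsVec::from_bit_vec`, `rank1`, and `select1` are assumed names based on the crate description and the signatures visible in this diff, not verified against this exact release:

```rust
// Minimal sketch, assuming the vers-vecs API named above.
use vers_vecs::{BitVec, RsVec};

fn main() {
    let mut bits = BitVec::new(); // assumed constructor
    for i in 0..128u64 {
        bits.append_bit(i); // per this diff, uses the parity of the argument
    }
    let rs = RsVec::from_bit_vec(bits); // assumed constructor name
    let ones = rs.rank1(64); // assumed: number of ones strictly before position 64
    let pos = rs.select1(10); // assumed: position of the 10th one (zero-indexed)
    println!("rank1(64) = {ones}, select1(10) = {pos}");
}
```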
9 changes: 5 additions & 4 deletions src/bit_vec/fast_rs_vec/mod.rs
@@ -42,6 +42,7 @@ const SELECT_BLOCK_SIZE: usize = 1 << 13;
/// always stores the number zero, which serves as a sentinel value to avoid special-casing the
/// first block in a super-block (which would be a performance hit due to branch prediction failures).
#[derive(Clone, Copy, Debug)]
#[repr(C)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
struct BlockDescriptor {
zeros: u16,
@@ -144,7 +145,7 @@ impl RsVec {
let mut new_zeros = word.count_zeros() as usize;

// in the last block, remove remaining zeros of limb that aren't part of the vector
if idx == vec.data.len() - 1 && vec.len % WORD_SIZE > 0 {
if idx == vec.data.len() - 1 && !vec.len.is_multiple_of(WORD_SIZE) {
let mask = (1 << (vec.len % WORD_SIZE)) - 1;
new_zeros -= (word | mask).count_zeros() as usize;
}
@@ -477,9 +478,9 @@ impl RsVec {
}

// if last incomplete block exists, test it without junk data
if self.len % 64 > 0
&& self.data[self.len / 64] & ((1 << (self.len % 64)) - 1)
!= other.data[self.len / 64] & ((1 << (other.len % 64)) - 1)
if !self.len.is_multiple_of(WORD_SIZE)
&& self.data[self.len / WORD_SIZE] & ((1 << (self.len % WORD_SIZE)) - 1)
!= other.data[self.len / WORD_SIZE] & ((1 << (other.len % WORD_SIZE)) - 1)
{
return false;
}
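The pattern changed in both hunks above (and again in `mask.rs` and `bit_vec/mod.rs` below) is the same: `len % WORD_SIZE > 0` becomes `!len.is_multiple_of(WORD_SIZE)` (stable since Rust 1.87), and junk bits in a trailing partial word are masked off before counting. A self-contained sketch of the technique, assuming `WORD_SIZE` is 64 as in this crate:

```rust
const WORD_SIZE: usize = 64;

/// Count zeros in `data`, treating only the first `len` bits as live and
/// neutralizing the junk bits of a trailing partial word.
fn count_zeros(data: &[u64], len: usize) -> usize {
    let mut zeros: usize = data
        .iter()
        .take(len / WORD_SIZE)
        .map(|w| w.count_zeros() as usize)
        .sum();
    if !len.is_multiple_of(WORD_SIZE) {
        // keep the low `len % WORD_SIZE` bits, force the junk bits to 1
        let mask = (1u64 << (len % WORD_SIZE)) - 1;
        zeros += (data[len / WORD_SIZE] | !mask).count_zeros() as usize;
    }
    zeros
}

fn main() {
    let data = [0u64, 0b1010]; // 68 live bits: 64 zeros, then bits 0,1,0,1
    assert_eq!(count_zeros(&data, 68), 66);
}
```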
185 changes: 123 additions & 62 deletions src/bit_vec/fast_rs_vec/select.rs
@@ -4,6 +4,7 @@ use crate::bit_vec::fast_rs_vec::{BLOCK_SIZE, SELECT_BLOCK_SIZE, SUPER_BLOCK_SIZ
use crate::bit_vec::WORD_SIZE;
use crate::util::pdep::Pdep;
use crate::util::unroll;
use std::slice::from_raw_parts;

/// A safety constant for assertions to make sure that the block size doesn't change without
/// adjusting the code.
@@ -229,89 +230,149 @@ impl super::RsVec {
/// It loads the entire block into a SIMD register and compares the rank to the number of ones
/// in the block. The resulting mask is popcounted to find how many blocks from the block boundary
/// the rank is.
#[cfg(all(
feature = "simd",
target_arch = "x86_64",
target_feature = "avx",
target_feature = "avx2",
target_feature = "avx512vl",
target_feature = "avx512bw",
))]
#[inline(always)]
pub(super) fn search_block1(
// #[cfg(all(
// feature = "simd",
// target_arch = "x86_64",
// target_feature = "avx",
// target_feature = "avx2",
// target_feature = "avx512vl",
// target_feature = "avx512bw",
// ))]
// #[inline(always)]
// pub(super) fn search_block1(
// &self,
// rank: usize,
// block_at_super_block: usize,
// block_index: &mut usize,
// ) {
// use std::arch::x86_64::{
// _mm256_cmpgt_epu16_mask, _mm256_loadu_epi16, _mm256_set1_epi16, _mm256_set_epi16,
// _mm256_sub_epi16,
// };
//
// if self.blocks.len() > *block_index + BLOCKS_PER_SUPERBLOCK {
// debug_assert!(
// SUPER_BLOCK_SIZE / BLOCK_SIZE == BLOCKS_PER_SUPERBLOCK,
// "change unroll constant to {}",
// 64 - (SUPER_BLOCK_SIZE / BLOCK_SIZE).leading_zeros() - 1
// );
//
// unsafe {
// let bit_nums = _mm256_set_epi16(
// (15 * BLOCK_SIZE) as i16,
// (14 * BLOCK_SIZE) as i16,
// (13 * BLOCK_SIZE) as i16,
// (12 * BLOCK_SIZE) as i16,
// (11 * BLOCK_SIZE) as i16,
// (10 * BLOCK_SIZE) as i16,
// (9 * BLOCK_SIZE) as i16,
// (8 * BLOCK_SIZE) as i16,
// (7 * BLOCK_SIZE) as i16,
// (6 * BLOCK_SIZE) as i16,
// (5 * BLOCK_SIZE) as i16,
// (4 * BLOCK_SIZE) as i16,
// (3 * BLOCK_SIZE) as i16,
// (2 * BLOCK_SIZE) as i16,
// (1 * BLOCK_SIZE) as i16,
// (0 * BLOCK_SIZE) as i16,
// );
//
// let blocks = _mm256_loadu_epi16(self.blocks[*block_index..].as_ptr() as *const i16);
// let ones = _mm256_sub_epi16(bit_nums, blocks);
//
// let ranks = _mm256_set1_epi16(rank as i16);
// let mask = _mm256_cmpgt_epu16_mask(ones, ranks);
//
// debug_assert!(
// mask.count_zeros() > 0,
// "first block should always be zero, but still claims to be greater than rank"
// );
// *block_index += mask.count_zeros() as usize - 1;
// }
// } else {
// self.search_block1_naive(rank, block_at_super_block, block_index)
// }
// }

#[cfg(all(feature = "simd",))]
pub(super) fn search_block1_portable(
&self,
rank: usize,
block_at_super_block: usize,
_block_at_super_block: usize,
block_index: &mut usize,
) {
use std::arch::x86_64::{
_mm256_cmpgt_epu16_mask, _mm256_loadu_epi16, _mm256_set1_epi16, _mm256_set_epi16,
_mm256_sub_epi16,
};
use std::simd::cmp::SimdPartialOrd;
use std::simd::u16x16;

if self.blocks.len() > *block_index + BLOCKS_PER_SUPERBLOCK {
debug_assert!(
SUPER_BLOCK_SIZE / BLOCK_SIZE == BLOCKS_PER_SUPERBLOCK,
"change unroll constant to {}",
64 - (SUPER_BLOCK_SIZE / BLOCK_SIZE).leading_zeros() - 1
);
debug_assert!(
SUPER_BLOCK_SIZE / BLOCK_SIZE == BLOCKS_PER_SUPERBLOCK,
"change unroll constant to {}",
64 - (SUPER_BLOCK_SIZE / BLOCK_SIZE).leading_zeros() - 1
);

unsafe {
let bit_nums = _mm256_set_epi16(
(15 * BLOCK_SIZE) as i16,
(14 * BLOCK_SIZE) as i16,
(13 * BLOCK_SIZE) as i16,
(12 * BLOCK_SIZE) as i16,
(11 * BLOCK_SIZE) as i16,
(10 * BLOCK_SIZE) as i16,
(9 * BLOCK_SIZE) as i16,
(8 * BLOCK_SIZE) as i16,
(7 * BLOCK_SIZE) as i16,
(6 * BLOCK_SIZE) as i16,
(5 * BLOCK_SIZE) as i16,
(4 * BLOCK_SIZE) as i16,
(3 * BLOCK_SIZE) as i16,
(2 * BLOCK_SIZE) as i16,
(1 * BLOCK_SIZE) as i16,
(0 * BLOCK_SIZE) as i16,
);
let bit_nums = u16x16::from([
(0 * BLOCK_SIZE) as u16,
(1 * BLOCK_SIZE) as u16,
(2 * BLOCK_SIZE) as u16,
(3 * BLOCK_SIZE) as u16,
(4 * BLOCK_SIZE) as u16,
(5 * BLOCK_SIZE) as u16,
(6 * BLOCK_SIZE) as u16,
(7 * BLOCK_SIZE) as u16,
(8 * BLOCK_SIZE) as u16,
(9 * BLOCK_SIZE) as u16,
(10 * BLOCK_SIZE) as u16,
(11 * BLOCK_SIZE) as u16,
(12 * BLOCK_SIZE) as u16,
(13 * BLOCK_SIZE) as u16,
(14 * BLOCK_SIZE) as u16,
(15 * BLOCK_SIZE) as u16,
]);

let sentinel = u16x16::default();

let slice: &[u16] = unsafe {
from_raw_parts(
self.blocks[*block_index..].as_ptr() as *const u16,
self.blocks[*block_index..].len(),
)
};
let blocks = u16x16::load_or(slice, sentinel);
let ones = bit_nums - blocks;

let blocks = _mm256_loadu_epi16(self.blocks[*block_index..].as_ptr() as *const i16);
let ones = _mm256_sub_epi16(bit_nums, blocks);
let ranks = u16x16::splat(rank as u16);
let mask = ones.simd_gt(ranks);

let ranks = _mm256_set1_epi16(rank as i16);
let mask = _mm256_cmpgt_epu16_mask(ones, ranks);
// calculate the number of blocks where the number of ones does not exceed rank
// (subtract 48 unused bits because the bitmask has 64 bits)
let num = mask.to_bitmask().count_zeros() - 48;

debug_assert!(
mask.count_zeros() > 0,
"first block should always be zero, but still claims to be greater than rank"
);
*block_index += mask.count_zeros() as usize - 1;
}
} else {
self.search_block1_naive(rank, block_at_super_block, block_index)
}
debug_assert!(
num > 0,
"first block should always be zero, but still claims to be greater than rank"
);
*block_index += num as usize - 1;
}

/// Search for the block in a superblock that contains the rank. This function is only used
/// internally and is not part of the public API.
/// It compares blocks in a loop-unrolled binary search to find the block that contains the rank.
#[cfg(not(all(
feature = "simd",
target_arch = "x86_64",
target_feature = "avx",
target_feature = "avx2",
target_feature = "avx512vl",
target_feature = "avx512bw",
)))]
// #[cfg(not(all(
// feature = "simd",
// target_arch = "x86_64",
// target_feature = "avx",
// target_feature = "avx2",
// target_feature = "avx512vl",
// target_feature = "avx512bw",
// )))]
#[inline(always)]
pub(super) fn search_block1(
&self,
rank: usize,
block_at_super_block: usize,
block_index: &mut usize,
) {
self.search_block1_naive(rank, block_at_super_block, block_index);
self.search_block1_portable(rank, block_at_super_block, block_index);
}

#[inline(always)]
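The replacement `search_block1_portable` swaps the AVX-512 intrinsics for `std::simd`, which is still nightly-only. The core trick is unchanged: reconstruct the ones-count at each block boundary from the stored zeros, compare all 16 lanes against the target rank at once, and popcount the resulting mask. A reduced sketch of just that step, with the superblock bookkeeping omitted:

```rust
// Nightly-only sketch of the lane-compare-and-popcount step.
#![feature(portable_simd)]
use std::simd::cmp::SimdPartialOrd;
use std::simd::u16x16;

const BLOCK_SIZE: u16 = 512; // matches this crate's block size

/// `zeros[i]` = zeros stored before block i within a superblock.
/// Returns how many blocks have at most `rank` ones before them.
fn blocks_not_exceeding(zeros: [u16; 16], rank: u16) -> u32 {
    // ones before block i = i * BLOCK_SIZE - zeros[i]
    let bit_nums = u16x16::from(std::array::from_fn(|i| i as u16 * BLOCK_SIZE));
    let ones = bit_nums - u16x16::from(zeros);
    let mask = ones.simd_gt(u16x16::splat(rank));
    // the bitmask is 64 bits wide; 48 bits are always zero for 16 lanes
    mask.to_bitmask().count_zeros() - 48
}

fn main() {
    let zeros = std::array::from_fn(|i| i as u16 * 100); // 412 ones per block
    assert_eq!(blocks_not_exceeding(zeros, 1000), 3); // blocks 0, 1, 2 qualify
}
```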
2 changes: 1 addition & 1 deletion src/bit_vec/mask.rs
@@ -180,7 +180,7 @@ where
.take(self.vec.len / WORD_SIZE)
.map(|limb| u64::from(limb.count_ones()))
.sum();
if self.vec.len % WORD_SIZE > 0 {
if !self.vec.len.is_multiple_of(WORD_SIZE) {
ones += u64::from(
((self.bin_op)(
*self.vec.data.last().unwrap(),
22 changes: 14 additions & 8 deletions src/bit_vec/mod.rs
@@ -87,7 +87,7 @@ impl BitVec {
#[must_use]
pub fn from_zeros(len: usize) -> Self {
let mut data = vec![0; len / WORD_SIZE];
if len % WORD_SIZE != 0 {
if !len.is_multiple_of(WORD_SIZE) {
data.push(0);
}
Self { data, len }
@@ -98,7 +98,7 @@ impl BitVec {
#[must_use]
pub fn from_ones(len: usize) -> Self {
let mut data = vec![u64::MAX; len / WORD_SIZE];
if len % WORD_SIZE != 0 {
if !len.is_multiple_of(WORD_SIZE) {
data.push((1 << (len % WORD_SIZE)) - 1);
}
Self { data, len }
@@ -500,7 +500,7 @@ impl BitVec {
/// [`append_bit_u8`]: BitVec::append_bit_u8
/// [`append_word`]: BitVec::append_word
pub fn append(&mut self, bit: bool) {
if self.len % WORD_SIZE == 0 {
if self.len.is_multiple_of(WORD_SIZE) {
self.data.push(0);
}
if bit {
@@ -574,7 +574,7 @@ impl BitVec {
/// [`append_bit_u8`]: BitVec::append_bit_u8
/// [`append_word`]: BitVec::append_word
pub fn append_bit(&mut self, bit: u64) {
if self.len % WORD_SIZE == 0 {
if self.len.is_multiple_of(WORD_SIZE) {
self.data.push(0);
}
if bit % 2 == 1 {
@@ -653,7 +653,7 @@ impl BitVec {
/// [`append_bit_u16`]: BitVec::append_bit_u16
/// [`append_bit_u8`]: BitVec::append_bit_u8
pub fn append_word(&mut self, word: u64) {
if self.len % WORD_SIZE == 0 {
if self.len.is_multiple_of(WORD_SIZE) {
self.data.push(word);
} else {
// zero out the unused bits before or-ing the new one, to ensure no garbage data remains
@@ -688,7 +688,7 @@ impl BitVec {
pub fn append_bits(&mut self, bits: u64, len: usize) {
assert!(len <= 64, "Cannot append more than 64 bits");

if self.len % WORD_SIZE == 0 {
if self.len.is_multiple_of(WORD_SIZE) {
self.data.push(bits);
} else {
// zero out the unused bits before or-ing the new one, to ensure no garbage data remains
@@ -725,7 +725,7 @@ impl BitVec {
/// [`append_bits`]: BitVec::append_bits
/// [`drop_last`]: BitVec::drop_last
pub fn append_bits_unchecked(&mut self, bits: u64, len: usize) {
if self.len % WORD_SIZE == 0 {
if self.len.is_multiple_of(WORD_SIZE) {
self.data.push(bits);
} else {
self.data[self.len / WORD_SIZE] |= bits << (self.len % WORD_SIZE);
@@ -820,6 +820,8 @@ impl BitVec {
/// assert_eq!(bv.get(1), Some(0));
/// assert_eq!(bv.get(2), Some(1));
/// ```
///
/// [`get_unchecked`]: Self::get_unchecked
#[must_use]
pub fn get(&self, pos: usize) -> Option<u64> {
if pos >= self.len {
@@ -1043,7 +1045,7 @@ impl BitVec {
.iter()
.map(|limb| u64::from(limb.count_ones()))
.sum();
if self.len % WORD_SIZE > 0 {
if !self.len.is_multiple_of(WORD_SIZE) {
ones += u64::from(
(self.data.last().unwrap() & ((1 << (self.len % WORD_SIZE)) - 1)).count_ones(),
);
@@ -1226,6 +1228,8 @@ impl BitVec {
/// containing the original vector.
///
/// See also: [`split_at_unchecked`]
///
/// [`split_at_unchecked`]: Self::split_at_unchecked
pub fn split_at(self, at: usize) -> Result<(Self, Self), Self> {
if at > self.len {
Err(self)
@@ -1241,6 +1245,8 @@ impl BitVec {
/// If the index is larger than the length of the vector the function will panic or run
/// out of memory.
/// Use [`split_at`] to properly handle this case.
///
/// [`split_at`]: Self::split_at
#[must_use]
pub fn split_at_unchecked(mut self, at: usize) -> (Self, Self) {
let other_len = self.len - at;
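Finally, a short usage sketch for the append API touched in this file. `BitVec::new()` is assumed; `append_bits`, `append_word`, and `get` match the signatures visible in the hunks above:

```rust
use vers_vecs::BitVec;

fn main() {
    let mut bv = BitVec::new(); // assumed constructor
    bv.append_bits(0b1011, 4); // append the low 4 bits, LSB first
    bv.append_word(u64::MAX); // append a full 64-bit word
    assert_eq!(bv.get(0), Some(1));
    assert_eq!(bv.get(2), Some(0));
    assert_eq!(bv.get(4), Some(1)); // first bit of the appended word
}
```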