From b4fc4f1aad67a87356f9d4f9b8d1bf33d77c41a3 Mon Sep 17 00:00:00 2001
From: Johannes Hengstler <mail.jhengstler@gmail.com>
Date: Sun, 19 Oct 2025 14:45:56 +0200
Subject: [PATCH 01/12] un-hide elided lifetimes because we are so conformant

---
 src/wavelet/mod.rs | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/wavelet/mod.rs b/src/wavelet/mod.rs
index 3d08602..e8676be 100644
--- a/src/wavelet/mod.rs
+++ b/src/wavelet/mod.rs
@@ -1942,7 +1942,7 @@ impl WaveletMatrix {
     /// assert_eq!(iter.collect::<Vec<_>>(), vec![1, 4, 4, 1, 2, 7]);
     /// ```
     #[must_use]
-    pub fn iter_u64(&self) -> Option<WaveletNumRefIter> {
+    pub fn iter_u64(&self) -> Option<WaveletNumRefIter<'_>> {
         if self.bits_per_element() > 64 {
             None
         } else {
@@ -1967,7 +1967,7 @@ impl WaveletMatrix {
     ///
     /// See also [`iter_sorted_u64`] for an iterator that yields `u64` elements.
     #[must_use]
-    pub fn iter_sorted(&self) -> WaveletSortedRefIter {
+    pub fn iter_sorted(&self) -> WaveletSortedRefIter<'_> {
         WaveletSortedRefIter::new(self)
     }
 
@@ -1993,7 +1993,7 @@ impl WaveletMatrix {
     /// assert_eq!(iter.collect::<Vec<_>>(), vec![1, 1, 2, 4, 4, 7]);
     /// ```
     #[must_use]
-    pub fn iter_sorted_u64(&self) -> Option<WaveletSortedNumRefIter> {
+    pub fn iter_sorted_u64(&self) -> Option<WaveletSortedNumRefIter<'_>> {
         if self.bits_per_element() > 64 {
             None
         } else {

From 8829120e578f57ce636d22723f6e5cc06a98243a Mon Sep 17 00:00:00 2001
From: Johannes Hengstler <mail.jhengstler@gmail.com>
Date: Sun, 19 Oct 2025 14:53:21 +0200
Subject: [PATCH 02/12] replace manual modulo checks with is_multiple_of

---
 src/bit_vec/fast_rs_vec/mod.rs |  8 ++++----
 src/bit_vec/mask.rs            |  2 +-
 src/bit_vec/mod.rs             | 16 ++++++++--------
 src/trees/mmt.rs               |  2 +-
 4 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/src/bit_vec/fast_rs_vec/mod.rs b/src/bit_vec/fast_rs_vec/mod.rs
index 2c35643..e33b82f 100644
--- a/src/bit_vec/fast_rs_vec/mod.rs
+++ b/src/bit_vec/fast_rs_vec/mod.rs
@@ -144,7 +144,7 @@ impl RsVec {
             let mut new_zeros = word.count_zeros() as usize;
 
             // in the last block, remove remaining zeros of limb that aren't part of the vector
-            if idx == vec.data.len() - 1 && vec.len % WORD_SIZE > 0 {
+            if idx == vec.data.len() - 1 && !vec.len.is_multiple_of(WORD_SIZE) {
                 let mask = (1 << (vec.len % WORD_SIZE)) - 1;
                 new_zeros -= (word | mask).count_zeros() as usize;
             }
@@ -477,9 +477,9 @@ impl RsVec {
         }
 
         // if last incomplete block exists, test it without junk data
-        if self.len % 64 > 0
-            && self.data[self.len / 64] & ((1 << (self.len % 64)) - 1)
-                != other.data[self.len / 64] & ((1 << (other.len % 64)) - 1)
+        if !self.len.is_multiple_of(WORD_SIZE)
+            && self.data[self.len / WORD_SIZE] & ((1 << (self.len % WORD_SIZE)) - 1)
+                != other.data[self.len / WORD_SIZE] & ((1 << (other.len % WORD_SIZE)) - 1)
         {
             return false;
         }
diff --git a/src/bit_vec/mask.rs b/src/bit_vec/mask.rs
index a146b24..4669829 100644
--- a/src/bit_vec/mask.rs
+++ b/src/bit_vec/mask.rs
@@ -180,7 +180,7 @@ where
             .take(self.vec.len / WORD_SIZE)
             .map(|limb| u64::from(limb.count_ones()))
             .sum();
-        if self.vec.len % WORD_SIZE > 0 {
+        if !self.vec.len.is_multiple_of(WORD_SIZE) {
             ones += u64::from(
                 ((self.bin_op)(
                     *self.vec.data.last().unwrap(),
diff --git a/src/bit_vec/mod.rs b/src/bit_vec/mod.rs
index 056091e..a7844b7 100644
--- a/src/bit_vec/mod.rs
+++ b/src/bit_vec/mod.rs
@@ -87,7 +87,7 @@ impl BitVec {
     #[must_use]
     pub fn from_zeros(len: usize) -> Self {
         let mut data = vec![0; len / WORD_SIZE];
-        if len % WORD_SIZE != 0 {
+        if !len.is_multiple_of(WORD_SIZE) {
             data.push(0);
         }
         Self { data, len }
@@ -98,7 +98,7 @@ impl BitVec {
     #[must_use]
     pub fn from_ones(len: usize) -> Self {
         let mut data = vec![u64::MAX; len / WORD_SIZE];
-        if len % WORD_SIZE != 0 {
+        if !len.is_multiple_of(WORD_SIZE) {
             data.push((1 << (len % WORD_SIZE)) - 1);
         }
         Self { data, len }
@@ -500,7 +500,7 @@ impl BitVec {
     /// [`append_bit_u8`]: BitVec::append_bit_u8
     /// [`append_word`]: BitVec::append_word
     pub fn append(&mut self, bit: bool) {
-        if self.len % WORD_SIZE == 0 {
+        if self.len.is_multiple_of(WORD_SIZE) {
             self.data.push(0);
         }
         if bit {
@@ -574,7 +574,7 @@ impl BitVec {
     /// [`append_bit_u8`]: BitVec::append_bit_u8
     /// [`append_word`]: BitVec::append_word
     pub fn append_bit(&mut self, bit: u64) {
-        if self.len % WORD_SIZE == 0 {
+        if self.len.is_multiple_of(WORD_SIZE) {
             self.data.push(0);
         }
         if bit % 2 == 1 {
@@ -653,7 +653,7 @@ impl BitVec {
     /// [`append_bit_u16`]: BitVec::append_bit_u16
     /// [`append_bit_u8`]: BitVec::append_bit_u8
     pub fn append_word(&mut self, word: u64) {
-        if self.len % WORD_SIZE == 0 {
+        if self.len.is_multiple_of(WORD_SIZE) {
             self.data.push(word);
         } else {
             // zero out the unused bits before or-ing the new one, to ensure no garbage data remains
@@ -688,7 +688,7 @@ impl BitVec {
     pub fn append_bits(&mut self, bits: u64, len: usize) {
         assert!(len <= 64, "Cannot append more than 64 bits");
 
-        if self.len % WORD_SIZE == 0 {
+        if self.len.is_multiple_of(WORD_SIZE) {
             self.data.push(bits);
         } else {
             // zero out the unused bits before or-ing the new one, to ensure no garbage data remains
@@ -725,7 +725,7 @@ impl BitVec {
     /// [`append_bits`]: BitVec::append_bits
     /// [`drop_last`]: BitVec::drop_last
     pub fn append_bits_unchecked(&mut self, bits: u64, len: usize) {
-        if self.len % WORD_SIZE == 0 {
+        if self.len.is_multiple_of(WORD_SIZE) {
             self.data.push(bits);
         } else {
             self.data[self.len / WORD_SIZE] |= bits << (self.len % WORD_SIZE);
@@ -1043,7 +1043,7 @@ impl BitVec {
             .iter()
             .map(|limb| u64::from(limb.count_ones()))
             .sum();
-        if self.len % WORD_SIZE > 0 {
+        if !self.len.is_multiple_of(WORD_SIZE) {
             ones += u64::from(
                 (self.data.last().unwrap() & ((1 << (self.len % WORD_SIZE)) - 1)).count_ones(),
             );
diff --git a/src/trees/mmt.rs b/src/trees/mmt.rs
index e66aa09..d796f2b 100644
--- a/src/trees/mmt.rs
+++ b/src/trees/mmt.rs
@@ -170,7 +170,7 @@ impl MinMaxTree {
     /// Get the index of the left sibling of the node at `index` if it exists
     #[allow(clippy::unused_self)] // self is used for consistency with other methods
     pub(crate) fn left_sibling(&self, index: NonZeroUsize) -> Option<NonZeroUsize> {
-        if index.get() % 2 == 0 {
+        if index.get().is_multiple_of(2) {
             // index is at least 2
             NonZeroUsize::new(index.get() - 1)
         } else {

From f9fb40aaf96e0b1f8ec965b98508f3e715f87cb7 Mon Sep 17 00:00:00 2001
From: Johannes Hengstler <mail.jhengstler@gmail.com>
Date: Sun, 19 Oct 2025 14:54:50 +0200
Subject: [PATCH 03/12] bump patch version

---
 Cargo.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Cargo.toml b/Cargo.toml
index 371da6e..963c846 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "vers-vecs"
-version = "1.7.0"
+version = "1.7.1"
 edition = "2021"
 authors = ["Johannes \"Cydhra\" Hengstler"]
 description = "A collection of succinct data structures supported by fast implementations of rank and select queries."

From ef3b35c2001069f471ec304676c801f786aa3d6f Mon Sep 17 00:00:00 2001
From: Johannes Hengstler <mail.jhengstler@gmail.com>
Date: Sun, 19 Oct 2025 15:05:45 +0200
Subject: [PATCH 04/12] avx512 is stable

---
 src/lib.rs | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index 2e1c297..3baf4c6 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,14 +1,3 @@
-#![cfg_attr(
-    all(
-        feature = "simd",
-        target_arch = "x86_64",
-        target_feature = "avx",
-        target_feature = "avx2",
-        target_feature = "avx512f",
-        target_feature = "avx512bw",
-    ),
-    feature(stdarch_x86_avx512)
-)]
 #![warn(missing_docs)]
 #![allow(clippy::module_name_repetitions)]
 #![allow(clippy::assertions_on_constants)] // for asserts warning about incompatible constant values

From 4fdf71d19279cf4997bc9116f4642ea2118d1c5e Mon Sep 17 00:00:00 2001
From: Johannes Hengstler <mail.jhengstler@gmail.com>
Date: Sun, 19 Oct 2025 15:07:57 +0200
Subject: [PATCH 05/12] since I very explicitly change MSRV it requires
 upgrading to 1.8

---
 Cargo.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Cargo.toml b/Cargo.toml
index 963c846..5a6edde 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "vers-vecs"
-version = "1.7.1"
+version = "1.8.0"
 edition = "2021"
 authors = ["Johannes \"Cydhra\" Hengstler"]
 description = "A collection of succinct data structures supported by fast implementations of rank and select queries."

From 8d2a525a36df9fca907c54ea1708e959e796d5cc Mon Sep 17 00:00:00 2001
From: Johannes Hengstler <mail.jhengstler@gmail.com>
Date: Sun, 19 Oct 2025 15:10:36 +0200
Subject: [PATCH 06/12] update readme about simd feature stability

---
 readme.md | 2 --
 1 file changed, 2 deletions(-)

diff --git a/readme.md b/readme.md
index ea64ed1..d21edac 100644
--- a/readme.md
+++ b/readme.md
@@ -31,8 +31,6 @@ since the intrinsics speed up both `rank` and `select` operations by a factor of
 - `simd`: Enables the use of SIMD instructions for rank and select operations.
 This feature requires AVX-512 support and uses unsafe code.
 It also enables a special iterator for the rank/select bit vector that uses vectorized operations.
-The feature only works on nightly Rust.
-Enabling it on stable Rust is a no-op, because the required CPU features are not available there.
 - `serde`: Enables serialization and deserialization of the data structures using the `serde` crate.
 - `u16_lookup` Enables a larger lookup table for BP tree queries. The larger table requires 128 KiB instead of 4 KiB.
 

From 0232dcbf13319c9c0ed5a596449b7de891d24442 Mon Sep 17 00:00:00 2001
From: Johannes Hengstler <mail.jhengstler@gmail.com>
Date: Sun, 19 Oct 2025 15:31:00 +0200
Subject: [PATCH 07/12] doc_auto_cfg was removed

---
 src/lib.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/lib.rs b/src/lib.rs
index 3baf4c6..0b1d729 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -2,7 +2,7 @@
 #![allow(clippy::module_name_repetitions)]
 #![allow(clippy::assertions_on_constants)] // for asserts warning about incompatible constant values
 #![allow(clippy::inline_always)] // we actually measure performance increases with most of these
-#![cfg_attr(docsrs, feature(doc_cfg), feature(doc_auto_cfg))] // for conditional compilation in docs
+#![cfg_attr(docsrs, feature(doc_cfg))] // for conditional compilation in docs
 
 //! This crate provides a collection of data structures supported by fast implementations of
 //! rank and select queries. The data structures are static, meaning that they cannot be modified

From 2a463177bde4b9c0dd9ddc642efc600f062dbcd6 Mon Sep 17 00:00:00 2001
From: Johannes Hengstler <mail.jhengstler@gmail.com>
Date: Sun, 19 Oct 2025 16:12:58 +0200
Subject: [PATCH 08/12] bump patch for docsrs fix

---
 Cargo.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Cargo.toml b/Cargo.toml
index 5a6edde..7798dd4 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "vers-vecs"
-version = "1.8.0"
+version = "1.8.1"
 edition = "2021"
 authors = ["Johannes \"Cydhra\" Hengstler"]
 description = "A collection of succinct data structures supported by fast implementations of rank and select queries."

From ed0bbaacb5ad871f8c36d054b28ac700192fa9d3 Mon Sep 17 00:00:00 2001
From: Johannes Hengstler <mail.jhengstler@gmail.com>
Date: Sun, 19 Oct 2025 16:26:53 +0200
Subject: [PATCH 09/12] fix broken doc links

---
 src/bit_vec/mod.rs      |  6 ++++++
 src/bit_vec/sparse.rs   |  4 +++-
 src/elias_fano/mod.rs   |  2 ++
 src/trees/bp/builder.rs |  3 ++-
 src/trees/mod.rs        |  7 +++++++
 src/wavelet/mod.rs      | 10 +++++++++-
 6 files changed, 29 insertions(+), 3 deletions(-)

diff --git a/src/bit_vec/mod.rs b/src/bit_vec/mod.rs
index a7844b7..95779cc 100644
--- a/src/bit_vec/mod.rs
+++ b/src/bit_vec/mod.rs
@@ -820,6 +820,8 @@ impl BitVec {
     /// assert_eq!(bv.get(1), Some(0));
     /// assert_eq!(bv.get(2), Some(1));
     /// ```
+    ///
+    /// [`get_unchecked`]: Self::get_unchecked
     #[must_use]
     pub fn get(&self, pos: usize) -> Option<u64> {
         if pos >= self.len {
@@ -1226,6 +1228,8 @@ impl BitVec {
     /// containing the original vector.
     ///
     /// See also: [`split_at_unchecked`]
+    ///
+    /// [`split_at_unchecked`]: Self::split_at_unchecked
     pub fn split_at(self, at: usize) -> Result<(Self, Self), Self> {
         if at > self.len {
             Err(self)
@@ -1241,6 +1245,8 @@ impl BitVec {
     /// If the index is larger than the length of the vector the function will panic or run
     /// out of memory.
     /// Use [`split_at`] to properly handle this case.
+    ///
+    /// [`split_at`]: Self::split_at
     #[must_use]
     pub fn split_at_unchecked(mut self, at: usize) -> (Self, Self) {
         let other_len = self.len - at;
diff --git a/src/bit_vec/sparse.rs b/src/bit_vec/sparse.rs
index bc0dbe5..9e016b5 100644
--- a/src/bit_vec/sparse.rs
+++ b/src/bit_vec/sparse.rs
@@ -2,7 +2,7 @@
 //! The vector requires `O(n log u/n) + 2n + o(n)` bits of space, where `n` is the number of bits in the vector
 //! and `u` is the number of 1-bits.
 //! The vector is constructed from a sorted list of indices of 1-bits, or from an existing
-//! [`BitVec`](crate::BitVec).
+//! [`BitVec`].
 
 use crate::{BitVec, EliasFanoVec};
 
@@ -170,6 +170,8 @@ impl SparseRSVec {
     /// # Panics
     /// If `i` is out of bounds the function might panic or produce incorrect results.
     /// Use [`get`] for a checked version.
+    ///
+    /// [`get`]: Self::get
     #[must_use]
     pub fn get_unchecked(&self, i: u64) -> u64 {
         self.is_set_unchecked(i).into()
diff --git a/src/elias_fano/mod.rs b/src/elias_fano/mod.rs
index 75b009c..0d80410 100644
--- a/src/elias_fano/mod.rs
+++ b/src/elias_fano/mod.rs
@@ -164,6 +164,8 @@ impl EliasFanoVec {
     ///
     /// Note, that select in bit-vectors returns an index, while select in Elias-Fano returns the
     /// element at the given rank.
+    ///
+    /// [`get`]: Self::get
     #[must_use]
     pub fn select(&self, rank: usize) -> Option<u64> {
         self.get(rank)
diff --git a/src/trees/bp/builder.rs b/src/trees/bp/builder.rs
index 753600c..b896d13 100644
--- a/src/trees/bp/builder.rs
+++ b/src/trees/bp/builder.rs
@@ -5,7 +5,8 @@ use crate::BitVec;
 /// A builder for [`BpTrees`] using depth-first traversal of the tree. See the documentation of
 /// [`TreeBuilder`].
 ///
-/// [`BpTree`]: BpTree
+/// [`BpTrees`]: BpTree
+/// [`TreeBuilder`]: TreeBuilder
 pub struct BpBuilder<const BLOCK_SIZE: usize = DEFAULT_BLOCK_SIZE> {
     excess: i64,
     bit_vec: BitVec,
diff --git a/src/trees/mod.rs b/src/trees/mod.rs
index 3e2f0eb..5df9bc8 100644
--- a/src/trees/mod.rs
+++ b/src/trees/mod.rs
@@ -122,6 +122,10 @@ pub trait LevelTree: Tree {
 ///
 /// Once the full tree has been visited, the caller must call [`build`] to create an instance of the
 /// implementing tree type.
+///
+/// [`enter_node`]: TreeBuilder::enter_node
+/// [`leave_node`]: TreeBuilder::leave_node
+/// [`build`]: TreeBuilder::build
 pub trait TreeBuilder {
     /// The tree type constructed with this interface
     type Tree;
@@ -139,5 +143,8 @@ pub trait TreeBuilder {
     /// (i.e. there are nodes for which [`leave_node`] has not been called,
     /// or there are more calls to `leave_node` than to [`enter_node`];
     /// the number of extraneous calls to `enter_node` is returned in the error).
+    ///
+    /// [`leave_node`]: Self::leave_node
+    /// [`enter_node`]: Self::enter_node
     fn build(self) -> Result<Self::Tree, i64>;
 }
diff --git a/src/wavelet/mod.rs b/src/wavelet/mod.rs
index e8676be..4795436 100644
--- a/src/wavelet/mod.rs
+++ b/src/wavelet/mod.rs
@@ -62,6 +62,10 @@ use std::ops::Range;
 /// ```
 ///
 /// [`RsVec`]: RsVec
+/// [`from_bit_vec`]: WaveletMatrix::from_bit_vec
+/// [`from_slice`]: WaveletMatrix::from_slice
+/// [`from_bit_vec_pc`]: WaveletMatrix::from_bit_vec_pc
+/// [`from_slice_pc`]: WaveletMatrix::from_slice_pc
 #[derive(Clone, Debug)]
 #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
 pub struct WaveletMatrix {
@@ -1080,7 +1084,7 @@ impl WaveletMatrix {
     /// Get the `k`-th smallest element in the encoded sequence in the specified `range`,
     /// where `k = 0` returns the smallest element.
     /// The `range` is a half-open interval, meaning that the `end` index is exclusive.
-    /// The `k`-th smallest element is returned as a `BitVec`,
+    /// The `k`-th smallest element is returned as a [`BitVec`],
     /// where the least significant bit is the first element.
     ///
     /// Returns `None` if the `range` is out of bounds, or if `k` is greater than the size of the range.
@@ -1114,6 +1118,8 @@ impl WaveletMatrix {
     ///
     /// # Panics
     /// May panic if the `i` is out of bounds, or returns an empty bit vector.
+    ///
+    /// [`get_sorted`]: Self::get_sorted
     #[must_use]
     pub fn get_sorted_unchecked(&self, i: usize) -> BitVec {
         self.quantile_unchecked(0..self.len(), i)
@@ -1966,6 +1972,8 @@ impl WaveletMatrix {
     /// The iterator yields `BitVec` elements.
     ///
     /// See also [`iter_sorted_u64`] for an iterator that yields `u64` elements.
+    ///
+    /// [`iter_sorted_u64`]: Self::iter_sorted_u64
     #[must_use]
     pub fn iter_sorted(&self) -> WaveletSortedRefIter<'_> {
         WaveletSortedRefIter::new(self)

From e22dfbd54933bfc5be33e766c506fbede6ea045e Mon Sep 17 00:00:00 2001
From: Johannes Hengstler <mail.jhengstler@gmail.com>
Date: Sun, 19 Oct 2025 16:34:24 +0200
Subject: [PATCH 10/12] add workflow for docs

---
 .github/workflows/rust.yml | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
index c0b0208..2daeb6f 100644
--- a/.github/workflows/rust.yml
+++ b/.github/workflows/rust.yml
@@ -31,4 +31,14 @@ jobs:
       - name: Build
         run: cargo build --verbose --features serde
       - name: Run tests
-        run: cargo test --verbose --features serde
\ No newline at end of file
+        run: cargo test --verbose --features serde
+
+  docs:
+    runs-on: ubuntu-latest
+    env:
+      RUSTFLAGS: -C target-cpu=x86-64
+      RUSTDOCFLAGS: -C target-cpu=x86-64
+    steps:
+      - uses: actions/checkout@v4
+      - name: Docs
+        run: cargo doc --verbose --all-features
\ No newline at end of file

From 13a356afb0245d8f51f390b63eb4804df5adc5a5 Mon Sep 17 00:00:00 2001
From: Cydhra <ubezl@student.kit.edu>
Date: Sun, 19 Oct 2025 23:43:34 +0200
Subject: [PATCH 11/12] change `const` to `static` for lookup table to prevent
 expensive (and transitively applied) inlining (#37)

---
 benches/bp.rs                | 2 --
 src/trees/bp/lookup.rs       | 2 +-
 src/trees/bp/lookup_query.rs | 4 ++--
 src/trees/bp/mod.rs          | 2 --
 4 files changed, 3 insertions(+), 7 deletions(-)

diff --git a/benches/bp.rs b/benches/bp.rs
index c278694..941f9d1 100644
--- a/benches/bp.rs
+++ b/benches/bp.rs
@@ -1,5 +1,3 @@
-#![allow(long_running_const_eval)]
-
 use criterion::{black_box, criterion_group, criterion_main, BatchSize, BenchmarkId, Criterion};
 use rand::rngs::StdRng;
 use rand::{Rng, SeedableRng};
diff --git a/src/trees/bp/lookup.rs b/src/trees/bp/lookup.rs
index 2c8fc8b..0095fd9 100644
--- a/src/trees/bp/lookup.rs
+++ b/src/trees/bp/lookup.rs
@@ -48,7 +48,7 @@ const LOOKUP_MAX_VALUE: u32 = u8::MAX as u32;
 ///
 /// The rest of the bits are zero.
 #[allow(long_running_const_eval)]
-const PAREN_BLOCK_LOOKUP: [EncodedTableType; 1 << LOOKUP_BLOCK_SIZE] = calculate_lookup_table();
+static PAREN_BLOCK_LOOKUP: [EncodedTableType; 1 << LOOKUP_BLOCK_SIZE] = calculate_lookup_table();
 
 /// Offset to add to encoded excess values, so negative numbers are stored as positive integers, reducing
 /// encoding complexity
diff --git a/src/trees/bp/lookup_query.rs b/src/trees/bp/lookup_query.rs
index a16aea9..e817743 100644
--- a/src/trees/bp/lookup_query.rs
+++ b/src/trees/bp/lookup_query.rs
@@ -22,14 +22,14 @@ const LOOKUP_MAX_VALUE: u32 = u8::MAX as u32;
 /// to dual-encode negative excess), and another 51 bits for all 17 queries that may end in this block
 /// (-8 to 8 relative excess).
 #[allow(long_running_const_eval)]
-const PAREN_BLOCK_LOOKUP_FWD: [u64; 1 << LOOKUP_BLOCK_SIZE] = calculate_lookup_table(true);
+static PAREN_BLOCK_LOOKUP_FWD: [u64; 1 << LOOKUP_BLOCK_SIZE] = calculate_lookup_table(true);
 
 /// Encoded bwd query results for all possible 8-bit blocks.
 /// The encoding reserves 10 bits for minimum and maximum excess (shifted by 8 bits so we don't have
 /// to dual-encode negative excess), and another 51 bits for all 17 queries that may end in this block
 /// (-8 to 8 relative excess).
 #[allow(long_running_const_eval)]
-const PAREN_BLOCK_LOOKUP_BWD: [u64; 1 << LOOKUP_BLOCK_SIZE] = calculate_lookup_table(false);
+static PAREN_BLOCK_LOOKUP_BWD: [u64; 1 << LOOKUP_BLOCK_SIZE] = calculate_lookup_table(false);
 
 /// Bitmask for one of the lookup values.
 const ENCODING_MASK: u64 = 0b11111;
diff --git a/src/trees/bp/mod.rs b/src/trees/bp/mod.rs
index 6b9e89c..43e9e13 100644
--- a/src/trees/bp/mod.rs
+++ b/src/trees/bp/mod.rs
@@ -85,7 +85,6 @@ use lookup_query::{process_block_bwd, process_block_fwd, LOOKUP_BLOCK_SIZE};
 /// The high-level approach to building a tree is to use the [`BpBuilder`] to construct the tree
 /// using depth-first traversal of all its nodes.
 /// ```rust
-/// # #![allow(long_running_const_eval)] // for some reason this is needed for test cases
 /// use vers_vecs::{BitVec, BpBuilder, BpTree, TreeBuilder, Tree};
 ///
 /// let mut builder = BpBuilder::<512>::new();
@@ -119,7 +118,6 @@ use lookup_query::{process_block_bwd, process_block_fwd, LOOKUP_BLOCK_SIZE};
 /// This is also how trees with unbalanced parenthesis expressions can be constructed.
 ///
 /// ```rust
-/// # #![allow(long_running_const_eval)]
 /// use vers_vecs::{BitVec, BpTree, Tree};
 /// let bv = BitVec::pack_sequence_u8(&[0b1101_0111, 0b0010_0100], 8);
 /// let tree = BpTree::<4>::from_bit_vector(bv);

From 365964c8278e0db09c482965bea7ae16f96a996d Mon Sep 17 00:00:00 2001
From: Johannes Hengstler <mail.jhengstler@gmail.com>
Date: Sun, 26 Oct 2025 15:46:01 +0100
Subject: [PATCH 12/12] experimental port simd setup with fallbacks disabled
 and not compiling on stable

---
 src/bit_vec/fast_rs_vec/mod.rs    |   1 +
 src/bit_vec/fast_rs_vec/select.rs | 185 ++++++++++++++++++++----------
 src/lib.rs                        |   1 +
 3 files changed, 125 insertions(+), 62 deletions(-)

diff --git a/src/bit_vec/fast_rs_vec/mod.rs b/src/bit_vec/fast_rs_vec/mod.rs
index e33b82f..ed7701f 100644
--- a/src/bit_vec/fast_rs_vec/mod.rs
+++ b/src/bit_vec/fast_rs_vec/mod.rs
@@ -42,6 +42,7 @@ const SELECT_BLOCK_SIZE: usize = 1 << 13;
 /// always stores the number zero, which serves as a sentinel value to avoid special-casing the
 /// first block in a super-block (which would be a performance hit due branch prediction failures).
 #[derive(Clone, Copy, Debug)]
+#[repr(C)]
 #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
 struct BlockDescriptor {
     zeros: u16,
diff --git a/src/bit_vec/fast_rs_vec/select.rs b/src/bit_vec/fast_rs_vec/select.rs
index b8721d7..4e2a6a3 100644
--- a/src/bit_vec/fast_rs_vec/select.rs
+++ b/src/bit_vec/fast_rs_vec/select.rs
@@ -4,6 +4,7 @@ use crate::bit_vec::fast_rs_vec::{BLOCK_SIZE, SELECT_BLOCK_SIZE, SUPER_BLOCK_SIZ
 use crate::bit_vec::WORD_SIZE;
 use crate::util::pdep::Pdep;
 use crate::util::unroll;
+use std::slice::from_raw_parts;
 
 /// A safety constant for assertions to make sure that the block size doesn't change without
 /// adjusting the code.
@@ -229,81 +230,141 @@ impl super::RsVec {
     /// It loads the entire block into a SIMD register and compares the rank to the number of ones
     /// in the block. The resulting mask is popcounted to find how many blocks from the block boundary
     /// the rank is.
-    #[cfg(all(
-        feature = "simd",
-        target_arch = "x86_64",
-        target_feature = "avx",
-        target_feature = "avx2",
-        target_feature = "avx512vl",
-        target_feature = "avx512bw",
-    ))]
-    #[inline(always)]
-    pub(super) fn search_block1(
+    // #[cfg(all(
+    //     feature = "simd",
+    //     target_arch = "x86_64",
+    //     target_feature = "avx",
+    //     target_feature = "avx2",
+    //     target_feature = "avx512vl",
+    //     target_feature = "avx512bw",
+    // ))]
+    // #[inline(always)]
+    // pub(super) fn search_block1(
+    //     &self,
+    //     rank: usize,
+    //     block_at_super_block: usize,
+    //     block_index: &mut usize,
+    // ) {
+    //     use std::arch::x86_64::{
+    //         _mm256_cmpgt_epu16_mask, _mm256_loadu_epi16, _mm256_set1_epi16, _mm256_set_epi16,
+    //         _mm256_sub_epi16,
+    //     };
+    //
+    //     if self.blocks.len() > *block_index + BLOCKS_PER_SUPERBLOCK {
+    //         debug_assert!(
+    //             SUPER_BLOCK_SIZE / BLOCK_SIZE == BLOCKS_PER_SUPERBLOCK,
+    //             "change unroll constant to {}",
+    //             64 - (SUPER_BLOCK_SIZE / BLOCK_SIZE).leading_zeros() - 1
+    //         );
+    //
+    //         unsafe {
+    //             let bit_nums = _mm256_set_epi16(
+    //                 (15 * BLOCK_SIZE) as i16,
+    //                 (14 * BLOCK_SIZE) as i16,
+    //                 (13 * BLOCK_SIZE) as i16,
+    //                 (12 * BLOCK_SIZE) as i16,
+    //                 (11 * BLOCK_SIZE) as i16,
+    //                 (10 * BLOCK_SIZE) as i16,
+    //                 (9 * BLOCK_SIZE) as i16,
+    //                 (8 * BLOCK_SIZE) as i16,
+    //                 (7 * BLOCK_SIZE) as i16,
+    //                 (6 * BLOCK_SIZE) as i16,
+    //                 (5 * BLOCK_SIZE) as i16,
+    //                 (4 * BLOCK_SIZE) as i16,
+    //                 (3 * BLOCK_SIZE) as i16,
+    //                 (2 * BLOCK_SIZE) as i16,
+    //                 (1 * BLOCK_SIZE) as i16,
+    //                 (0 * BLOCK_SIZE) as i16,
+    //             );
+    //
+    //             let blocks = _mm256_loadu_epi16(self.blocks[*block_index..].as_ptr() as *const i16);
+    //             let ones = _mm256_sub_epi16(bit_nums, blocks);
+    //
+    //             let ranks = _mm256_set1_epi16(rank as i16);
+    //             let mask = _mm256_cmpgt_epu16_mask(ones, ranks);
+    //
+    //             debug_assert!(
+    //                 mask.count_zeros() > 0,
+    //                 "first block should always be zero, but still claims to be greater than rank"
+    //             );
+    //             *block_index += mask.count_zeros() as usize - 1;
+    //         }
+    //     } else {
+    //         self.search_block1_naive(rank, block_at_super_block, block_index)
+    //     }
+    // }
+
+    #[cfg(all(feature = "simd",))]
+    pub(super) fn search_block1_portable(
         &self,
         rank: usize,
-        block_at_super_block: usize,
+        _block_at_super_block: usize,
         block_index: &mut usize,
     ) {
-        use std::arch::x86_64::{
-            _mm256_cmpgt_epu16_mask, _mm256_loadu_epi16, _mm256_set1_epi16, _mm256_set_epi16,
-            _mm256_sub_epi16,
-        };
+        use std::simd::cmp::SimdPartialOrd;
+        use std::simd::u16x16;
 
-        if self.blocks.len() > *block_index + BLOCKS_PER_SUPERBLOCK {
-            debug_assert!(
-                SUPER_BLOCK_SIZE / BLOCK_SIZE == BLOCKS_PER_SUPERBLOCK,
-                "change unroll constant to {}",
-                64 - (SUPER_BLOCK_SIZE / BLOCK_SIZE).leading_zeros() - 1
-            );
+        debug_assert!(
+            SUPER_BLOCK_SIZE / BLOCK_SIZE == BLOCKS_PER_SUPERBLOCK,
+            "change unroll constant to {}",
+            64 - (SUPER_BLOCK_SIZE / BLOCK_SIZE).leading_zeros() - 1
+        );
 
-            unsafe {
-                let bit_nums = _mm256_set_epi16(
-                    (15 * BLOCK_SIZE) as i16,
-                    (14 * BLOCK_SIZE) as i16,
-                    (13 * BLOCK_SIZE) as i16,
-                    (12 * BLOCK_SIZE) as i16,
-                    (11 * BLOCK_SIZE) as i16,
-                    (10 * BLOCK_SIZE) as i16,
-                    (9 * BLOCK_SIZE) as i16,
-                    (8 * BLOCK_SIZE) as i16,
-                    (7 * BLOCK_SIZE) as i16,
-                    (6 * BLOCK_SIZE) as i16,
-                    (5 * BLOCK_SIZE) as i16,
-                    (4 * BLOCK_SIZE) as i16,
-                    (3 * BLOCK_SIZE) as i16,
-                    (2 * BLOCK_SIZE) as i16,
-                    (1 * BLOCK_SIZE) as i16,
-                    (0 * BLOCK_SIZE) as i16,
-                );
+        let bit_nums = u16x16::from([
+            (0 * BLOCK_SIZE) as u16,
+            (1 * BLOCK_SIZE) as u16,
+            (2 * BLOCK_SIZE) as u16,
+            (3 * BLOCK_SIZE) as u16,
+            (4 * BLOCK_SIZE) as u16,
+            (5 * BLOCK_SIZE) as u16,
+            (6 * BLOCK_SIZE) as u16,
+            (7 * BLOCK_SIZE) as u16,
+            (8 * BLOCK_SIZE) as u16,
+            (9 * BLOCK_SIZE) as u16,
+            (10 * BLOCK_SIZE) as u16,
+            (11 * BLOCK_SIZE) as u16,
+            (12 * BLOCK_SIZE) as u16,
+            (13 * BLOCK_SIZE) as u16,
+            (14 * BLOCK_SIZE) as u16,
+            (15 * BLOCK_SIZE) as u16,
+        ]);
+
+        let sentinel = u16x16::default();
+
+        let slice: &[u16] = unsafe {
+            from_raw_parts(
+                self.blocks[*block_index..].as_ptr() as *const u16,
+                self.blocks[*block_index..].len(),
+            )
+        };
+        let blocks = u16x16::load_or(slice, sentinel);
+        let ones = bit_nums - blocks;
 
-                let blocks = _mm256_loadu_epi16(self.blocks[*block_index..].as_ptr() as *const i16);
-                let ones = _mm256_sub_epi16(bit_nums, blocks);
+        let ranks = u16x16::splat(rank as u16);
+        let mask = ones.simd_gt(ranks);
 
-                let ranks = _mm256_set1_epi16(rank as i16);
-                let mask = _mm256_cmpgt_epu16_mask(ones, ranks);
+        // count the blocks whose number of ones does not exceed rank
+        // (`to_bitmask` yields 64 bits for the 16-lane mask, so drop the 48 unused zero bits)
+        let num = mask.to_bitmask().count_zeros() - 48;
 
-                debug_assert!(
-                    mask.count_zeros() > 0,
-                    "first block should always be zero, but still claims to be greater than rank"
-                );
-                *block_index += mask.count_zeros() as usize - 1;
-            }
-        } else {
-            self.search_block1_naive(rank, block_at_super_block, block_index)
-        }
+        debug_assert!(
+            num > 0,
+            "first block should always be zero, but still claims to be greater than rank"
+        );
+        *block_index += num as usize - 1;
     }
 
     /// Search for the block in a superblock that contains the rank. This function is only used
     /// internally and is not part of the public API.
     /// It compares blocks in a loop-unrolled binary search to find the block that contains the rank.
-    #[cfg(not(all(
-        feature = "simd",
-        target_arch = "x86_64",
-        target_feature = "avx",
-        target_feature = "avx2",
-        target_feature = "avx512vl",
-        target_feature = "avx512bw",
-    )))]
+    // #[cfg(not(all(
+    //     feature = "simd",
+    //     target_arch = "x86_64",
+    //     target_feature = "avx",
+    //     target_feature = "avx2",
+    //     target_feature = "avx512vl",
+    //     target_feature = "avx512bw",
+    // )))]
     #[inline(always)]
     pub(super) fn search_block1(
         &self,
@@ -311,7 +372,7 @@ impl super::RsVec {
         block_at_super_block: usize,
         block_index: &mut usize,
     ) {
-        self.search_block1_naive(rank, block_at_super_block, block_index);
+        self.search_block1_portable(rank, block_at_super_block, block_index);
     }
 
     #[inline(always)]
diff --git a/src/lib.rs b/src/lib.rs
index 0b1d729..efa9d31 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -3,6 +3,7 @@
 #![allow(clippy::assertions_on_constants)] // for asserts warning about incompatible constant values
 #![allow(clippy::inline_always)] // we actually measure performance increases with most of these
 #![cfg_attr(docsrs, feature(doc_cfg))] // for conditional compilation in docs
+#![cfg_attr(feature = "simd", feature(portable_simd))] // portable_simd feature on nightly
 
 //! This crate provides a collection of data structures supported by fast implementations of
 //! rank and select queries. The data structures are static, meaning that they cannot be modified