Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 11 additions & 1 deletion .github/workflows/rust.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,4 +31,14 @@ jobs:
- name: Build
run: cargo build --verbose --features serde
- name: Run tests
run: cargo test --verbose --features serde
run: cargo test --verbose --features serde

docs:
runs-on: ubuntu-latest
env:
RUSTFLAGS: -C target-cpu=x86-64
RUSTDOCFLAGS: -C target-cpu=x86-64
steps:
- uses: actions/checkout@v4
- name: Docs
run: cargo doc --verbose --all-features
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "vers-vecs"
version = "1.7.0"
version = "1.8.1"
edition = "2021"
authors = ["Johannes \"Cydhra\" Hengstler"]
description = "A collection of succinct data structures supported by fast implementations of rank and select queries."
Expand Down
4 changes: 2 additions & 2 deletions benches/bp.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ use vers_vecs::trees::{Tree, TreeBuilder};

mod common;

const BLOCK_SIZE: usize = 1024;
const BLOCK_SIZE: u64 = 1024;

// TODO this function has nlogn runtime, which is a bit too much for the largest trees
fn generate_tree<R: Rng>(rng: &mut R, nodes: u64) -> BpTree<BLOCK_SIZE> {
Expand Down Expand Up @@ -107,7 +107,7 @@ fn bench_navigation(b: &mut Criterion) {
let mut rng = StdRng::from_seed([0; 32]);

let bp = generate_tree(&mut rng, l as u64);
let node_handles = (0..l).map(|i| bp.node_handle(i)).collect::<Vec<_>>();
let node_handles = (0..l as u64).map(|i| bp.node_handle(i)).collect::<Vec<_>>();

group.bench_with_input(BenchmarkId::new("parent", l), &l, |b, _| {
b.iter_batched(
Expand Down
2 changes: 1 addition & 1 deletion benches/elias_fano_iterator.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ fn bench_ef(b: &mut Criterion) {

let start = Instant::now();
while i < iters {
black_box(ef_vec.get_unchecked(i as usize % l));
black_box(ef_vec.get_unchecked(i % l as u64));
i += 1;
}
time += start.elapsed();
Expand Down
4 changes: 2 additions & 2 deletions benches/rmq.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
use criterion::{black_box, criterion_group, criterion_main, BatchSize, BenchmarkId, Criterion};
use rand::distributions::{Distribution, Uniform};
use rand::Rng;
use vers_vecs::rmq::fast_rmq::FastRmq;
use vers_vecs::rmq::small::SmallRmq;

mod common;

Expand All @@ -12,7 +12,7 @@ fn bench_rmq(b: &mut Criterion) {
group.plot_config(common::plot_config());

for l in common::SIZES {
let rmq = FastRmq::from_vec(common::fill_random_vec(&mut rng, l));
let rmq = SmallRmq::from_vec(common::fill_random_vec(&mut rng, l));
let sample = Uniform::new(0, rmq.len());
group.bench_with_input(BenchmarkId::new("range_min", l), &l, |b, _| {
b.iter_batched(
Expand Down
2 changes: 1 addition & 1 deletion benches/select_adversarial.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ fn select_worst_case(b: &mut Criterion) {
// construct a vector with only one select block and put its last one bit at the end
// of the vector

let mut bit_vec = BitVec::with_capacity(length / 64);
let mut bit_vec = BitVec::with_capacity(length as u64 / 64);
for _ in 0..(1usize << 13) / 64 - 1 {
bit_vec.append_word(u64::MAX);
}
Expand Down
4 changes: 2 additions & 2 deletions benches/select_iter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,11 @@ fn bench_select_iter(b: &mut Criterion) {
group.bench_with_input(BenchmarkId::new("select queries", l), &l, |b, _| {
b.iter_custom(|iters| {
let mut time = Duration::new(0, 0);
let mut i = 0usize;
let mut i = 0;
let rank1 = bit_vec.rank1(bit_vec.len());

let start = Instant::now();
while (i as u64) < iters {
while (i) < iters {
black_box(bit_vec.select1(i % rank1));
i += 1;
}
Expand Down
7 changes: 4 additions & 3 deletions benches/sparse_equals.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,14 +22,14 @@ pub const SIZES: [usize; 7] = [
const FILL_FACTORS: [f64; 6] = [0.05, 0.1, 0.2, 0.3, 0.4, 0.5];

/// Generate a bitvector in which a `fill_factor` fraction of the bits is flipped to one at random positions
fn generate_vector_with_fill(rng: &mut ThreadRng, len: usize, fill_factor: f64) -> BitVec {
fn generate_vector_with_fill(rng: &mut ThreadRng, len: u64, fill_factor: f64) -> BitVec {
let mut bit_vec1 = BitVec::from_zeros(len);

// flip exactly fill-factor * len bits so the equality check is not trivial
sample(rng, len, (fill_factor * len as f64) as usize)
sample(rng, len as usize, (fill_factor * len as f64) as usize)
.iter()
.for_each(|i| {
bit_vec1.flip_bit(i);
bit_vec1.flip_bit(i as u64);
});

bit_vec1
Expand All @@ -39,6 +39,7 @@ fn bench(b: &mut Criterion<TimeDiff>) {
let mut rng = rand::thread_rng();

for len in SIZES {
let len = len as u64;
let mut group = b.benchmark_group(format!("Equals Benchmark: {}", len));
group.plot_config(common::plot_config());

Expand Down
29 changes: 29 additions & 0 deletions migrate.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Migration Guide from 1.X to 2.0
This guide explains the changes between versions 1.X and the 2.0 release and points out which changes are necessary
in downstream crates.

## Renamed Members
The following structures, functions, and modules were renamed:
- `BitVec::from_bit_vector` to `BitVec::from_bit_vec`
- `SparseRSVec` to `SparseRsVec`
- `FastRmq` to `SmallRmq`
- `BinaryRmq` to `SparseRmq`
- `BitVec::from_bits` to `BitVec::from_bits_u8`
- module `fast_rs_vec` to `rs`
- module `elias_fano` to `ef`
- module `fast_rmq` to `small`
- module `binary_rmq` to `sparse`

## Changed Index Type
All vector types that operate on bits or sub-byte words are now indexed by `u64` instead of `usize`,
allowing full utilization of the memory on 32-bit architectures.
This affects `BitVec`, `RsVec`, `EliasFano`, `SparseRsVec`, `BpTree`, and `WaveletMatrix`.
It changes the parameter and return types of various functions on the affected types from `usize` to `u64`.
The only adverse effect is that `len()` and `count()` of iterators over these data structures may panic if the
iterator has more than `usize::MAX` elements.

## Changed Backing Structures
`RsVec`, `SparseRmq`, and `SmallRmq` (formerly `FastRmq`) now use `Box<[_]>` instead of `Vec<_>` as their backing storage, which reduces the stack
footprint.
This breaks serde compatibility with previously serialized data.
It also changes the `Deref` implementation of the RMQ structs, which previously returned `Vec<_>`.
2 changes: 0 additions & 2 deletions readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,6 @@ since the intrinsics speed up both `rank` and `select` operations by a factor of
- `simd`: Enables the use of SIMD instructions for rank and select operations.
This feature requires AVX-512 support and uses unsafe code.
It also enables a special iterator for the rank/select bit vector that uses vectorized operations.
The feature only works on nightly Rust.
Enabling it on stable Rust is a no-op, because the required CPU features are not available there.
- `serde`: Enables serialization and deserialization of the data structures using the `serde` crate.
- `u16_lookup`: Enables a larger lookup table for BP tree queries. The larger table requires 128 KiB instead of 4 KiB.

Expand Down
31 changes: 16 additions & 15 deletions src/bit_vec/mask.rs
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ where
/// If the position is greater than or equal to the length of the vector, None is returned.
#[inline]
#[must_use]
pub fn get(&self, pos: usize) -> Option<u64> {
pub fn get(&self, pos: u64) -> Option<u64> {
if pos >= self.vec.len {
None
} else {
Expand All @@ -67,10 +67,10 @@ where
/// [`get`]: MaskedBitVec::get
#[inline]
#[must_use]
pub fn get_unchecked(&self, pos: usize) -> u64 {
pub fn get_unchecked(&self, pos: u64) -> u64 {
((self.bin_op)(
self.vec.data[pos / WORD_SIZE],
self.mask.data[pos / WORD_SIZE],
self.vec.data[(pos / WORD_SIZE) as usize],
self.mask.data[(pos / WORD_SIZE) as usize],
) >> (pos % WORD_SIZE))
& 1
}
Expand All @@ -79,7 +79,7 @@ where
/// If the position is greater than or equal to the length of the vector, None is returned.
#[inline]
#[must_use]
pub fn is_bit_set(&self, pos: usize) -> Option<bool> {
pub fn is_bit_set(&self, pos: u64) -> Option<bool> {
if pos >= self.vec.len {
None
} else {
Expand All @@ -97,7 +97,7 @@ where
/// [`is_bit_set`]: MaskedBitVec::is_bit_set
#[inline]
#[must_use]
pub fn is_bit_set_unchecked(&self, pos: usize) -> bool {
pub fn is_bit_set_unchecked(&self, pos: u64) -> bool {
self.get_unchecked(pos) != 0
}

Expand All @@ -108,7 +108,7 @@ where
/// If the length of the query is zero or larger than 64, None is returned.
#[inline]
#[must_use]
pub fn get_bits(&self, pos: usize, len: usize) -> Option<u64> {
pub fn get_bits(&self, pos: u64, len: u64) -> Option<u64> {
if len > WORD_SIZE || len == 0 {
return None;
}
Expand Down Expand Up @@ -138,12 +138,13 @@ where
#[must_use]
#[allow(clippy::inline_always)]
#[allow(clippy::comparison_chain)] // rust-clippy #5354
#[allow(clippy::cast_possible_truncation)] // safe due to the division
#[inline]
pub fn get_bits_unchecked(&self, pos: usize, len: usize) -> u64 {
pub fn get_bits_unchecked(&self, pos: u64, len: u64) -> u64 {
debug_assert!(len <= WORD_SIZE);
let partial_word = (self.bin_op)(
self.vec.data[pos / WORD_SIZE],
self.mask.data[pos / WORD_SIZE],
self.vec.data[(pos / WORD_SIZE) as usize],
self.mask.data[(pos / WORD_SIZE) as usize],
) >> (pos % WORD_SIZE);

if pos % WORD_SIZE + len == WORD_SIZE {
Expand All @@ -152,8 +153,8 @@ where
partial_word & ((1 << (len % WORD_SIZE)) - 1)
} else {
let next_half = (self.bin_op)(
self.vec.data[pos / WORD_SIZE + 1],
self.mask.data[pos / WORD_SIZE + 1],
self.vec.data[(pos / WORD_SIZE + 1) as usize],
self.mask.data[(pos / WORD_SIZE + 1) as usize],
) << (WORD_SIZE - pos % WORD_SIZE);

(partial_word | next_half) & ((1 << (len % WORD_SIZE)) - 1)
Expand All @@ -167,7 +168,7 @@ where
#[inline]
#[must_use]
pub fn count_zeros(&self) -> u64 {
self.vec.len as u64 - self.count_ones()
self.vec.len - self.count_ones()
}

/// Return the number of ones in the masked bit vector.
Expand All @@ -177,10 +178,10 @@ where
pub fn count_ones(&self) -> u64 {
let mut ones = self
.iter_limbs()
.take(self.vec.len / WORD_SIZE)
.take((self.vec.len / WORD_SIZE) as usize)
.map(|limb| u64::from(limb.count_ones()))
.sum();
if self.vec.len % WORD_SIZE > 0 {
if !self.vec.len.is_multiple_of(WORD_SIZE) {
ones += u64::from(
((self.bin_op)(
*self.vec.data.last().unwrap(),
Expand Down
Loading