From fba4a8c81f0c9910d0f65713c9603e963e034728 Mon Sep 17 00:00:00 2001 From: husharp Date: Thu, 1 Feb 2024 21:51:21 +0800 Subject: [PATCH 01/22] week1_day1 Signed-off-by: husharp --- mini-lsm-starter/src/lsm_storage.rs | 74 +++- mini-lsm-starter/src/mem_table.rs | 21 +- mini-lsm-starter/src/tests.rs | 3 + mini-lsm-starter/src/tests/harness.rs | 436 +++++++++++++++++++++++ mini-lsm-starter/src/tests/week1_day1.rs | 149 ++++++++ 5 files changed, 671 insertions(+), 12 deletions(-) create mode 100644 mini-lsm-starter/src/tests/harness.rs create mode 100644 mini-lsm-starter/src/tests/week1_day1.rs diff --git a/mini-lsm-starter/src/lsm_storage.rs b/mini-lsm-starter/src/lsm_storage.rs index 428ef25bf..049e52044 100644 --- a/mini-lsm-starter/src/lsm_storage.rs +++ b/mini-lsm-starter/src/lsm_storage.rs @@ -262,8 +262,27 @@ impl LsmStorageInner { } /// Get a key from the storage. In day 7, this can be further optimized by using a bloom filter. - pub fn get(&self, _key: &[u8]) -> Result> { - unimplemented!() + pub fn get(&self, key: &[u8]) -> Result> { + let guard = self.state.read(); + // search memtable firstly + if let Some(value) = guard.memtable.get(key) { + if value.is_empty() { + return Ok(None); + } + return Ok(Some(value)); + } + + // traverse imm-memtable + for memtable in guard.imm_memtables.iter() { + if let Some(value) = memtable.get(key) { + if value.is_empty() { + return Ok(None); + } + return Ok(Some(value)); + } + } + + Ok(None) } /// Write a batch of data into the storage. Implement in week 2 day 7. @@ -272,13 +291,45 @@ impl LsmStorageInner { } /// Put a key-value pair into the storage by writing into the current memtable. 
- pub fn put(&self, _key: &[u8], _value: &[u8]) -> Result<()> { - unimplemented!() + pub fn put(&self, key: &[u8], value: &[u8]) -> Result<()> { + let size; + { + let guard = self.state.write(); + guard.memtable.put(key, value)?; + size = guard.memtable.approximate_size(); + } + + self.try_freeze(size)?; + + Ok(()) } /// Remove a key from the storage by writing an empty value. - pub fn delete(&self, _key: &[u8]) -> Result<()> { - unimplemented!() + pub fn delete(&self, key: &[u8]) -> Result<()> { + let size; + { + let guard = self.state.write(); + guard.memtable.put(key, b"")?; + size = guard.memtable.approximate_size(); + } + + self.try_freeze(size)?; + + Ok(()) + } + + pub fn try_freeze(&self, size: usize) -> Result<()> { + // using double check for concurrency + if size >= self.options.target_sst_size { + let state_lock = self.state_lock.lock(); + let guard = self.state.read(); + + if guard.memtable.approximate_size() >= self.options.target_sst_size { + drop(guard); + self.force_freeze_memtable(&state_lock)?; + } + } + Ok(()) } pub(crate) fn path_of_sst_static(path: impl AsRef, id: usize) -> PathBuf { @@ -303,7 +354,16 @@ impl LsmStorageInner { /// Force freeze the current memtable to an immutable memtable pub fn force_freeze_memtable(&self, _state_lock_observer: &MutexGuard<'_, ()>) -> Result<()> { - unimplemented!() + let new_memtable = Arc::new(MemTable::create(self.next_sst_id())); + { + let mut guard = self.state.write(); + let mut snapshot = guard.as_ref().clone(); + let old_memtable = std::mem::replace(&mut snapshot.memtable, new_memtable); + // imm_memtables.first() should be the last frozen memtable + snapshot.imm_memtables.insert(0, old_memtable); + *guard = Arc::new(snapshot); + } + Ok(()) } /// Force flush the earliest-created immutable memtable to disk diff --git a/mini-lsm-starter/src/mem_table.rs b/mini-lsm-starter/src/mem_table.rs index 57c961482..aa375e1a9 100644 --- a/mini-lsm-starter/src/mem_table.rs +++ b/mini-lsm-starter/src/mem_table.rs @@ 
-38,7 +38,12 @@ pub(crate) fn map_bound(bound: Bound<&[u8]>) -> Bound { impl MemTable { /// Create a new mem-table. pub fn create(_id: usize) -> Self { - unimplemented!() + Self { + map: Arc::new(SkipMap::new()), + wal: None, + id: _id, + approximate_size: Arc::new(AtomicUsize::new(0)), + } } /// Create a new mem-table with WAL @@ -68,16 +73,22 @@ impl MemTable { } /// Get a value by key. - pub fn get(&self, _key: &[u8]) -> Option { - unimplemented!() + pub fn get(&self, key: &[u8]) -> Option { + self.map.get(key).map(|x| x.value().clone()) } /// Put a key-value pair into the mem-table. /// /// In week 1, day 1, simply put the key-value pair into the skipmap. /// In week 2, day 6, also flush the data to WAL. - pub fn put(&self, _key: &[u8], _value: &[u8]) -> Result<()> { - unimplemented!() + pub fn put(&self, key: &[u8], value: &[u8]) -> Result<()> { + self.approximate_size.fetch_add( + key.len() + value.len(), + std::sync::atomic::Ordering::Relaxed, + ); + self.map + .insert(Bytes::copy_from_slice(key), Bytes::copy_from_slice(value)); + Ok(()) } pub fn sync_wal(&self) -> Result<()> { diff --git a/mini-lsm-starter/src/tests.rs b/mini-lsm-starter/src/tests.rs index 688adfa83..283b258ac 100644 --- a/mini-lsm-starter/src/tests.rs +++ b/mini-lsm-starter/src/tests.rs @@ -1,2 +1,5 @@ //! DO NOT MODIFY -- Mini-LSM tests modules //! This file will be automatically rewritten by the copy-test command. 
+ +mod harness; +mod week1_day1; diff --git a/mini-lsm-starter/src/tests/harness.rs b/mini-lsm-starter/src/tests/harness.rs new file mode 100644 index 000000000..104cfdaec --- /dev/null +++ b/mini-lsm-starter/src/tests/harness.rs @@ -0,0 +1,436 @@ +use std::{ + collections::BTreeMap, ops::Bound, os::unix::fs::MetadataExt, path::Path, sync::Arc, + time::Duration, +}; + +use anyhow::{bail, Result}; +use bytes::Bytes; + +use crate::{ + compact::{ + CompactionOptions, LeveledCompactionOptions, SimpleLeveledCompactionOptions, + TieredCompactionOptions, + }, + iterators::{merge_iterator::MergeIterator, StorageIterator}, + key::{KeySlice, TS_ENABLED}, + lsm_storage::{BlockCache, LsmStorageInner, LsmStorageState, MiniLsm}, + table::{SsTable, SsTableBuilder, SsTableIterator}, +}; + +#[derive(Clone)] +pub struct MockIterator { + pub data: Vec<(Bytes, Bytes)>, + pub error_when: Option, + pub index: usize, +} + +impl MockIterator { + pub fn new(data: Vec<(Bytes, Bytes)>) -> Self { + Self { + data, + index: 0, + error_when: None, + } + } + + pub fn new_with_error(data: Vec<(Bytes, Bytes)>, error_when: usize) -> Self { + Self { + data, + index: 0, + error_when: Some(error_when), + } + } +} + +impl StorageIterator for MockIterator { + type KeyType<'a> = KeySlice<'a>; + + fn next(&mut self) -> Result<()> { + if self.index < self.data.len() { + self.index += 1; + } + if let Some(error_when) = self.error_when { + if self.index == error_when { + bail!("fake error!"); + } + } + Ok(()) + } + + fn key(&self) -> KeySlice { + if let Some(error_when) = self.error_when { + if self.index >= error_when { + panic!("invalid access after next returns an error!"); + } + } + KeySlice::for_testing_from_slice_no_ts(self.data[self.index].0.as_ref()) + } + + fn value(&self) -> &[u8] { + if let Some(error_when) = self.error_when { + if self.index >= error_when { + panic!("invalid access after next returns an error!"); + } + } + self.data[self.index].1.as_ref() + } + + fn is_valid(&self) -> bool { + if 
let Some(error_when) = self.error_when { + if self.index >= error_when { + panic!("invalid access after next returns an error!"); + } + } + self.index < self.data.len() + } +} + +pub fn as_bytes(x: &[u8]) -> Bytes { + Bytes::copy_from_slice(x) +} + +pub fn check_iter_result_by_key(iter: &mut I, expected: Vec<(Bytes, Bytes)>) +where + I: for<'a> StorageIterator = KeySlice<'a>>, +{ + for (k, v) in expected { + assert!(iter.is_valid()); + assert_eq!( + k, + iter.key().for_testing_key_ref(), + "expected key: {:?}, actual key: {:?}", + k, + as_bytes(iter.key().for_testing_key_ref()), + ); + assert_eq!( + v, + iter.value(), + "expected value: {:?}, actual value: {:?}", + v, + as_bytes(iter.value()), + ); + iter.next().unwrap(); + } + assert!(!iter.is_valid()); +} + +#[allow(dead_code)] +pub fn check_iter_result_by_key_and_ts(iter: &mut I, expected: Vec<((Bytes, u64), Bytes)>) +where + I: for<'a> StorageIterator = KeySlice<'a>>, +{ + for ((k, ts), v) in expected { + assert!(iter.is_valid()); + assert_eq!( + (&k[..], ts), + ( + iter.key().for_testing_key_ref(), + iter.key().for_testing_ts() + ), + "expected key: {:?}@{}, actual key: {:?}@{}", + k, + ts, + as_bytes(iter.key().for_testing_key_ref()), + iter.key().for_testing_ts(), + ); + assert_eq!( + v, + iter.value(), + "expected value: {:?}, actual value: {:?}", + v, + as_bytes(iter.value()), + ); + iter.next().unwrap(); + } + assert!(!iter.is_valid()); +} + +pub fn check_lsm_iter_result_by_key(iter: &mut I, expected: Vec<(Bytes, Bytes)>) +where + I: for<'a> StorageIterator = &'a [u8]>, +{ + for (k, v) in expected { + assert!(iter.is_valid()); + assert_eq!( + k, + iter.key(), + "expected key: {:?}, actual key: {:?}", + k, + as_bytes(iter.key()), + ); + assert_eq!( + v, + iter.value(), + "expected value: {:?}, actual value: {:?}", + v, + as_bytes(iter.value()), + ); + iter.next().unwrap(); + } + assert!(!iter.is_valid()); +} + +pub fn expect_iter_error(mut iter: impl StorageIterator) { + loop { + match iter.next() { + 
Ok(_) if iter.is_valid() => continue, + Ok(_) => panic!("expect an error"), + Err(_) => break, + } + } +} + +pub fn generate_sst( + id: usize, + path: impl AsRef, + data: Vec<(Bytes, Bytes)>, + block_cache: Option>, +) -> SsTable { + let mut builder = SsTableBuilder::new(128); + for (key, value) in data { + builder.add(KeySlice::for_testing_from_slice_no_ts(&key[..]), &value[..]); + } + builder.build(id, block_cache, path.as_ref()).unwrap() +} + +#[allow(dead_code)] +pub fn generate_sst_with_ts( + id: usize, + path: impl AsRef, + data: Vec<((Bytes, u64), Bytes)>, + block_cache: Option>, +) -> SsTable { + let mut builder = SsTableBuilder::new(128); + for ((key, ts), value) in data { + builder.add( + KeySlice::for_testing_from_slice_with_ts(&key[..], ts), + &value[..], + ); + } + builder.build(id, block_cache, path.as_ref()).unwrap() +} + +pub(crate) fn sync(storage: &LsmStorageInner) { + storage + .force_freeze_memtable(&storage.state_lock.lock()) + .unwrap(); + storage.force_flush_next_imm_memtable().unwrap(); +} + +pub fn compaction_bench(storage: Arc) { + let mut key_map = BTreeMap::::new(); + let gen_key = |i| format!("{:010}", i); // 10B + let gen_value = |i| format!("{:0110}", i); // 110B + let mut max_key = 0; + let overlaps = if TS_ENABLED { 10000 } else { 20000 }; + for iter in 0..10 { + let range_begin = iter * 5000; + for i in range_begin..(range_begin + overlaps) { + // 120B per key, 4MB data populated + let key: String = gen_key(i); + let version = key_map.get(&i).copied().unwrap_or_default() + 1; + let value = gen_value(version); + key_map.insert(i, version); + storage.put(key.as_bytes(), value.as_bytes()).unwrap(); + max_key = max_key.max(i); + } + } + + std::thread::sleep(Duration::from_secs(1)); // wait until all memtables flush + while { + let snapshot = storage.inner.state.read(); + !snapshot.imm_memtables.is_empty() + } { + storage.inner.force_flush_next_imm_memtable().unwrap(); + } + + let mut prev_snapshot = storage.inner.state.read().clone(); 
+ while { + std::thread::sleep(Duration::from_secs(1)); + let snapshot = storage.inner.state.read().clone(); + let to_cont = prev_snapshot.levels != snapshot.levels + || prev_snapshot.l0_sstables != snapshot.l0_sstables; + prev_snapshot = snapshot; + to_cont + } { + println!("waiting for compaction to converge"); + } + + let mut expected_key_value_pairs = Vec::new(); + for i in 0..(max_key + 40000) { + let key = gen_key(i); + let value = storage.get(key.as_bytes()).unwrap(); + if let Some(val) = key_map.get(&i) { + let expected_value = gen_value(*val); + assert_eq!(value, Some(Bytes::from(expected_value.clone()))); + expected_key_value_pairs.push((Bytes::from(key), Bytes::from(expected_value))); + } else { + assert!(value.is_none()); + } + } + + check_lsm_iter_result_by_key( + &mut storage.scan(Bound::Unbounded, Bound::Unbounded).unwrap(), + expected_key_value_pairs, + ); + + storage.dump_structure(); + + println!("This test case does not guarantee your compaction algorithm produces a LSM state as expected. It only does minimal checks on the size of the levels. 
Please use the compaction simulator to check if the compaction is correctly going on."); +} + +pub fn check_compaction_ratio(storage: Arc) { + let state = storage.inner.state.read().clone(); + let compaction_options = storage.inner.options.compaction_options.clone(); + let mut level_size = Vec::new(); + let l0_sst_num = state.l0_sstables.len(); + for (_, files) in &state.levels { + let size = match &compaction_options { + CompactionOptions::Leveled(_) => files + .iter() + .map(|x| state.sstables.get(x).as_ref().unwrap().table_size()) + .sum::(), + CompactionOptions::Simple(_) | CompactionOptions::Tiered(_) => files.len() as u64, + _ => unreachable!(), + }; + level_size.push(size); + } + let num_iters = storage + .scan(Bound::Unbounded, Bound::Unbounded) + .unwrap() + .num_active_iterators(); + let num_memtables = storage.inner.state.read().imm_memtables.len() + 1; + match compaction_options { + CompactionOptions::NoCompaction => unreachable!(), + CompactionOptions::Simple(SimpleLeveledCompactionOptions { + size_ratio_percent, + level0_file_num_compaction_trigger, + max_levels, + }) => { + assert!(l0_sst_num < level0_file_num_compaction_trigger); + assert!(level_size.len() <= max_levels); + for idx in 1..level_size.len() { + let prev_size = level_size[idx - 1]; + let this_size = level_size[idx]; + if prev_size == 0 && this_size == 0 { + continue; + } + assert!( + this_size as f64 / prev_size as f64 >= size_ratio_percent as f64 / 100.0, + "L{}/L{}, {}/{}<{}%", + state.levels[idx - 1].0, + state.levels[idx].0, + this_size, + prev_size, + size_ratio_percent + ); + } + assert!( + num_iters <= l0_sst_num + num_memtables + max_levels, + "did you use concat iterators?" + ); + } + CompactionOptions::Leveled(LeveledCompactionOptions { + level_size_multiplier, + level0_file_num_compaction_trigger, + max_levels, + .. 
+ }) => { + assert!(l0_sst_num < level0_file_num_compaction_trigger); + assert!(level_size.len() <= max_levels); + for idx in 1..level_size.len() { + let prev_size = level_size[idx - 1]; + let this_size = level_size[idx]; + assert!( + // do not add hard requirement on level size multiplier considering bloom filters... + this_size as f64 / prev_size as f64 >= (level_size_multiplier as f64 - 0.5), + "L{}/L{}, {}/{}<<{}", + state.levels[idx].0, + state.levels[idx - 1].0, + this_size, + prev_size, + level_size_multiplier + ); + } + assert!( + num_iters <= l0_sst_num + num_memtables + max_levels, + "did you use concat iterators?" + ); + } + CompactionOptions::Tiered(TieredCompactionOptions { + num_tiers, + max_size_amplification_percent, + size_ratio, + min_merge_width, + }) => { + let size_ratio_trigger = (100.0 + size_ratio as f64) / 100.0; + assert_eq!(l0_sst_num, 0); + assert!(level_size.len() <= num_tiers); + let mut sum_size = level_size[0]; + for idx in 1..level_size.len() { + let this_size = level_size[idx]; + if level_size.len() > min_merge_width { + assert!( + sum_size as f64 / this_size as f64 <= size_ratio_trigger, + "violation of size ratio: sum(⬆️L{})/L{}, {}/{}>{}", + state.levels[idx - 1].0, + state.levels[idx].0, + sum_size, + this_size, + size_ratio_trigger + ); + } + if idx + 1 == level_size.len() { + assert!( + sum_size as f64 / this_size as f64 + <= max_size_amplification_percent as f64 / 100.0, + "violation of space amp: sum(⬆️L{})/L{}, {}/{}>{}%", + state.levels[idx - 1].0, + state.levels[idx].0, + sum_size, + this_size, + max_size_amplification_percent + ); + } + sum_size += this_size; + } + assert!( + num_iters <= num_memtables + num_tiers, + "did you use concat iterators?" 
+ ); + } + } +} + +pub fn dump_files_in_dir(path: impl AsRef) { + println!("--- DIR DUMP ---"); + for f in path.as_ref().read_dir().unwrap() { + let f = f.unwrap(); + print!("{}", f.path().display()); + println!( + ", size={:.3}KB", + f.metadata().unwrap().size() as f64 / 1024.0 + ); + } +} + +pub fn construct_merge_iterator_over_storage( + state: &LsmStorageState, +) -> MergeIterator { + let mut iters = Vec::new(); + for t in &state.l0_sstables { + iters.push(Box::new( + SsTableIterator::create_and_seek_to_first(state.sstables.get(t).cloned().unwrap()) + .unwrap(), + )); + } + for (_, files) in &state.levels { + for f in files { + iters.push(Box::new( + SsTableIterator::create_and_seek_to_first(state.sstables.get(f).cloned().unwrap()) + .unwrap(), + )); + } + } + MergeIterator::create(iters) +} diff --git a/mini-lsm-starter/src/tests/week1_day1.rs b/mini-lsm-starter/src/tests/week1_day1.rs new file mode 100644 index 000000000..25ff8ff7c --- /dev/null +++ b/mini-lsm-starter/src/tests/week1_day1.rs @@ -0,0 +1,149 @@ +use std::sync::Arc; + +use tempfile::tempdir; + +use crate::{ + lsm_storage::{LsmStorageInner, LsmStorageOptions}, + mem_table::MemTable, +}; + +#[test] +fn test_task1_memtable_get() { + let memtable = MemTable::create(0); + memtable.for_testing_put_slice(b"key1", b"value1").unwrap(); + memtable.for_testing_put_slice(b"key2", b"value2").unwrap(); + memtable.for_testing_put_slice(b"key3", b"value3").unwrap(); + assert_eq!( + &memtable.for_testing_get_slice(b"key1").unwrap()[..], + b"value1" + ); + assert_eq!( + &memtable.for_testing_get_slice(b"key2").unwrap()[..], + b"value2" + ); + assert_eq!( + &memtable.for_testing_get_slice(b"key3").unwrap()[..], + b"value3" + ); +} + +#[test] +fn test_task1_memtable_overwrite() { + let memtable = MemTable::create(0); + memtable.for_testing_put_slice(b"key1", b"value1").unwrap(); + memtable.for_testing_put_slice(b"key2", b"value2").unwrap(); + memtable.for_testing_put_slice(b"key3", b"value3").unwrap(); + 
memtable.for_testing_put_slice(b"key1", b"value11").unwrap(); + memtable.for_testing_put_slice(b"key2", b"value22").unwrap(); + memtable.for_testing_put_slice(b"key3", b"value33").unwrap(); + assert_eq!( + &memtable.for_testing_get_slice(b"key1").unwrap()[..], + b"value11" + ); + assert_eq!( + &memtable.for_testing_get_slice(b"key2").unwrap()[..], + b"value22" + ); + assert_eq!( + &memtable.for_testing_get_slice(b"key3").unwrap()[..], + b"value33" + ); +} + +#[test] +fn test_task2_storage_integration() { + let dir = tempdir().unwrap(); + let storage = Arc::new( + LsmStorageInner::open(dir.path(), LsmStorageOptions::default_for_week1_test()).unwrap(), + ); + assert_eq!(&storage.get(b"0").unwrap(), &None); + storage.put(b"1", b"233").unwrap(); + storage.put(b"2", b"2333").unwrap(); + storage.put(b"3", b"23333").unwrap(); + assert_eq!(&storage.get(b"1").unwrap().unwrap()[..], b"233"); + assert_eq!(&storage.get(b"2").unwrap().unwrap()[..], b"2333"); + assert_eq!(&storage.get(b"3").unwrap().unwrap()[..], b"23333"); + storage.delete(b"2").unwrap(); + assert!(storage.get(b"2").unwrap().is_none()); + storage.delete(b"0").unwrap(); // should NOT report any error +} + +#[test] +fn test_task3_storage_integration() { + let dir = tempdir().unwrap(); + let storage = Arc::new( + LsmStorageInner::open(dir.path(), LsmStorageOptions::default_for_week1_test()).unwrap(), + ); + storage.put(b"1", b"233").unwrap(); + storage.put(b"2", b"2333").unwrap(); + storage.put(b"3", b"23333").unwrap(); + storage + .force_freeze_memtable(&storage.state_lock.lock()) + .unwrap(); + assert_eq!(storage.state.read().imm_memtables.len(), 1); + let previous_approximate_size = storage.state.read().imm_memtables[0].approximate_size(); + assert!(previous_approximate_size >= 15); + storage.put(b"1", b"2333").unwrap(); + storage.put(b"2", b"23333").unwrap(); + storage.put(b"3", b"233333").unwrap(); + storage + .force_freeze_memtable(&storage.state_lock.lock()) + .unwrap(); + 
assert_eq!(storage.state.read().imm_memtables.len(), 2); + assert!( + storage.state.read().imm_memtables[1].approximate_size() == previous_approximate_size, + "wrong order of memtables?" + ); + assert!(storage.state.read().imm_memtables[0].approximate_size() > previous_approximate_size); +} + +#[test] +fn test_task3_freeze_on_capacity() { + let dir = tempdir().unwrap(); + let mut options = LsmStorageOptions::default_for_week1_test(); + options.target_sst_size = 1024; + options.num_memtable_limit = 1000; + let storage = Arc::new(LsmStorageInner::open(dir.path(), options).unwrap()); + for _ in 0..1000 { + storage.put(b"1", b"2333").unwrap(); + } + let num_imm_memtables = storage.state.read().imm_memtables.len(); + assert!(num_imm_memtables >= 1, "no memtable frozen?"); + for _ in 0..1000 { + storage.delete(b"1").unwrap(); + } + + assert!( + storage.state.read().imm_memtables.len() > num_imm_memtables, + "no more memtable frozen?" + ); +} + +#[test] +fn test_task4_storage_integration() { + let dir = tempdir().unwrap(); + let storage = Arc::new( + LsmStorageInner::open(dir.path(), LsmStorageOptions::default_for_week1_test()).unwrap(), + ); + assert_eq!(&storage.get(b"0").unwrap(), &None); + storage.put(b"1", b"233").unwrap(); + storage.put(b"2", b"2333").unwrap(); + storage.put(b"3", b"23333").unwrap(); + storage + .force_freeze_memtable(&storage.state_lock.lock()) + .unwrap(); + storage.delete(b"1").unwrap(); + storage.delete(b"2").unwrap(); + storage.put(b"3", b"2333").unwrap(); + storage.put(b"4", b"23333").unwrap(); + storage + .force_freeze_memtable(&storage.state_lock.lock()) + .unwrap(); + storage.put(b"1", b"233333").unwrap(); + storage.put(b"3", b"233333").unwrap(); + assert_eq!(storage.state.read().imm_memtables.len(), 2); + assert_eq!(&storage.get(b"1").unwrap().unwrap()[..], b"233333"); + assert_eq!(&storage.get(b"2").unwrap(), &None); + assert_eq!(&storage.get(b"3").unwrap().unwrap()[..], b"233333"); + assert_eq!(&storage.get(b"4").unwrap().unwrap()[..], 
b"23333"); +} From 820a22365af570d6bac8fc83be0542f2e9c5f2db Mon Sep 17 00:00:00 2001 From: husharp Date: Sat, 3 Feb 2024 21:35:02 +0800 Subject: [PATCH 02/22] week1_day2 Signed-off-by: husharp --- .../src/iterators/merge_iterator.rs | 61 +++- mini-lsm-starter/src/lsm_iterator.rs | 45 ++- mini-lsm-starter/src/lsm_storage.rs | 18 +- mini-lsm-starter/src/mem_table.rs | 34 +- mini-lsm-starter/src/tests.rs | 1 + mini-lsm-starter/src/tests/harness.rs | 4 +- mini-lsm-starter/src/tests/week1_day2.rs | 313 ++++++++++++++++++ 7 files changed, 450 insertions(+), 26 deletions(-) create mode 100644 mini-lsm-starter/src/tests/week1_day2.rs diff --git a/mini-lsm-starter/src/iterators/merge_iterator.rs b/mini-lsm-starter/src/iterators/merge_iterator.rs index a3e911ddb..490bb28de 100644 --- a/mini-lsm-starter/src/iterators/merge_iterator.rs +++ b/mini-lsm-starter/src/iterators/merge_iterator.rs @@ -2,6 +2,7 @@ #![allow(dead_code)] // TODO(you): remove this lint after implementing this mod use std::cmp::{self}; +use std::collections::binary_heap::PeekMut; use std::collections::BinaryHeap; use anyhow::Result; @@ -42,12 +43,21 @@ impl Ord for HeapWrapper { /// iterators, perfer the one with smaller index. 
pub struct MergeIterator { iters: BinaryHeap>, - current: HeapWrapper, + current: Option>, } impl MergeIterator { pub fn create(iters: Vec>) -> Self { - unimplemented!() + let mut iters = iters + .into_iter() + .enumerate() + .filter(|(_, iter)| iter.is_valid()) + .map(|(i, iter)| HeapWrapper(i, iter)) + .collect::>(); + Self { + current: iters.pop(), + iters, + } } } @@ -57,18 +67,57 @@ impl StorageIterator = KeySlice<'a>>> StorageIt type KeyType<'a> = KeySlice<'a>; fn key(&self) -> KeySlice { - unimplemented!() + self.current.as_ref().unwrap().1.key() } fn value(&self) -> &[u8] { - unimplemented!() + self.current.as_ref().unwrap().1.value() } fn is_valid(&self) -> bool { - unimplemented!() + // check if current iterator is valid including when current iterator is None + self.current + .as_ref() + .map(|x| x.1.is_valid()) + .unwrap_or(false) } fn next(&mut self) -> Result<()> { - unimplemented!() + let current = self.current.as_mut().unwrap(); + // merge same key + while let Some(mut inner_iter) = self.iters.peek_mut() { + if current.1.key() == inner_iter.1.key() { + // check next error + if let Err(e) = inner_iter.1.next() { + PeekMut::pop(inner_iter); + return Err(e); + } + + // need to remove the iterator if it is invalid + if !inner_iter.1.is_valid() { + PeekMut::pop(inner_iter); + } + } else { + break; + } + } + + current.1.next()?; + // if current iterator is invalid, replace with next iterator + if !current.1.is_valid() { + if let Some(next_iter) = self.iters.pop() { + *current = next_iter; + } + return Ok(()); + } + + if let Some(mut inner_iter) = self.iters.peek_mut() { + // TODO: check value is reverse because heap is max heap + if *current < *inner_iter { + std::mem::swap(&mut *current, &mut *inner_iter); + } + } + + Ok(()) } } diff --git a/mini-lsm-starter/src/lsm_iterator.rs b/mini-lsm-starter/src/lsm_iterator.rs index 82842b2d9..6b47318d5 100644 --- a/mini-lsm-starter/src/lsm_iterator.rs +++ b/mini-lsm-starter/src/lsm_iterator.rs @@ -1,7 +1,7 @@ 
#![allow(unused_variables)] // TODO(you): remove this lint after implementing this mod #![allow(dead_code)] // TODO(you): remove this lint after implementing this mod -use anyhow::Result; +use anyhow::{bail, Ok, Result}; use crate::{ iterators::{merge_iterator::MergeIterator, StorageIterator}, @@ -17,7 +17,16 @@ pub struct LsmIterator { impl LsmIterator { pub(crate) fn new(iter: LsmIteratorInner) -> Result { - Ok(Self { inner: iter }) + let mut iter = Self { inner: iter }; + iter.move_to_non_delete()?; + Ok(iter) + } + + fn move_to_non_delete(&mut self) -> Result<()> { + while self.is_valid() && self.value().is_empty() { + self.next()?; + } + Ok(()) } } @@ -25,19 +34,22 @@ impl StorageIterator for LsmIterator { type KeyType<'a> = &'a [u8]; fn is_valid(&self) -> bool { - unimplemented!() + self.inner.is_valid() } fn key(&self) -> &[u8] { - unimplemented!() + self.inner.key().raw_ref() } fn value(&self) -> &[u8] { - unimplemented!() + self.inner.value() } fn next(&mut self) -> Result<()> { - unimplemented!() + self.inner.next()?; + // move to the next non-delete entry + self.move_to_non_delete()?; + Ok(()) } } @@ -46,11 +58,15 @@ impl StorageIterator for LsmIterator { /// `is_valid` should return false, and `next` should always return an error. 
pub struct FusedIterator { iter: I, + has_error: bool, } impl FusedIterator { pub fn new(iter: I) -> Self { - Self { iter } + Self { + iter, + has_error: false, + } } } @@ -58,18 +74,25 @@ impl StorageIterator for FusedIterator { type KeyType<'a> = I::KeyType<'a> where Self: 'a; fn is_valid(&self) -> bool { - unimplemented!() + !self.has_error && self.iter.is_valid() } fn key(&self) -> Self::KeyType<'_> { - unimplemented!() + self.iter.key() } fn value(&self) -> &[u8] { - unimplemented!() + self.iter.value() } fn next(&mut self) -> Result<()> { - unimplemented!() + if self.has_error { + bail!("Iterator has already returned an error") + } + if let Some(err) = self.iter.next().err() { + self.has_error = true; + return Err(err); + } + Ok(()) } } diff --git a/mini-lsm-starter/src/lsm_storage.rs b/mini-lsm-starter/src/lsm_storage.rs index 049e52044..c424cd806 100644 --- a/mini-lsm-starter/src/lsm_storage.rs +++ b/mini-lsm-starter/src/lsm_storage.rs @@ -15,6 +15,7 @@ use crate::compact::{ CompactionController, CompactionOptions, LeveledCompactionController, LeveledCompactionOptions, SimpleLeveledCompactionController, SimpleLeveledCompactionOptions, TieredCompactionController, }; +use crate::iterators::merge_iterator::MergeIterator; use crate::lsm_iterator::{FusedIterator, LsmIterator}; use crate::manifest::Manifest; use crate::mem_table::MemTable; @@ -300,7 +301,7 @@ impl LsmStorageInner { } self.try_freeze(size)?; - + Ok(()) } @@ -382,6 +383,19 @@ impl LsmStorageInner { _lower: Bound<&[u8]>, _upper: Bound<&[u8]>, ) -> Result> { - unimplemented!() + let snapshot = { + let guard = self.state.read(); + Arc::clone(&guard) + }; // drop global lock here + + // need to get all memtables and imm_memtables + let mut memtable_iters = Vec::with_capacity(snapshot.imm_memtables.len() + 1); + memtable_iters.push(Box::new(snapshot.memtable.scan(_lower, _upper))); + for imm_memtable in snapshot.imm_memtables.iter() { + memtable_iters.push(Box::new(imm_memtable.scan(_lower, _upper))); + 
} + // using merge iterator to merge all iterators + let merge_iter = MergeIterator::create(memtable_iters); + Ok(FusedIterator::new(LsmIterator::new(merge_iter)?)) } } diff --git a/mini-lsm-starter/src/mem_table.rs b/mini-lsm-starter/src/mem_table.rs index aa375e1a9..c303a3900 100644 --- a/mini-lsm-starter/src/mem_table.rs +++ b/mini-lsm-starter/src/mem_table.rs @@ -7,6 +7,7 @@ use std::sync::Arc; use anyhow::Result; use bytes::Bytes; +use crossbeam_skiplist::map::Entry; use crossbeam_skiplist::SkipMap; use ouroboros::self_referencing; @@ -99,8 +100,17 @@ impl MemTable { } /// Get an iterator over a range of keys. - pub fn scan(&self, _lower: Bound<&[u8]>, _upper: Bound<&[u8]>) -> MemTableIterator { - unimplemented!() + pub fn scan(&self, lower: Bound<&[u8]>, upper: Bound<&[u8]>) -> MemTableIterator { + let map = self.map.clone(); + let mut iter = MemTableIteratorBuilder { + map, + iter_builder: |map| map.range((map_bound(lower), map_bound(upper))), + item: (Bytes::new(), Bytes::new()), + } + .build(); + let next = iter.with_iter_mut(|iter| MemTableIterator::entry_to_item(iter.next())); + iter.with_item_mut(|item| *item = next); + iter } /// Flush the mem-table to SSTable. Implement in week 1 day 6. @@ -142,22 +152,34 @@ pub struct MemTableIterator { item: (Bytes, Bytes), } +impl MemTableIterator { + // This function is used to convert a `SkipMap` entry to a key-value pair. + fn entry_to_item(entry: Option>) -> (Bytes, Bytes) { + entry + .map(|x| (x.key().clone(), x.value().clone())) + .unwrap_or_else(|| (Bytes::new(), Bytes::new())) + } +} + impl StorageIterator for MemTableIterator { type KeyType<'a> = KeySlice<'a>; fn value(&self) -> &[u8] { - unimplemented!() + &self.borrow_item().1 } fn key(&self) -> KeySlice { - unimplemented!() + KeySlice::from_slice(&self.borrow_item().0) } + // is_valid returns if the iterator has reached the end or errored. 
fn is_valid(&self) -> bool { - unimplemented!() + !self.borrow_item().0.is_empty() } fn next(&mut self) -> Result<()> { - unimplemented!() + let next = self.with_iter_mut(|iter| MemTableIterator::entry_to_item(iter.next())); + self.with_item_mut(|item| *item = next); + Ok(()) } } diff --git a/mini-lsm-starter/src/tests.rs b/mini-lsm-starter/src/tests.rs index 283b258ac..bcdd96eb7 100644 --- a/mini-lsm-starter/src/tests.rs +++ b/mini-lsm-starter/src/tests.rs @@ -3,3 +3,4 @@ mod harness; mod week1_day1; +mod week1_day2; diff --git a/mini-lsm-starter/src/tests/harness.rs b/mini-lsm-starter/src/tests/harness.rs index 104cfdaec..023b92b0b 100644 --- a/mini-lsm-starter/src/tests/harness.rs +++ b/mini-lsm-starter/src/tests/harness.rs @@ -154,9 +154,11 @@ where assert_eq!( k, iter.key(), - "expected key: {:?}, actual key: {:?}", + "expected [key, value]: [{:?}, {:?}], actual [key, value]: [{:?}, {:?}]", k, + v, as_bytes(iter.key()), + as_bytes(iter.value()), ); assert_eq!( v, diff --git a/mini-lsm-starter/src/tests/week1_day2.rs b/mini-lsm-starter/src/tests/week1_day2.rs new file mode 100644 index 000000000..6c1b002e1 --- /dev/null +++ b/mini-lsm-starter/src/tests/week1_day2.rs @@ -0,0 +1,313 @@ +use std::{ops::Bound, sync::Arc}; + +use bytes::Bytes; +use tempfile::tempdir; + +use crate::{ + iterators::{merge_iterator::MergeIterator, StorageIterator}, + lsm_iterator::FusedIterator, + lsm_storage::{LsmStorageInner, LsmStorageOptions}, + mem_table::MemTable, + tests::harness::check_lsm_iter_result_by_key, +}; + +use super::harness::{check_iter_result_by_key, expect_iter_error, MockIterator}; + +#[test] +fn test_task1_memtable_iter() { + use std::ops::Bound; + let memtable = MemTable::create(0); + memtable.for_testing_put_slice(b"key1", b"value1").unwrap(); + memtable.for_testing_put_slice(b"key2", b"value2").unwrap(); + memtable.for_testing_put_slice(b"key3", b"value3").unwrap(); + + { + let mut iter = memtable.for_testing_scan_slice(Bound::Unbounded, Bound::Unbounded); + 
assert_eq!(iter.key().for_testing_key_ref(), b"key1"); + assert_eq!(iter.value(), b"value1"); + assert!(iter.is_valid()); + iter.next().unwrap(); + assert_eq!(iter.key().for_testing_key_ref(), b"key2"); + assert_eq!(iter.value(), b"value2"); + assert!(iter.is_valid()); + iter.next().unwrap(); + assert_eq!(iter.key().for_testing_key_ref(), b"key3"); + assert_eq!(iter.value(), b"value3"); + assert!(iter.is_valid()); + iter.next().unwrap(); + assert!(!iter.is_valid()); + } + + { + let mut iter = + memtable.for_testing_scan_slice(Bound::Included(b"key1"), Bound::Included(b"key2")); + assert_eq!(iter.key().for_testing_key_ref(), b"key1"); + assert_eq!(iter.value(), b"value1"); + assert!(iter.is_valid()); + iter.next().unwrap(); + assert_eq!(iter.key().for_testing_key_ref(), b"key2"); + assert_eq!(iter.value(), b"value2"); + assert!(iter.is_valid()); + iter.next().unwrap(); + assert!(!iter.is_valid()); + } + + { + let mut iter = + memtable.for_testing_scan_slice(Bound::Excluded(b"key1"), Bound::Excluded(b"key3")); + assert_eq!(iter.key().for_testing_key_ref(), b"key2"); + assert_eq!(iter.value(), b"value2"); + assert!(iter.is_valid()); + iter.next().unwrap(); + assert!(!iter.is_valid()); + } +} + +#[test] +fn test_task1_empty_memtable_iter() { + use std::ops::Bound; + let memtable = MemTable::create(0); + { + let iter = + memtable.for_testing_scan_slice(Bound::Excluded(b"key1"), Bound::Excluded(b"key3")); + assert!(!iter.is_valid()); + } + { + let iter = + memtable.for_testing_scan_slice(Bound::Included(b"key1"), Bound::Included(b"key2")); + assert!(!iter.is_valid()); + } + { + let iter = memtable.for_testing_scan_slice(Bound::Unbounded, Bound::Unbounded); + assert!(!iter.is_valid()); + } +} + +#[test] +fn test_task2_merge_1() { + let i1 = MockIterator::new(vec![ + (Bytes::from("a"), Bytes::from("1.1")), + (Bytes::from("b"), Bytes::from("2.1")), + (Bytes::from("c"), Bytes::from("3.1")), + (Bytes::from("e"), Bytes::new()), + ]); + let i2 = MockIterator::new(vec![ + 
(Bytes::from("a"), Bytes::from("1.2")), + (Bytes::from("b"), Bytes::from("2.2")), + (Bytes::from("c"), Bytes::from("3.2")), + (Bytes::from("d"), Bytes::from("4.2")), + ]); + let i3 = MockIterator::new(vec![ + (Bytes::from("b"), Bytes::from("2.3")), + (Bytes::from("c"), Bytes::from("3.3")), + (Bytes::from("d"), Bytes::from("4.3")), + ]); + + let mut iter = MergeIterator::create(vec![ + Box::new(i1.clone()), + Box::new(i2.clone()), + Box::new(i3.clone()), + ]); + + check_iter_result_by_key( + &mut iter, + vec![ + (Bytes::from("a"), Bytes::from("1.1")), + (Bytes::from("b"), Bytes::from("2.1")), + (Bytes::from("c"), Bytes::from("3.1")), + (Bytes::from("d"), Bytes::from("4.2")), + (Bytes::from("e"), Bytes::new()), + ], + ); + + let mut iter = MergeIterator::create(vec![Box::new(i3), Box::new(i1), Box::new(i2)]); + + check_iter_result_by_key( + &mut iter, + vec![ + (Bytes::from("a"), Bytes::from("1.1")), + (Bytes::from("b"), Bytes::from("2.3")), + (Bytes::from("c"), Bytes::from("3.3")), + (Bytes::from("d"), Bytes::from("4.3")), + (Bytes::from("e"), Bytes::new()), + ], + ); +} + +#[test] +fn test_task2_merge_2() { + let i1 = MockIterator::new(vec![ + (Bytes::from("a"), Bytes::from("1.1")), + (Bytes::from("b"), Bytes::from("2.1")), + (Bytes::from("c"), Bytes::from("3.1")), + ]); + let i2 = MockIterator::new(vec![ + (Bytes::from("d"), Bytes::from("1.2")), + (Bytes::from("e"), Bytes::from("2.2")), + (Bytes::from("f"), Bytes::from("3.2")), + (Bytes::from("g"), Bytes::from("4.2")), + ]); + let i3 = MockIterator::new(vec![ + (Bytes::from("h"), Bytes::from("1.3")), + (Bytes::from("i"), Bytes::from("2.3")), + (Bytes::from("j"), Bytes::from("3.3")), + (Bytes::from("k"), Bytes::from("4.3")), + ]); + let i4 = MockIterator::new(vec![]); + let result = vec![ + (Bytes::from("a"), Bytes::from("1.1")), + (Bytes::from("b"), Bytes::from("2.1")), + (Bytes::from("c"), Bytes::from("3.1")), + (Bytes::from("d"), Bytes::from("1.2")), + (Bytes::from("e"), Bytes::from("2.2")), + (Bytes::from("f"), 
Bytes::from("3.2")), + (Bytes::from("g"), Bytes::from("4.2")), + (Bytes::from("h"), Bytes::from("1.3")), + (Bytes::from("i"), Bytes::from("2.3")), + (Bytes::from("j"), Bytes::from("3.3")), + (Bytes::from("k"), Bytes::from("4.3")), + ]; + + let mut iter = MergeIterator::create(vec![ + Box::new(i1.clone()), + Box::new(i2.clone()), + Box::new(i3.clone()), + Box::new(i4.clone()), + ]); + check_iter_result_by_key(&mut iter, result.clone()); + + let mut iter = MergeIterator::create(vec![ + Box::new(i2.clone()), + Box::new(i4.clone()), + Box::new(i3.clone()), + Box::new(i1.clone()), + ]); + check_iter_result_by_key(&mut iter, result.clone()); + + let mut iter = + MergeIterator::create(vec![Box::new(i4), Box::new(i3), Box::new(i2), Box::new(i1)]); + check_iter_result_by_key(&mut iter, result); +} + +#[test] +fn test_task2_merge_empty() { + let mut iter = MergeIterator::::create(vec![]); + check_iter_result_by_key(&mut iter, vec![]); + + let i1 = MockIterator::new(vec![ + (Bytes::from("a"), Bytes::from("1.1")), + (Bytes::from("b"), Bytes::from("2.1")), + (Bytes::from("c"), Bytes::from("3.1")), + ]); + let i2 = MockIterator::new(vec![]); + let mut iter = MergeIterator::::create(vec![Box::new(i1), Box::new(i2)]); + check_iter_result_by_key( + &mut iter, + vec![ + (Bytes::from("a"), Bytes::from("1.1")), + (Bytes::from("b"), Bytes::from("2.1")), + (Bytes::from("c"), Bytes::from("3.1")), + ], + ); +} + +#[test] +fn test_task2_merge_error() { + let mut iter = MergeIterator::::create(vec![]); + check_iter_result_by_key(&mut iter, vec![]); + + let i1 = MockIterator::new(vec![ + (Bytes::from("a"), Bytes::from("1.1")), + (Bytes::from("b"), Bytes::from("2.1")), + (Bytes::from("c"), Bytes::from("3.1")), + ]); + let i2 = MockIterator::new_with_error( + vec![ + (Bytes::from("a"), Bytes::from("1.1")), + (Bytes::from("b"), Bytes::from("2.1")), + (Bytes::from("c"), Bytes::from("3.1")), + ], + 1, + ); + let iter = MergeIterator::::create(vec![Box::new(i1), Box::new(i2)]); + // your 
implementation should correctly throw an error instead of panic + expect_iter_error(iter); +} + +#[test] +fn test_task3_fused_iterator() { + let iter = MockIterator::new(vec![]); + let mut fused_iter = FusedIterator::new(iter); + assert!(!fused_iter.is_valid()); + fused_iter.next().unwrap(); + fused_iter.next().unwrap(); + fused_iter.next().unwrap(); + assert!(!fused_iter.is_valid()); + + let iter = MockIterator::new_with_error( + vec![ + (Bytes::from("a"), Bytes::from("1.1")), + (Bytes::from("a"), Bytes::from("1.1")), + ], + 1, + ); + let mut fused_iter = FusedIterator::new(iter); + assert!(fused_iter.is_valid()); + assert!(fused_iter.next().is_err()); + assert!(!fused_iter.is_valid()); + assert!(fused_iter.next().is_err()); + assert!(fused_iter.next().is_err()); +} + +#[test] +fn test_task4_integration() { + let dir = tempdir().unwrap(); + let storage = Arc::new( + LsmStorageInner::open(dir.path(), LsmStorageOptions::default_for_week1_test()).unwrap(), + ); + storage.put(b"1", b"233").unwrap(); + storage.put(b"2", b"2333").unwrap(); + storage.put(b"3", b"23333").unwrap(); + storage + .force_freeze_memtable(&storage.state_lock.lock()) + .unwrap(); + storage.delete(b"1").unwrap(); + storage.delete(b"2").unwrap(); + storage.put(b"3", b"2333").unwrap(); + storage.put(b"4", b"23333").unwrap(); + storage + .force_freeze_memtable(&storage.state_lock.lock()) + .unwrap(); + storage.put(b"1", b"233333").unwrap(); + storage.put(b"3", b"233333").unwrap(); + { + let mut iter = storage.scan(Bound::Unbounded, Bound::Unbounded).unwrap(); + check_lsm_iter_result_by_key( + &mut iter, + vec![ + (Bytes::from_static(b"1"), Bytes::from_static(b"233333")), + (Bytes::from_static(b"3"), Bytes::from_static(b"233333")), + (Bytes::from_static(b"4"), Bytes::from_static(b"23333")), + ], + ); + assert!(!iter.is_valid()); + iter.next().unwrap(); + iter.next().unwrap(); + iter.next().unwrap(); + assert!(!iter.is_valid()); + } + { + let mut iter = storage + .scan(Bound::Included(b"2"), 
Bound::Included(b"3")) + .unwrap(); + check_lsm_iter_result_by_key( + &mut iter, + vec![(Bytes::from_static(b"3"), Bytes::from_static(b"233333"))], + ); + assert!(!iter.is_valid()); + iter.next().unwrap(); + iter.next().unwrap(); + iter.next().unwrap(); + assert!(!iter.is_valid()); + } +} From 9090c5b89084b23f70b15a57cfb43ea79f04bb3e Mon Sep 17 00:00:00 2001 From: husharp Date: Sun, 4 Feb 2024 23:02:09 +0800 Subject: [PATCH 03/22] week1_day3 Signed-off-by: husharp --- mini-lsm-starter/src/block.rs | 37 +++++- mini-lsm-starter/src/block/builder.rs | 70 ++++++++++- mini-lsm-starter/src/block/iterator.rs | 73 +++++++++-- mini-lsm-starter/src/tests.rs | 1 + mini-lsm-starter/src/tests/harness.rs | 4 +- mini-lsm-starter/src/tests/week1_day3.rs | 147 +++++++++++++++++++++++ 6 files changed, 311 insertions(+), 21 deletions(-) create mode 100644 mini-lsm-starter/src/tests/week1_day3.rs diff --git a/mini-lsm-starter/src/block.rs b/mini-lsm-starter/src/block.rs index 41e3df2d4..0dd37d730 100644 --- a/mini-lsm-starter/src/block.rs +++ b/mini-lsm-starter/src/block.rs @@ -4,10 +4,16 @@ mod builder; mod iterator; +use std::u8; + pub use builder::BlockBuilder; -use bytes::Bytes; +use bytes::{Buf, BufMut, Bytes}; pub use iterator::BlockIterator; +use crate::key::KeyVec; + +pub(crate) const SIZEOF_U16: usize = std::mem::size_of::(); + /// A block is the smallest unit of read and caching in LSM tree. It is a collection of sorted key-value pairs. 
pub struct Block { pub(crate) data: Vec, @@ -18,11 +24,36 @@ impl Block { /// Encode the internal data to the data layout illustrated in the tutorial /// Note: You may want to recheck if any of the expected field is missing from your output pub fn encode(&self) -> Bytes { - unimplemented!() + let mut buf = self.data.clone(); + for offset in &self.offsets { + buf.put_u16(*offset); + } + // num of elements + buf.put_u16(self.offsets.len() as u16); + buf.into() } /// Decode from the data layout, transform the input `data` to a single `Block` pub fn decode(data: &[u8]) -> Self { - unimplemented!() + let num_of_elements = (&data[data.len() - SIZEOF_U16..]).get_u16(); + // transform offsets + let data_end = data.len() - SIZEOF_U16 - SIZEOF_U16 * (num_of_elements as usize); + let offsets = data[data_end..data.len() - SIZEOF_U16] + .chunks(SIZEOF_U16) + .map(|mut x| x.get_u16()) + .collect(); + // transform data + let data = data[..data_end].to_vec(); + Self { data, offsets } + } + + fn get_first_key(&self) -> KeyVec { + // The first key is the first entry in the data section + // redundant key is 0 + let mut entry = &self.data[0..]; + entry.get_u16(); + let key_len = entry.get_u16(); + let key = &entry[..key_len as usize]; + KeyVec::from_vec(key.to_vec()) } } diff --git a/mini-lsm-starter/src/block/builder.rs b/mini-lsm-starter/src/block/builder.rs index 4c8395997..800501c7e 100644 --- a/mini-lsm-starter/src/block/builder.rs +++ b/mini-lsm-starter/src/block/builder.rs @@ -1,11 +1,20 @@ #![allow(unused_variables)] // TODO(you): remove this lint after implementing this mod #![allow(dead_code)] // TODO(you): remove this lint after implementing this mod +use bytes::BufMut; + use crate::key::{KeySlice, KeyVec}; -use super::Block; +use super::{Block, SIZEOF_U16}; /// Builds a block. 
+/* +---------------------------------------------------------------------------------------------------- +| Data Section | Offset Section | Extra | +---------------------------------------------------------------------------------------------------- +| Entry #1 | Entry #2 | ... | Entry #N | Offset #1 | Offset #2 | ... | Offset #N | num_of_elements | +---------------------------------------------------------------------------------------------------- +*/ pub struct BlockBuilder { /// Offsets of each key-value entries. offsets: Vec, @@ -17,25 +26,76 @@ pub struct BlockBuilder { first_key: KeyVec, } +fn compute_redundant_key(first_key: KeySlice, cur_key: KeySlice) -> usize { + let mut i = 0; + while i < first_key.len() && i < cur_key.len() { + if first_key.raw_ref()[i] == cur_key.raw_ref()[i] { + i += 1; + } else { + break; + } + } + i +} + impl BlockBuilder { /// Creates a new block builder. pub fn new(block_size: usize) -> Self { - unimplemented!() + Self { + offsets: Vec::new(), + data: Vec::new(), + block_size, + first_key: KeyVec::new(), + } } + /* Each entry is a key-value pair. + ----------------------------------------------------------------------- + | Entry #1 | ... | + ----------------------------------------------------------------------- + | redundant_key(first_key_index) | key_len (2B) | key (keylen) | value_len (2B) | value (varlen) | ... | + ----------------------------------------------------------------------- + */ /// Adds a key-value pair to the block. Returns false when the block is full. #[must_use] pub fn add(&mut self, key: KeySlice, value: &[u8]) -> bool { - unimplemented!() + if self.data.len() + self.offsets.len() + SIZEOF_U16 /* num of elements */ + + key.len() + value.len() + SIZEOF_U16 /* key len */ + SIZEOF_U16 /* value len */ + > self.block_size + && !self.is_empty() + { + return false; + } + // Add the offset of the data into the offset array. 
+ self.offsets.push(self.data.len() as u16); + // compute redundant key + // when is the first key, the redundant key is 0 + let redundant_index = compute_redundant_key(self.first_key.as_key_slice(), key); + self.data.put_u16(redundant_index as u16); + // rest of the key + self.data.put_u16((key.len() - redundant_index) as u16); + self.data.put(&key.raw_ref()[redundant_index..]); + + self.data.put_u16(value.len() as u16); + self.data.put(value); + + if self.first_key.is_empty() { + self.first_key = key.to_key_vec(); + } + + true } /// Check if there is no key-value pair in the block. pub fn is_empty(&self) -> bool { - unimplemented!() + self.data.is_empty() } /// Finalize the block. pub fn build(self) -> Block { - unimplemented!() + Block { + data: self.data, + offsets: self.offsets, + } } } diff --git a/mini-lsm-starter/src/block/iterator.rs b/mini-lsm-starter/src/block/iterator.rs index f823b5ad2..4c771f445 100644 --- a/mini-lsm-starter/src/block/iterator.rs +++ b/mini-lsm-starter/src/block/iterator.rs @@ -3,9 +3,11 @@ use std::sync::Arc; +use bytes::Buf; + use crate::key::{KeySlice, KeyVec}; -use super::Block; +use super::{Block, SIZEOF_U16}; /// Iterates on a block. pub struct BlockIterator { @@ -24,54 +26,105 @@ pub struct BlockIterator { impl BlockIterator { fn new(block: Arc) -> Self { Self { + first_key: block.get_first_key(), block, key: KeyVec::new(), value_range: (0, 0), idx: 0, - first_key: KeyVec::new(), } } /// Creates a block iterator and seek to the first entry. pub fn create_and_seek_to_first(block: Arc) -> Self { - unimplemented!() + let mut iter = BlockIterator::new(block); + iter.seek_to_first(); + iter } /// Creates a block iterator and seek to the first key that >= `key`. pub fn create_and_seek_to_key(block: Arc, key: KeySlice) -> Self { - unimplemented!() + let mut iter = BlockIterator::new(block); + iter.seek_to_key(key); + iter } /// Returns the key of the current entry. 
pub fn key(&self) -> KeySlice { - unimplemented!() + self.key.as_key_slice() } /// Returns the value of the current entry. pub fn value(&self) -> &[u8] { - unimplemented!() + &self.block.data[self.value_range.0..self.value_range.1] } /// Returns true if the iterator is valid. /// Note: You may want to make use of `key` pub fn is_valid(&self) -> bool { - unimplemented!() + self.key.len() > 0 } /// Seeks to the first key in the block. pub fn seek_to_first(&mut self) { - unimplemented!() + self.seek_to_idx(0); } /// Move to the next key in the block. pub fn next(&mut self) { - unimplemented!() + self.idx += 1; + self.seek_to_idx(self.idx); + } + + /// Seek to the specified position and update the current `key` and `value` + /// Index update will be handled by caller + pub fn seek_to_idx(&mut self, idx: usize) { + if idx >= self.block.offsets.len() { + self.key.clear(); + self.value_range = (0, 0); + return; + } + + let offset = self.block.offsets[idx] as usize; + let mut entry = &self.block.data[offset..]; + // since `get_u16()` will automatically move the ptr 2 bytes ahead here, + // we don't need to manually advance it + let redundant_len = entry.get_u16() as usize; + let key_len = entry.get_u16() as usize; + let key = &entry[..key_len]; + self.key.clear(); + self.key.append(&self.first_key.raw_ref()[..redundant_len]); + self.key.append(key); + // move the entry ptr to the begin of the value + entry.advance(key_len); + let value_len = entry.get_u16() as usize; + let value_offset_begin = offset + + SIZEOF_U16*2 /* redundant + key_len(2B) */ + + key_len + + SIZEOF_U16 /* value_len(2B) */; + let value_offset_end = value_offset_begin + value_len; + self.value_range = (value_offset_begin, value_offset_end); + // move the entry ptr to the end of the value + entry.advance(value_len); + // set index + self.idx = idx; } /// Seek to the first key that >= `key`. /// Note: You should assume the key-value pairs in the block are sorted when being added by /// callers. 
pub fn seek_to_key(&mut self, key: KeySlice) { - unimplemented!() + let mut low = 0; + let mut high = self.block.offsets.len(); + while low < high { + let mid = low + (high - low) / 2; + self.seek_to_idx(mid); + assert!(self.is_valid()); + match self.key().cmp(&key) { + std::cmp::Ordering::Less => low = mid + 1, + std::cmp::Ordering::Greater => high = mid, + std::cmp::Ordering::Equal => return, + } + } + self.seek_to_idx(low); } } diff --git a/mini-lsm-starter/src/tests.rs b/mini-lsm-starter/src/tests.rs index bcdd96eb7..69007cae9 100644 --- a/mini-lsm-starter/src/tests.rs +++ b/mini-lsm-starter/src/tests.rs @@ -4,3 +4,4 @@ mod harness; mod week1_day1; mod week1_day2; +mod week1_day3; diff --git a/mini-lsm-starter/src/tests/harness.rs b/mini-lsm-starter/src/tests/harness.rs index 023b92b0b..104cfdaec 100644 --- a/mini-lsm-starter/src/tests/harness.rs +++ b/mini-lsm-starter/src/tests/harness.rs @@ -154,11 +154,9 @@ where assert_eq!( k, iter.key(), - "expected [key, value]: [{:?}, {:?}], actual [key, value]: [{:?}, {:?}]", + "expected key: {:?}, actual key: {:?}", k, - v, as_bytes(iter.key()), - as_bytes(iter.value()), ); assert_eq!( v, diff --git a/mini-lsm-starter/src/tests/week1_day3.rs b/mini-lsm-starter/src/tests/week1_day3.rs new file mode 100644 index 000000000..91deea788 --- /dev/null +++ b/mini-lsm-starter/src/tests/week1_day3.rs @@ -0,0 +1,147 @@ +use std::sync::Arc; + +use bytes::Bytes; + +use crate::{ + block::{Block, BlockBuilder, BlockIterator}, + key::{KeySlice, KeyVec}, +}; + +#[test] +fn test_block_build_single_key() { + let mut builder = BlockBuilder::new(16); + assert!(builder.add(KeySlice::for_testing_from_slice_no_ts(b"233"), b"233333")); + builder.build(); +} + +#[test] +fn test_block_build_full() { + let mut builder = BlockBuilder::new(16); + assert!(builder.add(KeySlice::for_testing_from_slice_no_ts(b"11"), b"11")); + assert!(!builder.add(KeySlice::for_testing_from_slice_no_ts(b"22"), b"22")); + builder.build(); +} + +#[test] +fn 
test_block_build_large_1() { + let mut builder = BlockBuilder::new(16); + assert!(builder.add( + KeySlice::for_testing_from_slice_no_ts(b"11"), + &b"1".repeat(100) + )); + builder.build(); +} + +#[test] +fn test_block_build_large_2() { + let mut builder = BlockBuilder::new(16); + assert!(builder.add(KeySlice::for_testing_from_slice_no_ts(b"11"), b"1")); + assert!(!builder.add( + KeySlice::for_testing_from_slice_no_ts(b"11"), + &b"1".repeat(100) + )); +} + +fn key_of(idx: usize) -> KeyVec { + KeyVec::for_testing_from_vec_no_ts(format!("key_{:03}", idx * 5).into_bytes()) +} + +fn value_of(idx: usize) -> Vec { + format!("value_{:010}", idx).into_bytes() +} + +fn num_of_keys() -> usize { + 100 +} + +fn generate_block() -> Block { + let mut builder = BlockBuilder::new(10000); + for idx in 0..num_of_keys() { + let key = key_of(idx); + let value = value_of(idx); + assert!(builder.add(key.as_key_slice(), &value[..])); + } + builder.build() +} + +#[test] +fn test_block_build_all() { + generate_block(); +} + +#[test] +fn test_block_encode() { + let block = generate_block(); + block.encode(); +} + +#[test] +fn test_block_decode() { + let block = generate_block(); + let encoded = block.encode(); + let decoded_block = Block::decode(&encoded); + assert_eq!(block.offsets, decoded_block.offsets); + assert_eq!(block.data, decoded_block.data); +} + +fn as_bytes(x: &[u8]) -> Bytes { + Bytes::copy_from_slice(x) +} + +#[test] +fn test_block_iterator() { + let block = Arc::new(generate_block()); + let mut iter = BlockIterator::create_and_seek_to_first(block); + for _ in 0..5 { + for i in 0..num_of_keys() { + let key = iter.key(); + let value = iter.value(); + assert_eq!( + key.for_testing_key_ref(), + key_of(i).for_testing_key_ref(), + "expected key: {:?}, actual key: {:?}", + as_bytes(key_of(i).for_testing_key_ref()), + as_bytes(key.for_testing_key_ref()) + ); + assert_eq!( + value, + value_of(i), + "expected value: {:?}, actual value: {:?}", + as_bytes(&value_of(i)), + as_bytes(value) 
+ ); + iter.next(); + } + iter.seek_to_first(); + } +} + +#[test] +fn test_block_seek_key() { + let block = Arc::new(generate_block()); + let mut iter = BlockIterator::create_and_seek_to_key(block, key_of(0).as_key_slice()); + for offset in 1..=5 { + for i in 0..num_of_keys() { + let key = iter.key(); + let value = iter.value(); + assert_eq!( + key.for_testing_key_ref(), + key_of(i).for_testing_key_ref(), + "expected key: {:?}, actual key: {:?}", + as_bytes(key_of(i).for_testing_key_ref()), + as_bytes(key.for_testing_key_ref()) + ); + assert_eq!( + value, + value_of(i), + "expected value: {:?}, actual value: {:?}", + as_bytes(&value_of(i)), + as_bytes(value) + ); + iter.seek_to_key(KeySlice::for_testing_from_slice_no_ts( + &format!("key_{:03}", i * 5 + offset).into_bytes(), + )); + } + iter.seek_to_key(KeySlice::for_testing_from_slice_no_ts(b"k")); + } +} From bc52923de82a0bd5dc3fbbe927d368883de8275f Mon Sep 17 00:00:00 2001 From: husharp Date: Tue, 6 Feb 2024 17:05:20 +0800 Subject: [PATCH 04/22] week1_day4 Signed-off-by: husharp --- mini-lsm-starter/src/block/builder.rs | 3 - mini-lsm-starter/src/block/iterator.rs | 1 + .../src/iterators/merge_iterator.rs | 3 - mini-lsm-starter/src/lsm_iterator.rs | 3 - mini-lsm-starter/src/table.rs | 111 +++++++++++--- mini-lsm-starter/src/table/builder.rs | 92 ++++++++++-- mini-lsm-starter/src/table/iterator.rs | 67 +++++++-- mini-lsm-starter/src/tests.rs | 1 + mini-lsm-starter/src/tests/week1_day4.rs | 141 ++++++++++++++++++ 9 files changed, 369 insertions(+), 53 deletions(-) create mode 100644 mini-lsm-starter/src/tests/week1_day4.rs diff --git a/mini-lsm-starter/src/block/builder.rs b/mini-lsm-starter/src/block/builder.rs index 800501c7e..b20662fa0 100644 --- a/mini-lsm-starter/src/block/builder.rs +++ b/mini-lsm-starter/src/block/builder.rs @@ -1,6 +1,3 @@ -#![allow(unused_variables)] // TODO(you): remove this lint after implementing this mod -#![allow(dead_code)] // TODO(you): remove this lint after implementing this mod - 
use bytes::BufMut; use crate::key::{KeySlice, KeyVec}; diff --git a/mini-lsm-starter/src/block/iterator.rs b/mini-lsm-starter/src/block/iterator.rs index 4c771f445..9c53e7fd7 100644 --- a/mini-lsm-starter/src/block/iterator.rs +++ b/mini-lsm-starter/src/block/iterator.rs @@ -107,6 +107,7 @@ impl BlockIterator { entry.advance(value_len); // set index self.idx = idx; + // idx will be 0 if the iterator is invalid } /// Seek to the first key that >= `key`. diff --git a/mini-lsm-starter/src/iterators/merge_iterator.rs b/mini-lsm-starter/src/iterators/merge_iterator.rs index 490bb28de..4bf8cc99a 100644 --- a/mini-lsm-starter/src/iterators/merge_iterator.rs +++ b/mini-lsm-starter/src/iterators/merge_iterator.rs @@ -1,6 +1,3 @@ -#![allow(unused_variables)] // TODO(you): remove this lint after implementing this mod -#![allow(dead_code)] // TODO(you): remove this lint after implementing this mod - use std::cmp::{self}; use std::collections::binary_heap::PeekMut; use std::collections::BinaryHeap; diff --git a/mini-lsm-starter/src/lsm_iterator.rs b/mini-lsm-starter/src/lsm_iterator.rs index 6b47318d5..fd1373bbf 100644 --- a/mini-lsm-starter/src/lsm_iterator.rs +++ b/mini-lsm-starter/src/lsm_iterator.rs @@ -1,6 +1,3 @@ -#![allow(unused_variables)] // TODO(you): remove this lint after implementing this mod -#![allow(dead_code)] // TODO(you): remove this lint after implementing this mod - use anyhow::{bail, Ok, Result}; use crate::{ diff --git a/mini-lsm-starter/src/table.rs b/mini-lsm-starter/src/table.rs index 02baaa230..e792de605 100644 --- a/mini-lsm-starter/src/table.rs +++ b/mini-lsm-starter/src/table.rs @@ -9,9 +9,9 @@ use std::fs::File; use std::path::Path; use std::sync::Arc; -use anyhow::Result; +use anyhow::{Ok, Result}; pub use builder::SsTableBuilder; -use bytes::Buf; +use bytes::{Buf, BufMut}; pub use iterator::SsTableIterator; use crate::block::Block; @@ -30,21 +30,44 @@ pub struct BlockMeta { pub last_key: KeyBytes, } +/* 
+----------------------------------------------------------------------- +| block meta | ... | +----------------------------------------------------------------------- +| offset(4B) | first_key_len (2B) | first_key (keylen) | last_key_len (2B) | last_key (keylen) | ... | +----------------------------------------------------------------------- +*/ impl BlockMeta { /// Encode block meta to a buffer. /// You may add extra fields to the buffer, /// in order to help keep track of `first_key` when decoding from the same buffer in the future. - pub fn encode_block_meta( - block_meta: &[BlockMeta], - #[allow(clippy::ptr_arg)] // remove this allow after you finish - buf: &mut Vec, - ) { - unimplemented!() + pub fn encode_block_metas(block_meta: &[BlockMeta], buf: &mut Vec) { + let original_len = buf.len(); + for meta in block_meta { + buf.put_u32(meta.offset as u32); + buf.put_u16(meta.first_key.len() as u16); + buf.put_slice(meta.first_key.raw_ref()); + buf.put_u16(meta.last_key.len() as u16); + buf.put_slice(meta.last_key.raw_ref()); + } } /// Decode block meta from a buffer. - pub fn decode_block_meta(buf: impl Buf) -> Vec { - unimplemented!() + pub fn decode_block_metas(mut buf: impl Buf) -> Result> { + let mut block_meta = vec![]; + while buf.has_remaining() { + let offset = buf.get_u32() as usize; + let first_key_len = buf.get_u16() as usize; + let first_key = buf.copy_to_bytes(first_key_len); + let last_key_len = buf.get_u16() as usize; + let last_key = buf.copy_to_bytes(last_key_len); + block_meta.push(BlockMeta { + offset, + first_key: KeyBytes::from_bytes(first_key), + last_key: KeyBytes::from_bytes(last_key), + }); + } + Ok(block_meta) } } @@ -85,16 +108,16 @@ impl FileObject { /// An SSTable. pub struct SsTable { + id: usize, /// The actual storage unit of SsTable, the format is as above. pub(crate) file: FileObject, /// The meta blocks that hold info for data blocks. 
- pub(crate) block_meta: Vec, + pub(crate) block_metas: Vec, /// The offset that indicates the start point of meta blocks in `file`. pub(crate) block_meta_offset: usize, - id: usize, - block_cache: Option>, first_key: KeyBytes, last_key: KeyBytes, + block_cache: Option>, pub(crate) bloom: Option, /// The maximum timestamp stored in this SST, implemented in week 3. max_ts: u64, @@ -108,7 +131,30 @@ impl SsTable { /// Open SSTable from a file. pub fn open(id: usize, block_cache: Option>, file: FileObject) -> Result { - unimplemented!() + let len = file.size(); + // u32 for extra info + let raw_metadata_offset = file.read(len - 4, 4)?; + let metadat_offset = (&raw_metadata_offset[..]).get_u32() as u64; + let raw_metadata = file.read( + metadat_offset, + len - metadat_offset - 4, /* extra size */ + )?; + let block_metas = BlockMeta::decode_block_metas(raw_metadata.as_slice())?; + + let raw_data = file.read(0, metadat_offset)?; + + let sst_table = SsTable { + id, + file, + block_meta_offset: metadat_offset as usize, + first_key: block_metas.first().unwrap().first_key.clone(), + last_key: block_metas.last().unwrap().last_key.clone(), + block_metas, + block_cache, + bloom: None, + max_ts: 0, + }; + Ok(sst_table) } /// Create a mock SST with only first key + last key metadata @@ -120,7 +166,7 @@ impl SsTable { ) -> Self { Self { file: FileObject(None, file_size), - block_meta: vec![], + block_metas: vec![], block_meta_offset: 0, id, block_cache: None, @@ -133,24 +179,47 @@ impl SsTable { /// Read a block from the disk. pub fn read_block(&self, block_idx: usize) -> Result> { - unimplemented!() + let offset = self.block_metas[block_idx].offset as u64; + let next_offset = self + .block_metas + .get(block_idx + 1) + .map_or(self.block_meta_offset, |x| x.offset) as u64; + let data = self.file.read(offset, next_offset - offset)?; + Ok(Arc::new(Block::decode(&data))) } /// Read a block from disk, with block cache. 
(Day 4) pub fn read_block_cached(&self, block_idx: usize) -> Result> { - unimplemented!() + if let Some(ref block_cache) = self.block_cache { + let block = block_cache + .try_get_with((self.id, block_idx), || self.read_block(block_idx)) + .map_err(|e| anyhow::anyhow!("block cache error: {:?}", e))?; + Ok(block) + } else { + self.read_block(block_idx) + } } + /* + -------------------------------------- + | block 1 | block 2 | block meta | + -------------------------------------- + | a, b, c | e, f, g | 1: a/c, 2: e/g | + -------------------------------------- + */ /// Find the block that may contain `key`. - /// Note: You may want to make use of the `first_key` stored in `BlockMeta`. - /// You may also assume the key-value pairs stored in each consecutive block are sorted. + /// make use of the `first_key` stored in `BlockMeta`. + /// because for example, if we want to get `b`, + /// we can directly we can know block 1 contains keys a <= keys < e. pub fn find_block_idx(&self, key: KeySlice) -> usize { - unimplemented!() + self.block_metas + .partition_point(|meta| meta.first_key.as_key_slice() <= key) + .saturating_sub(1) } /// Get number of data blocks. 
pub fn num_of_blocks(&self) -> usize { - self.block_meta.len() + self.block_metas.len() } pub fn first_key(&self) -> &KeyBytes { diff --git a/mini-lsm-starter/src/table/builder.rs b/mini-lsm-starter/src/table/builder.rs index cea3d08f8..7ddc72573 100644 --- a/mini-lsm-starter/src/table/builder.rs +++ b/mini-lsm-starter/src/table/builder.rs @@ -1,28 +1,47 @@ #![allow(unused_variables)] // TODO(you): remove this lint after implementing this mod #![allow(dead_code)] // TODO(you): remove this lint after implementing this mod -use std::path::Path; use std::sync::Arc; +use std::{borrow::Borrow, path::Path}; use anyhow::Result; +use bytes::BufMut; -use super::{BlockMeta, SsTable}; -use crate::{block::BlockBuilder, key::KeySlice, lsm_storage::BlockCache}; +use super::{BlockMeta, FileObject, SsTable}; +use crate::{ + block::BlockBuilder, + key::{KeySlice, KeyVec}, + lsm_storage::BlockCache, +}; /// Builds an SSTable from key-value pairs. pub struct SsTableBuilder { - builder: BlockBuilder, - first_key: Vec, - last_key: Vec, + block_builder: BlockBuilder, + first_key: KeyVec, + last_key: KeyVec, data: Vec, - pub(crate) meta: Vec, + pub(crate) metas: Vec, block_size: usize, } +/* +------------------------------------------------------------------------------------------- +| Block Section | Meta Section | Extra | +------------------------------------------------------------------------------------------- +| data block | ... | data block | vec | meta block offset (u32) | +------------------------------------------------------------------------------------------- +*/ impl SsTableBuilder { /// Create a builder based on target block size. pub fn new(block_size: usize) -> Self { - unimplemented!() + Self { + block_builder: BlockBuilder::new(block_size), + first_key: KeyVec::new(), + last_key: KeyVec::new(), + data: Vec::new(), + metas: Vec::new(), + block_size, + } } /// Adds a key-value pair to SSTable. 
@@ -30,7 +49,35 @@ impl SsTableBuilder { /// Note: You should split a new block when the current block is full.(`std::mem::replace` may /// be helpful here) pub fn add(&mut self, key: KeySlice, value: &[u8]) { - unimplemented!() + // if the first time add to this block, set the first key + if self.first_key.is_empty() { + self.first_key.set_from_slice(key); + } + + // block builder returns false when the block is full. + if self.block_builder.add(key, value) { + self.last_key.set_from_slice(key); + return; + } + + // if the block is full, build the block and add the block to the data. + self.finalize_block_to_sst(); + + // add (key, value) to next block + assert!(self.block_builder.add(key, value)); + self.first_key.set_from_slice(key); + self.last_key.set_from_slice(key); + } + + // finalize the block and add it to the data. + fn finalize_block_to_sst(&mut self) { + let block = std::mem::replace(&mut self.block_builder, BlockBuilder::new(self.block_size)); + self.metas.push(BlockMeta { + offset: self.data.len(), /* previous data len */ + first_key: std::mem::take(&mut self.first_key).into_key_bytes(), + last_key: std::mem::take(&mut self.last_key).into_key_bytes(), + }); + self.data.extend_from_slice(&block.build().encode()); } /// Get the estimated size of the SSTable. @@ -38,17 +85,38 @@ impl SsTableBuilder { /// Since the data blocks contain much more data than meta blocks, just return the size of data /// blocks here. pub fn estimated_size(&self) -> usize { - unimplemented!() + self.data.len() } /// Builds the SSTable and writes it to the given path. Use the `FileObject` structure to manipulate the disk objects. pub fn build( - self, + mut self, id: usize, block_cache: Option>, path: impl AsRef, ) -> Result { - unimplemented!() + // finalize the last block whatever it is full or not. + self.finalize_block_to_sst(); + // encode the block meta and write it to the disk. 
+ let meta_len = self.metas.len(); + let block_meta_offset = self.data.len(); + let mut buf = self.data; + BlockMeta::encode_block_metas(&self.metas, &mut buf); + // extra info for the meta block offset + buf.put_u32(block_meta_offset as u32); + let file = FileObject::create(path.as_ref(), buf)?; + let sst_table = SsTable { + id, + file, + first_key: self.metas.first().unwrap().first_key.clone(), + last_key: self.metas.last().unwrap().last_key.clone(), + block_metas: self.metas, + block_meta_offset, + block_cache, + bloom: None, + max_ts: 0, + }; + Ok(sst_table) } #[cfg(test)] diff --git a/mini-lsm-starter/src/table/iterator.rs b/mini-lsm-starter/src/table/iterator.rs index 32b06a2a3..f12b8eb38 100644 --- a/mini-lsm-starter/src/table/iterator.rs +++ b/mini-lsm-starter/src/table/iterator.rs @@ -3,7 +3,7 @@ use std::sync::Arc; -use anyhow::Result; +use anyhow::{Ok, Result}; use super::SsTable; use crate::{block::BlockIterator, iterators::StorageIterator, key::KeySlice}; @@ -16,26 +16,62 @@ pub struct SsTableIterator { } impl SsTableIterator { + fn seek_to_first_inner(table: &Arc) -> Result<(usize, BlockIterator)> { + Ok(( + 0, + BlockIterator::create_and_seek_to_first(table.read_block_cached(0)?), + )) + } + /// Create a new iterator and seek to the first key-value pair in the first data block. pub fn create_and_seek_to_first(table: Arc) -> Result { - unimplemented!() + let (idx, iter) = SsTableIterator::seek_to_first_inner(&table)?; + Ok(Self { + table, + blk_iter: iter, + blk_idx: idx, + }) } /// Seek to the first key-value pair in the first data block. pub fn seek_to_first(&mut self) -> Result<()> { - unimplemented!() + let (blk_idx, blk_iter) = Self::seek_to_first_inner(&self.table)?; + self.blk_idx = blk_idx; + self.blk_iter = blk_iter; + Ok(()) + } + + // using the first key of each block to do the binary search so as to reduce the complexity. 
+ fn seek_to_key_inner(table: &Arc, key: KeySlice) -> Result<(usize, BlockIterator)> { + let mut blk_idx = table.find_block_idx(key); + let mut blk_iter = + BlockIterator::create_and_seek_to_key(table.read_block_cached(blk_idx)?, key); + if !blk_iter.is_valid() { + blk_idx += 1; + if blk_idx < table.num_of_blocks() { + blk_iter = + BlockIterator::create_and_seek_to_first(table.read_block_cached(blk_idx)?); + } + } + Ok((blk_idx, blk_iter)) } /// Create a new iterator and seek to the first key-value pair which >= `key`. pub fn create_and_seek_to_key(table: Arc, key: KeySlice) -> Result { - unimplemented!() + let (idx, iter) = SsTableIterator::seek_to_key_inner(&table, key)?; + Ok(Self { + table, + blk_iter: iter, + blk_idx: idx, + }) } /// Seek to the first key-value pair which >= `key`. - /// Note: You probably want to review the handout for detailed explanation when implementing - /// this function. pub fn seek_to_key(&mut self, key: KeySlice) -> Result<()> { - unimplemented!() + let (blk_idx, blk_iter) = Self::seek_to_key_inner(&self.table, key)?; + self.blk_idx = blk_idx; + self.blk_iter = blk_iter; + Ok(()) } } @@ -44,22 +80,31 @@ impl StorageIterator for SsTableIterator { /// Return the `key` that's held by the underlying block iterator. fn key(&self) -> KeySlice { - unimplemented!() + self.blk_iter.key() } /// Return the `value` that's held by the underlying block iterator. fn value(&self) -> &[u8] { - unimplemented!() + self.blk_iter.value() } /// Return whether the current block iterator is valid or not. fn is_valid(&self) -> bool { - unimplemented!() + self.blk_iter.is_valid() } /// Move to the next `key` in the block. /// Note: You may want to check if the current block iterator is valid after the move. 
fn next(&mut self) -> Result<()> { - unimplemented!() + self.blk_iter.next(); + if !self.blk_iter.is_valid() { + self.blk_idx += 1; + if self.blk_idx < self.table.num_of_blocks() { + self.blk_iter = BlockIterator::create_and_seek_to_first( + self.table.read_block_cached(self.blk_idx)?, + ); + } + } + Ok(()) } } diff --git a/mini-lsm-starter/src/tests.rs b/mini-lsm-starter/src/tests.rs index 69007cae9..590f76e7e 100644 --- a/mini-lsm-starter/src/tests.rs +++ b/mini-lsm-starter/src/tests.rs @@ -5,3 +5,4 @@ mod harness; mod week1_day1; mod week1_day2; mod week1_day3; +mod week1_day4; diff --git a/mini-lsm-starter/src/tests/week1_day4.rs b/mini-lsm-starter/src/tests/week1_day4.rs new file mode 100644 index 000000000..690080694 --- /dev/null +++ b/mini-lsm-starter/src/tests/week1_day4.rs @@ -0,0 +1,141 @@ +use std::sync::Arc; + +use bytes::Bytes; +use tempfile::{tempdir, TempDir}; + +use crate::iterators::StorageIterator; +use crate::key::{KeySlice, KeyVec}; +use crate::table::{SsTable, SsTableBuilder, SsTableIterator}; + +#[test] +fn test_sst_build_single_key() { + let mut builder = SsTableBuilder::new(16); + builder.add(KeySlice::for_testing_from_slice_no_ts(b"233"), b"233333"); + let dir = tempdir().unwrap(); + builder.build_for_test(dir.path().join("1.sst")).unwrap(); +} + +#[test] +fn test_sst_build_two_blocks() { + let mut builder = SsTableBuilder::new(16); + builder.add(KeySlice::for_testing_from_slice_no_ts(b"11"), b"11"); + builder.add(KeySlice::for_testing_from_slice_no_ts(b"22"), b"22"); + builder.add(KeySlice::for_testing_from_slice_no_ts(b"33"), b"11"); + builder.add(KeySlice::for_testing_from_slice_no_ts(b"44"), b"22"); + builder.add(KeySlice::for_testing_from_slice_no_ts(b"55"), b"11"); + builder.add(KeySlice::for_testing_from_slice_no_ts(b"66"), b"22"); + assert!(builder.metas.len() >= 2); + let dir = tempdir().unwrap(); + builder.build_for_test(dir.path().join("1.sst")).unwrap(); +} + +fn key_of(idx: usize) -> KeyVec { + 
KeyVec::for_testing_from_vec_no_ts(format!("key_{:03}", idx * 5).into_bytes()) +} + +fn value_of(idx: usize) -> Vec { + format!("value_{:010}", idx).into_bytes() +} + +fn num_of_keys() -> usize { + 100 +} + +fn generate_sst() -> (TempDir, SsTable) { + let mut builder = SsTableBuilder::new(128); + for idx in 0..num_of_keys() { + let key = key_of(idx); + let value = value_of(idx); + builder.add(key.as_key_slice(), &value[..]); + } + let dir = tempdir().unwrap(); + let path = dir.path().join("1.sst"); + (dir, builder.build_for_test(path).unwrap()) +} + +#[test] +fn test_sst_build_all() { + generate_sst(); +} + +#[test] +fn test_sst_decode() { + let (_dir, sst) = generate_sst(); + let meta = sst.block_metas.clone(); + let new_sst = SsTable::open_for_test(sst.file).unwrap(); + assert_eq!(new_sst.block_metas, meta); + assert_eq!( + new_sst.first_key().for_testing_key_ref(), + key_of(0).for_testing_key_ref() + ); + assert_eq!( + new_sst.last_key().for_testing_key_ref(), + key_of(num_of_keys() - 1).for_testing_key_ref() + ); +} + +fn as_bytes(x: &[u8]) -> Bytes { + Bytes::copy_from_slice(x) +} + +#[test] +fn test_sst_iterator() { + let (_dir, sst) = generate_sst(); + let sst = Arc::new(sst); + let mut iter = SsTableIterator::create_and_seek_to_first(sst).unwrap(); + for _ in 0..5 { + for i in 0..num_of_keys() { + let key = iter.key(); + let value = iter.value(); + assert_eq!( + key.for_testing_key_ref(), + key_of(i).for_testing_key_ref(), + "expected key: {:?}, actual key: {:?}", + as_bytes(key_of(i).for_testing_key_ref()), + as_bytes(key.for_testing_key_ref()) + ); + assert_eq!( + value, + value_of(i), + "expected value: {:?}, actual value: {:?}", + as_bytes(&value_of(i)), + as_bytes(value) + ); + iter.next().unwrap(); + } + iter.seek_to_first().unwrap(); + } +} + +#[test] +fn test_sst_seek_key() { + let (_dir, sst) = generate_sst(); + let sst = Arc::new(sst); + let mut iter = SsTableIterator::create_and_seek_to_key(sst, key_of(0).as_key_slice()).unwrap(); + for offset in 
1..=5 { + for i in 0..num_of_keys() { + let key = iter.key(); + let value = iter.value(); + assert_eq!( + key.for_testing_key_ref(), + key_of(i).for_testing_key_ref(), + "expected key: {:?}, actual key: {:?}", + as_bytes(key_of(i).for_testing_key_ref()), + as_bytes(key.for_testing_key_ref()) + ); + assert_eq!( + value, + value_of(i), + "expected value: {:?}, actual value: {:?}", + as_bytes(&value_of(i)), + as_bytes(value) + ); + iter.seek_to_key(KeySlice::for_testing_from_slice_no_ts( + &format!("key_{:03}", i * 5 + offset).into_bytes(), + )) + .unwrap(); + } + iter.seek_to_key(KeySlice::for_testing_from_slice_no_ts(b"k")) + .unwrap(); + } +} From 6e06832badb73adda6b1b398b106ff405da806ba Mon Sep 17 00:00:00 2001 From: husharp Date: Tue, 6 Feb 2024 22:25:34 +0800 Subject: [PATCH 05/22] week1_day5 Signed-off-by: husharp --- mini-lsm-starter/src/block/iterator.rs | 2 +- .../src/iterators/merge_iterator.rs | 3 + .../src/iterators/two_merge_iterator.rs | 52 +++- mini-lsm-starter/src/lsm_iterator.rs | 9 +- mini-lsm-starter/src/lsm_storage.rs | 69 ++++- mini-lsm-starter/src/table/builder.rs | 2 +- mini-lsm-starter/src/tests.rs | 1 + mini-lsm-starter/src/tests/week1_day5.rs | 254 ++++++++++++++++++ 8 files changed, 370 insertions(+), 22 deletions(-) create mode 100644 mini-lsm-starter/src/tests/week1_day5.rs diff --git a/mini-lsm-starter/src/block/iterator.rs b/mini-lsm-starter/src/block/iterator.rs index 9c53e7fd7..1dde5abd3 100644 --- a/mini-lsm-starter/src/block/iterator.rs +++ b/mini-lsm-starter/src/block/iterator.rs @@ -61,7 +61,7 @@ impl BlockIterator { /// Returns true if the iterator is valid. /// Note: You may want to make use of `key` pub fn is_valid(&self) -> bool { - self.key.len() > 0 + !self.key.is_empty() } /// Seeks to the first key in the block. 
diff --git a/mini-lsm-starter/src/iterators/merge_iterator.rs b/mini-lsm-starter/src/iterators/merge_iterator.rs index 4bf8cc99a..0a2269ec0 100644 --- a/mini-lsm-starter/src/iterators/merge_iterator.rs +++ b/mini-lsm-starter/src/iterators/merge_iterator.rs @@ -80,6 +80,9 @@ impl StorageIterator = KeySlice<'a>>> StorageIt } fn next(&mut self) -> Result<()> { + if self.current.is_none() { + return Ok(()); + } let current = self.current.as_mut().unwrap(); // merge same key while let Some(mut inner_iter) = self.iters.peek_mut() { diff --git a/mini-lsm-starter/src/iterators/two_merge_iterator.rs b/mini-lsm-starter/src/iterators/two_merge_iterator.rs index bb7b4a8a8..d22ebbb18 100644 --- a/mini-lsm-starter/src/iterators/two_merge_iterator.rs +++ b/mini-lsm-starter/src/iterators/two_merge_iterator.rs @@ -1,7 +1,7 @@ #![allow(unused_variables)] // TODO(you): remove this lint after implementing this mod #![allow(dead_code)] // TODO(you): remove this lint after implementing this mod -use anyhow::Result; +use anyhow::{Ok, Result}; use super::StorageIterator; @@ -10,7 +10,7 @@ use super::StorageIterator; pub struct TwoMergeIterator { a: A, b: B, - // Add fields as need + choose_a: bool, } impl< @@ -19,7 +19,21 @@ impl< > TwoMergeIterator { pub fn create(a: A, b: B) -> Result { - unimplemented!() + Ok(Self { + choose_a: Self::choose_a(&a, &b), + a, + b, + }) + } + + fn choose_a(a: &A, b: &B) -> bool { + if !a.is_valid() { + return false; + } + if !b.is_valid() { + return true; + } + a.key() <= b.key() } } @@ -31,18 +45,42 @@ impl< type KeyType<'a> = A::KeyType<'a>; fn key(&self) -> Self::KeyType<'_> { - unimplemented!() + if self.choose_a { + self.a.key() + } else { + self.b.key() + } } fn value(&self) -> &[u8] { - unimplemented!() + if self.choose_a { + self.a.value() + } else { + self.b.value() + } } fn is_valid(&self) -> bool { - unimplemented!() + self.a.is_valid() || self.b.is_valid() } fn next(&mut self) -> Result<()> { - unimplemented!() + println!("next"); + // skip 
same key for the next + if self.choose_a { + if self.a.is_valid() && self.b.is_valid() && self.a.key() == self.b.key() { + self.b.next()?; + } + self.a.next()?; + } else { + if self.a.is_valid() && self.b.is_valid() && self.a.key() == self.b.key() { + self.a.next()?; + } + self.b.next()?; + } + + self.choose_a = Self::choose_a(&self.a, &self.b); + + Ok(()) } } diff --git a/mini-lsm-starter/src/lsm_iterator.rs b/mini-lsm-starter/src/lsm_iterator.rs index fd1373bbf..4e3a55047 100644 --- a/mini-lsm-starter/src/lsm_iterator.rs +++ b/mini-lsm-starter/src/lsm_iterator.rs @@ -1,12 +1,17 @@ use anyhow::{bail, Ok, Result}; use crate::{ - iterators::{merge_iterator::MergeIterator, StorageIterator}, + iterators::{ + merge_iterator::MergeIterator, two_merge_iterator::TwoMergeIterator, StorageIterator, + }, mem_table::MemTableIterator, + table::SsTableIterator, }; /// Represents the internal type for an LSM iterator. This type will be changed across the tutorial for multiple times. -type LsmIteratorInner = MergeIterator; +/// choosing memtable firstly +type LsmIteratorInner = + TwoMergeIterator, MergeIterator>; pub struct LsmIterator { inner: LsmIteratorInner, diff --git a/mini-lsm-starter/src/lsm_storage.rs b/mini-lsm-starter/src/lsm_storage.rs index c424cd806..ec8412339 100644 --- a/mini-lsm-starter/src/lsm_storage.rs +++ b/mini-lsm-starter/src/lsm_storage.rs @@ -6,7 +6,7 @@ use std::path::{Path, PathBuf}; use std::sync::atomic::AtomicUsize; use std::sync::Arc; -use anyhow::Result; +use anyhow::{Ok, Result}; use bytes::Bytes; use parking_lot::{Mutex, MutexGuard, RwLock}; @@ -16,11 +16,14 @@ use crate::compact::{ SimpleLeveledCompactionController, SimpleLeveledCompactionOptions, TieredCompactionController, }; use crate::iterators::merge_iterator::MergeIterator; +use crate::iterators::two_merge_iterator::TwoMergeIterator; +use crate::iterators::StorageIterator; +use crate::key::KeySlice; use crate::lsm_iterator::{FusedIterator, LsmIterator}; use crate::manifest::Manifest; use 
crate::mem_table::MemTable; use crate::mvcc::LsmMvccInner; -use crate::table::SsTable; +use crate::table::{SsTable, SsTableIterator}; pub type BlockCache = moka::sync::Cache<(usize, usize), Arc>; @@ -264,9 +267,9 @@ impl LsmStorageInner { /// Get a key from the storage. In day 7, this can be further optimized by using a bloom filter. pub fn get(&self, key: &[u8]) -> Result> { - let guard = self.state.read(); + let snapshot = self.state.read(); // search memtable firstly - if let Some(value) = guard.memtable.get(key) { + if let Some(value) = snapshot.memtable.get(key) { if value.is_empty() { return Ok(None); } @@ -274,7 +277,7 @@ impl LsmStorageInner { } // traverse imm-memtable - for memtable in guard.imm_memtables.iter() { + for memtable in snapshot.imm_memtables.iter() { if let Some(value) = memtable.get(key) { if value.is_empty() { return Ok(None); @@ -283,6 +286,23 @@ impl LsmStorageInner { } } + // create merge iterator for l0_sstables + let mut table_iters = Vec::with_capacity(snapshot.l0_sstables.len()); + for sst_id in snapshot.l0_sstables.iter() { + let sst = snapshot.sstables.get(sst_id).unwrap(); + let iter = + SsTableIterator::create_and_seek_to_key(sst.clone(), KeySlice::from_slice(key))?; + table_iters.push(Box::new(iter)); + } + + let merge_l0_sstable_iter = MergeIterator::create(table_iters); + if merge_l0_sstable_iter.is_valid() + && merge_l0_sstable_iter.key() == KeySlice::from_slice(key) + && !merge_l0_sstable_iter.value().is_empty() + { + return Ok(Some(Bytes::copy_from_slice(merge_l0_sstable_iter.value()))); + } + Ok(None) } @@ -380,8 +400,8 @@ impl LsmStorageInner { /// Create an iterator over a range of keys. 
pub fn scan( &self, - _lower: Bound<&[u8]>, - _upper: Bound<&[u8]>, + lower: Bound<&[u8]>, + upper: Bound<&[u8]>, ) -> Result> { let snapshot = { let guard = self.state.read(); @@ -390,12 +410,39 @@ impl LsmStorageInner { // need to get all memtables and imm_memtables let mut memtable_iters = Vec::with_capacity(snapshot.imm_memtables.len() + 1); - memtable_iters.push(Box::new(snapshot.memtable.scan(_lower, _upper))); + memtable_iters.push(Box::new(snapshot.memtable.scan(lower, upper))); for imm_memtable in snapshot.imm_memtables.iter() { - memtable_iters.push(Box::new(imm_memtable.scan(_lower, _upper))); + memtable_iters.push(Box::new(imm_memtable.scan(lower, upper))); } // using merge iterator to merge all iterators - let merge_iter = MergeIterator::create(memtable_iters); - Ok(FusedIterator::new(LsmIterator::new(merge_iter)?)) + let merge_memtable_iter = MergeIterator::create(memtable_iters); + + // using merge iterator to merge all sstables + let mut table_iters = Vec::with_capacity(snapshot.l0_sstables.len()); + for sst_id in snapshot.l0_sstables.iter() { + let sst = snapshot.sstables[sst_id].clone(); + // SST iterator does not support passing an end bound to it. + // Therefore, need to handle the end_bound manually in LsmIterator + let iter = match lower { + Bound::Included(key) => { + SsTableIterator::create_and_seek_to_key(sst, KeySlice::from_slice(key))? 
+ } + Bound::Excluded(key) => { + let mut iter = + SsTableIterator::create_and_seek_to_key(sst, KeySlice::from_slice(key))?; + if iter.is_valid() && iter.key() == KeySlice::from_slice(key) { + iter.next()?; + } + iter + } + Bound::Unbounded => SsTableIterator::create_and_seek_to_first(sst)?, + }; + table_iters.push(Box::new(iter)); + } + let merge_l0_sstable_iter = MergeIterator::create(table_iters); + + let two_merge_iter = TwoMergeIterator::create(merge_memtable_iter, merge_l0_sstable_iter)?; + + Ok(FusedIterator::new(LsmIterator::new(two_merge_iter)?)) } } diff --git a/mini-lsm-starter/src/table/builder.rs b/mini-lsm-starter/src/table/builder.rs index 7ddc72573..3f1264fc6 100644 --- a/mini-lsm-starter/src/table/builder.rs +++ b/mini-lsm-starter/src/table/builder.rs @@ -1,8 +1,8 @@ #![allow(unused_variables)] // TODO(you): remove this lint after implementing this mod #![allow(dead_code)] // TODO(you): remove this lint after implementing this mod +use std::path::Path; use std::sync::Arc; -use std::{borrow::Borrow, path::Path}; use anyhow::Result; use bytes::BufMut; diff --git a/mini-lsm-starter/src/tests.rs b/mini-lsm-starter/src/tests.rs index 590f76e7e..0eef03848 100644 --- a/mini-lsm-starter/src/tests.rs +++ b/mini-lsm-starter/src/tests.rs @@ -6,3 +6,4 @@ mod week1_day1; mod week1_day2; mod week1_day3; mod week1_day4; +mod week1_day5; diff --git a/mini-lsm-starter/src/tests/week1_day5.rs b/mini-lsm-starter/src/tests/week1_day5.rs new file mode 100644 index 000000000..1bf2fc5a6 --- /dev/null +++ b/mini-lsm-starter/src/tests/week1_day5.rs @@ -0,0 +1,254 @@ +use std::ops::Bound; +use std::sync::Arc; + +use self::harness::{check_iter_result_by_key, MockIterator}; +use self::harness::{check_lsm_iter_result_by_key, generate_sst}; +use bytes::Bytes; +use tempfile::tempdir; + +use super::*; +use crate::{ + iterators::two_merge_iterator::TwoMergeIterator, + lsm_storage::{LsmStorageInner, LsmStorageOptions}, +}; + +#[test] +fn test_task1_merge_1() { + let i1 = 
MockIterator::new(vec![ + (Bytes::from("a"), Bytes::from("1.1")), + (Bytes::from("b"), Bytes::from("2.1")), + (Bytes::from("c"), Bytes::from("3.1")), + ]); + let i2 = MockIterator::new(vec![ + (Bytes::from("a"), Bytes::from("1.2")), + (Bytes::from("b"), Bytes::from("2.2")), + (Bytes::from("c"), Bytes::from("3.2")), + (Bytes::from("d"), Bytes::from("4.2")), + ]); + let mut iter = TwoMergeIterator::create(i1, i2).unwrap(); + check_iter_result_by_key( + &mut iter, + vec![ + (Bytes::from("a"), Bytes::from("1.1")), + (Bytes::from("b"), Bytes::from("2.1")), + (Bytes::from("c"), Bytes::from("3.1")), + (Bytes::from("d"), Bytes::from("4.2")), + ], + ) +} + +#[test] +fn test_task1_merge_2() { + let i2 = MockIterator::new(vec![ + (Bytes::from("a"), Bytes::from("1.1")), + (Bytes::from("b"), Bytes::from("2.1")), + (Bytes::from("c"), Bytes::from("3.1")), + ]); + let i1 = MockIterator::new(vec![ + (Bytes::from("a"), Bytes::from("1.2")), + (Bytes::from("b"), Bytes::from("2.2")), + (Bytes::from("c"), Bytes::from("3.2")), + (Bytes::from("d"), Bytes::from("4.2")), + ]); + let mut iter = TwoMergeIterator::create(i1, i2).unwrap(); + check_iter_result_by_key( + &mut iter, + vec![ + (Bytes::from("a"), Bytes::from("1.2")), + (Bytes::from("b"), Bytes::from("2.2")), + (Bytes::from("c"), Bytes::from("3.2")), + (Bytes::from("d"), Bytes::from("4.2")), + ], + ) +} + +#[test] +fn test_task1_merge_3() { + let i2 = MockIterator::new(vec![ + (Bytes::from("a"), Bytes::from("1.1")), + (Bytes::from("b"), Bytes::from("2.1")), + (Bytes::from("c"), Bytes::from("3.1")), + ]); + let i1 = MockIterator::new(vec![ + (Bytes::from("b"), Bytes::from("2.2")), + (Bytes::from("c"), Bytes::from("3.2")), + (Bytes::from("d"), Bytes::from("4.2")), + ]); + let mut iter = TwoMergeIterator::create(i1, i2).unwrap(); + check_iter_result_by_key( + &mut iter, + vec![ + (Bytes::from("a"), Bytes::from("1.1")), + (Bytes::from("b"), Bytes::from("2.2")), + (Bytes::from("c"), Bytes::from("3.2")), + (Bytes::from("d"), 
Bytes::from("4.2")), + ], + ) +} + +#[test] +fn test_task1_merge_4() { + let i2 = MockIterator::new(vec![]); + let i1 = MockIterator::new(vec![ + (Bytes::from("b"), Bytes::from("2.2")), + (Bytes::from("c"), Bytes::from("3.2")), + (Bytes::from("d"), Bytes::from("4.2")), + ]); + let mut iter = TwoMergeIterator::create(i1, i2).unwrap(); + check_iter_result_by_key( + &mut iter, + vec![ + (Bytes::from("b"), Bytes::from("2.2")), + (Bytes::from("c"), Bytes::from("3.2")), + (Bytes::from("d"), Bytes::from("4.2")), + ], + ); + let i1 = MockIterator::new(vec![]); + let i2 = MockIterator::new(vec![ + (Bytes::from("b"), Bytes::from("2.2")), + (Bytes::from("c"), Bytes::from("3.2")), + (Bytes::from("d"), Bytes::from("4.2")), + ]); + let mut iter = TwoMergeIterator::create(i1, i2).unwrap(); + check_iter_result_by_key( + &mut iter, + vec![ + (Bytes::from("b"), Bytes::from("2.2")), + (Bytes::from("c"), Bytes::from("3.2")), + (Bytes::from("d"), Bytes::from("4.2")), + ], + ); +} + +#[test] +fn test_task1_merge_5() { + let i2 = MockIterator::new(vec![]); + let i1 = MockIterator::new(vec![]); + let mut iter = TwoMergeIterator::create(i1, i2).unwrap(); + check_iter_result_by_key(&mut iter, vec![]) +} + +#[test] +fn test_task2_storage_scan() { + let dir = tempdir().unwrap(); + let storage = + Arc::new(LsmStorageInner::open(&dir, LsmStorageOptions::default_for_week1_test()).unwrap()); + storage.put(b"1", b"233").unwrap(); + storage.put(b"2", b"2333").unwrap(); + storage.put(b"00", b"2333").unwrap(); + storage + .force_freeze_memtable(&storage.state_lock.lock()) + .unwrap(); + storage.put(b"3", b"23333").unwrap(); + storage.delete(b"1").unwrap(); + let sst1 = generate_sst( + 10, + dir.path().join("10.sst"), + vec![ + (Bytes::from_static(b"0"), Bytes::from_static(b"2333333")), + (Bytes::from_static(b"00"), Bytes::from_static(b"2333333")), + (Bytes::from_static(b"4"), Bytes::from_static(b"23")), + ], + Some(storage.block_cache.clone()), + ); + let sst2 = generate_sst( + 11, + 
dir.path().join("11.sst"), + vec![(Bytes::from_static(b"4"), Bytes::from_static(b""))], + Some(storage.block_cache.clone()), + ); + { + let mut state = storage.state.write(); + let mut snapshot = state.as_ref().clone(); + snapshot.l0_sstables.push(sst2.sst_id()); // this is the latest SST + snapshot.l0_sstables.push(sst1.sst_id()); + snapshot.sstables.insert(sst2.sst_id(), sst2.into()); + snapshot.sstables.insert(sst1.sst_id(), sst1.into()); + *state = snapshot.into(); + } + + println!("dump structure"); + storage.dump_structure(); + + check_lsm_iter_result_by_key( + &mut storage.scan(Bound::Unbounded, Bound::Unbounded).unwrap(), + vec![ + (Bytes::from("0"), Bytes::from("2333333")), + (Bytes::from("00"), Bytes::from("2333")), + (Bytes::from("2"), Bytes::from("2333")), + (Bytes::from("3"), Bytes::from("23333")), + ], + ); + check_lsm_iter_result_by_key( + &mut storage + .scan(Bound::Included(b"1"), Bound::Included(b"2")) + .unwrap(), + vec![(Bytes::from("2"), Bytes::from("2333"))], + ); + check_lsm_iter_result_by_key( + &mut storage + .scan(Bound::Excluded(b"1"), Bound::Excluded(b"3")) + .unwrap(), + vec![(Bytes::from("2"), Bytes::from("2333"))], + ); +} + +#[test] +fn test_task3_storage_get() { + let dir = tempdir().unwrap(); + let storage = + Arc::new(LsmStorageInner::open(&dir, LsmStorageOptions::default_for_week1_test()).unwrap()); + storage.put(b"1", b"233").unwrap(); + storage.put(b"2", b"2333").unwrap(); + storage.put(b"00", b"2333").unwrap(); + storage + .force_freeze_memtable(&storage.state_lock.lock()) + .unwrap(); + storage.put(b"3", b"23333").unwrap(); + storage.delete(b"1").unwrap(); + let sst1 = generate_sst( + 10, + dir.path().join("10.sst"), + vec![ + (Bytes::from_static(b"0"), Bytes::from_static(b"2333333")), + (Bytes::from_static(b"00"), Bytes::from_static(b"2333333")), + (Bytes::from_static(b"4"), Bytes::from_static(b"23")), + ], + Some(storage.block_cache.clone()), + ); + let sst2 = generate_sst( + 11, + dir.path().join("11.sst"), + 
vec![(Bytes::from_static(b"4"), Bytes::from_static(b""))], + Some(storage.block_cache.clone()), + ); + { + let mut state = storage.state.write(); + let mut snapshot = state.as_ref().clone(); + snapshot.l0_sstables.push(sst2.sst_id()); // this is the latest SST + snapshot.l0_sstables.push(sst1.sst_id()); + snapshot.sstables.insert(sst2.sst_id(), sst2.into()); + snapshot.sstables.insert(sst1.sst_id(), sst1.into()); + *state = snapshot.into(); + } + + assert_eq!( + storage.get(b"0").unwrap(), + Some(Bytes::from_static(b"2333333")) + ); + assert_eq!( + storage.get(b"00").unwrap(), + Some(Bytes::from_static(b"2333")) + ); + assert_eq!( + storage.get(b"2").unwrap(), + Some(Bytes::from_static(b"2333")) + ); + assert_eq!( + storage.get(b"3").unwrap(), + Some(Bytes::from_static(b"23333")) + ); + assert_eq!(storage.get(b"4").unwrap(), None); + assert_eq!(storage.get(b"--").unwrap(), None); + assert_eq!(storage.get(b"555").unwrap(), None); +} From 3f9a9ed99b6551ca5d21cfea7f8a70084370ed07 Mon Sep 17 00:00:00 2001 From: husharp Date: Wed, 7 Feb 2024 23:20:06 +0800 Subject: [PATCH 06/22] week1_day6 Signed-off-by: husharp --- Cargo.lock | 162 +++++++- mini-lsm-starter/Cargo.toml | 2 + mini-lsm-starter/src/bin/mini-lsm-cli.rs | 375 +++++++++++++----- mini-lsm-starter/src/block/iterator.rs | 8 +- mini-lsm-starter/src/compact.rs | 8 + .../src/iterators/merge_iterator.rs | 12 + .../src/iterators/two_merge_iterator.rs | 45 ++- mini-lsm-starter/src/lsm_iterator.rs | 43 +- mini-lsm-starter/src/lsm_storage.rs | 294 +++++++++----- mini-lsm-starter/src/mem_table.rs | 7 +- mini-lsm-starter/src/table/iterator.rs | 1 + mini-lsm-starter/src/tests.rs | 1 + mini-lsm-starter/src/tests/week1_day6.rs | 200 ++++++++++ 13 files changed, 934 insertions(+), 224 deletions(-) create mode 100644 mini-lsm-starter/src/tests/week1_day6.rs diff --git a/Cargo.lock b/Cargo.lock index 66dd13e06..fa83e9465 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -80,6 +80,12 @@ version = "1.3.2" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" +[[package]] +name = "bitflags" +version = "2.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed570934406eb16438a4e976b1b4500774099c13b8cb96eec99f620f05090ddf" + [[package]] name = "bumpalo" version = "3.11.1" @@ -175,6 +181,15 @@ version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "702fc72eb24e5a1e48ce58027a675bc24edd52096d5397d4aea7c6dd9eca0bd1" +[[package]] +name = "clipboard-win" +version = "5.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3ec832972fefb8cf9313b45a0d1945e29c9c251f1d4c6eafc5fe2124c02d2e81" +dependencies = [ + "error-code", +] + [[package]] name = "colorchoice" version = "1.0.0" @@ -268,6 +283,22 @@ version = "0.3.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a357d28ed41a50f9c765dbfe56cbc04a64e53e5fc58ba79fbc34c10ef3df831f" +[[package]] +name = "endian-type" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c34f04666d835ff5d62e058c3995147c06f42fe86ff053337632bca83e42702d" + +[[package]] +name = "errno" +version = "0.3.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a258e46cdc063eb8519c00b9fc845fc47bcfca4130e2f08e88665ceda8474245" +dependencies = [ + "libc", + "windows-sys 0.52.0", +] + [[package]] name = "error-chain" version = "0.12.4" @@ -277,6 +308,12 @@ dependencies = [ "version_check", ] +[[package]] +name = "error-code" +version = "3.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "281e452d3bad4005426416cdba5ccfd4f5c1280e10099e21db27f7c1c28347fc" + [[package]] name = "farmhash" version = "1.1.5" @@ -292,6 +329,17 @@ dependencies = [ "instant", ] +[[package]] +name = "fd-lock" +version = "4.0.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e5768da2206272c81ef0b5e951a41862938a6070da63bcea197899942d3b947" +dependencies = [ + "cfg-if", + "rustix", + "windows-sys 0.52.0", +] + [[package]] name = "getrandom" version = "0.2.8" @@ -324,6 +372,15 @@ dependencies = [ "libc", ] +[[package]] +name = "home" +version = "0.5.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3d1354bf6b7235cb4a0576c2619fd4ed18183f689b12b006a0ee7329eeff9a5" +dependencies = [ + "windows-sys 0.52.0", +] + [[package]] name = "instant" version = "0.1.12" @@ -365,9 +422,15 @@ checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" [[package]] name = "libc" -version = "0.2.139" +version = "0.2.153" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd" + +[[package]] +name = "linux-raw-sys" +version = "0.4.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "201de327520df007757c1f0adce6e827fe8562fbc28bfd9c15571c66ca1f5f79" +checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c" [[package]] name = "lock_api" @@ -469,9 +532,11 @@ dependencies = [ "crossbeam-skiplist", "farmhash", "moka", + "nom", "ouroboros", "parking_lot", "rand", + "rustyline", "serde", "serde_json", "tempfile", @@ -487,6 +552,12 @@ dependencies = [ "duct", ] +[[package]] +name = "minimal-lexical" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" + [[package]] name = "moka" version = "0.9.6" @@ -510,6 +581,36 @@ dependencies = [ "uuid", ] +[[package]] +name = "nibble_vec" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77a5d83df9f36fe23f0c3648c6bbb8b0298bb5f1939c8f2704431371f4b84d43" +dependencies = [ + "smallvec", +] + +[[package]] +name = "nix" 
+version = "0.27.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2eb04e9c688eff1c89d72b407f168cf79bb9e867a9d3323ed6c01519eb9cc053" +dependencies = [ + "bitflags 2.4.2", + "cfg-if", + "libc", +] + +[[package]] +name = "nom" +version = "7.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a" +dependencies = [ + "memchr", + "minimal-lexical", +] + [[package]] name = "num_cpus" version = "1.15.0" @@ -618,7 +719,7 @@ version = "0.9.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2d9cc634bc78768157b5cbfe988ffcd1dcba95cd2b2f03a88316c08c6d00ed63" dependencies = [ - "bitflags", + "bitflags 1.3.2", "memchr", "unicase", ] @@ -648,6 +749,16 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "radix_trie" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c069c179fcdc6a2fe24d8d18305cf085fdbd4f922c041943e203685d6a1c58fd" +dependencies = [ + "endian-type", + "nibble_vec", +] + [[package]] name = "rand" version = "0.8.5" @@ -684,7 +795,7 @@ version = "10.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a6823ea29436221176fe662da99998ad3b4db2c7f31e7b6f5fe43adccd6320bb" dependencies = [ - "bitflags", + "bitflags 1.3.2", ] [[package]] @@ -693,7 +804,7 @@ version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fb5a58c1855b4b6819d59012155603f0b22ad30cad752600aadfcb695265519a" dependencies = [ - "bitflags", + "bitflags 1.3.2", ] [[package]] @@ -714,6 +825,41 @@ dependencies = [ "semver", ] +[[package]] +name = "rustix" +version = "0.38.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ea3e1a662af26cd7a3ba09c0297a31af215563ecf42817c98df621387f4e949" +dependencies = [ + "bitflags 2.4.2", + "errno", + "libc", + "linux-raw-sys", + "windows-sys 0.52.0", +] + +[[package]] +name 
= "rustyline" +version = "13.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "02a2d683a4ac90aeef5b1013933f6d977bd37d51ff3f4dad829d4931a7e6be86" +dependencies = [ + "bitflags 2.4.2", + "cfg-if", + "clipboard-win", + "fd-lock", + "home", + "libc", + "log", + "memchr", + "nix", + "radix_trie", + "unicode-segmentation", + "unicode-width", + "utf8parse", + "winapi", +] + [[package]] name = "ryu" version = "1.0.12" @@ -920,6 +1066,12 @@ version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "84a22b9f218b40614adcb3f4ff08b703773ad44fa9423e4e0d346d5db86e4ebc" +[[package]] +name = "unicode-segmentation" +version = "1.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1dd624098567895118886609431a7c3b8f516e41d30e0643f03d94592a147e36" + [[package]] name = "unicode-width" version = "0.1.10" diff --git a/mini-lsm-starter/Cargo.toml b/mini-lsm-starter/Cargo.toml index 25943e510..e84ac93ff 100644 --- a/mini-lsm-starter/Cargo.toml +++ b/mini-lsm-starter/Cargo.toml @@ -19,6 +19,8 @@ crossbeam-channel = "0.5.11" serde_json = { version = "1.0" } serde = { version = "1.0", features = ["derive"] } farmhash = "1" +nom = "7.1.3" +rustyline = "13.0.0" [dev-dependencies] tempfile = "3" diff --git a/mini-lsm-starter/src/bin/mini-lsm-cli.rs b/mini-lsm-starter/src/bin/mini-lsm-cli.rs index f44ef912d..ee2290cb6 100644 --- a/mini-lsm-starter/src/bin/mini-lsm-cli.rs +++ b/mini-lsm-starter/src/bin/mini-lsm-cli.rs @@ -1,5 +1,4 @@ mod wrapper; -use wrapper::mini_lsm_wrapper; use anyhow::Result; use bytes::Bytes; @@ -10,7 +9,10 @@ use mini_lsm_wrapper::compact::{ }; use mini_lsm_wrapper::iterators::StorageIterator; use mini_lsm_wrapper::lsm_storage::{LsmStorageOptions, MiniLsm}; +use rustyline::DefaultEditor; use std::path::PathBuf; +use std::sync::Arc; +use wrapper::mini_lsm_wrapper; #[derive(Debug, Clone, ValueEnum)] enum CompactionStrategy { @@ -33,6 +35,279 @@ struct Args { serializable: 
bool, } +struct ReplHandler { + epoch: u64, + lsm: Arc, +} + +impl ReplHandler { + fn handle(&mut self, command: &Command) -> Result<()> { + match command { + Command::Fill { begin, end } => { + for i in *begin..=*end { + self.lsm.put( + format!("{}", i).as_bytes(), + format!("value{}@{}", i, self.epoch).as_bytes(), + )?; + } + + println!( + "{} values filled with epoch {}", + end - begin + 1, + self.epoch + ); + } + Command::Del { key } => { + self.lsm.delete(key.as_bytes())?; + println!("{} deleted", key); + } + Command::Get { key } => { + if let Some(value) = self.lsm.get(key.as_bytes())? { + println!("{}={:?}", key, value); + } else { + println!("{} not exist", key); + } + } + Command::Scan { begin, end } => match (begin, end) { + (None, None) => { + let mut iter = self + .lsm + .scan(std::ops::Bound::Unbounded, std::ops::Bound::Unbounded)?; + let mut cnt = 0; + while iter.is_valid() { + println!( + "{:?}={:?}", + Bytes::copy_from_slice(iter.key()), + Bytes::copy_from_slice(iter.value()), + ); + iter.next()?; + cnt += 1; + } + println!(); + println!("{} keys scanned", cnt); + } + (Some(begin), Some(end)) => { + let mut iter = self.lsm.scan( + std::ops::Bound::Included(begin.as_bytes()), + std::ops::Bound::Included(end.as_bytes()), + )?; + let mut cnt = 0; + while iter.is_valid() { + println!( + "{:?}={:?}", + Bytes::copy_from_slice(iter.key()), + Bytes::copy_from_slice(iter.value()), + ); + iter.next()?; + cnt += 1; + } + println!(); + println!("{} keys scanned", cnt); + } + _ => { + println!("invalid command"); + } + }, + Command::Dump => { + self.lsm.dump_structure(); + println!("dump success"); + } + Command::Flush => { + self.lsm.force_flush()?; + println!("flush success"); + } + Command::FullCompaction => { + self.lsm.force_full_compaction()?; + println!("full compaction success"); + } + Command::Quit | Command::Close => std::process::exit(0), + }; + + self.epoch += 1; + + Ok(()) + } +} + +#[derive(Debug)] +enum Command { + Fill { + begin: u64, + end: u64, 
+ }, + Del { + key: String, + }, + Get { + key: String, + }, + Scan { + begin: Option, + end: Option, + }, + + Dump, + Flush, + FullCompaction, + Quit, + Close, +} + +impl Command { + pub fn parse(input: &str) -> Result { + use nom::bytes::complete::*; + use nom::character::complete::*; + + use nom::branch::*; + use nom::combinator::*; + use nom::sequence::*; + + let uint = |i| { + map_res(digit1::<&str, nom::error::Error<_>>, |s: &str| { + s.parse() + .map_err(|_| nom::error::Error::new(s, nom::error::ErrorKind::Digit)) + })(i) + }; + + let string = |i| { + map(take_till1(|c: char| c.is_whitespace()), |s: &str| { + s.to_string() + })(i) + }; + + let fill = |i| { + map( + tuple((tag_no_case("fill"), space1, uint, space1, uint)), + |(_, _, key, _, value)| Command::Fill { + begin: key, + end: value, + }, + )(i) + }; + + let del = |i| { + map( + tuple((tag_no_case("del"), space1, string)), + |(_, _, key)| Command::Del { key }, + )(i) + }; + + let get = |i| { + map( + tuple((tag_no_case("get"), space1, string)), + |(_, _, key)| Command::Get { key }, + )(i) + }; + + let scan = |i| { + map( + tuple(( + tag_no_case("scan"), + opt(tuple((space1, string, space1, string))), + )), + |(_, opt_args)| { + let (begin, end) = opt_args + .map_or((None, None), |(_, begin, _, end)| (Some(begin), Some(end))); + Command::Scan { begin, end } + }, + )(i) + }; + + let command = |i| { + alt(( + fill, + del, + get, + scan, + map(tag_no_case("dump"), |_| Command::Dump), + map(tag_no_case("flush"), |_| Command::Flush), + map(tag_no_case("full_compaction"), |_| Command::FullCompaction), + map(tag_no_case("quit"), |_| Command::Quit), + map(tag_no_case("close"), |_| Command::Close), + ))(i) + }; + + command(input) + .map(|(_, c)| c) + .map_err(|e| anyhow::anyhow!("{}", e)) + } +} + +struct Repl { + app_name: String, + description: String, + prompt: String, + + handler: ReplHandler, + + editor: DefaultEditor, +} + +impl Repl { + pub fn run(mut self) -> Result<()> { + self.bootstrap()?; + + loop { 
+ let readline = self.editor.readline(&self.prompt)?; + if readline.trim().is_empty() { + // Skip noop + continue; + } + let command = Command::parse(&readline)?; + self.handler.handle(&command)?; + self.editor.add_history_entry(readline)?; + } + } + + fn bootstrap(&mut self) -> Result<()> { + println!("Welcome to {}!", self.app_name); + println!("{}", self.description); + println!(); + Ok(()) + } +} + +struct ReplBuilder { + app_name: String, + description: String, + prompt: String, +} + +impl ReplBuilder { + pub fn new() -> Self { + Self { + app_name: "mini-lsm-cli".to_string(), + description: "A CLI for mini-lsm".to_string(), + prompt: "mini-lsm-cli> ".to_string(), + } + } + + pub fn app_name(mut self, app_name: &str) -> Self { + self.app_name = app_name.to_string(); + self + } + + pub fn description(mut self, description: &str) -> Self { + self.description = description.to_string(); + self + } + + pub fn prompt(mut self, prompt: &str) -> Self { + self.prompt = prompt.to_string(); + self + } + + pub fn build(self, handler: ReplHandler) -> Result { + Ok(Repl { + app_name: self.app_name, + description: self.description, + prompt: self.prompt, + editor: DefaultEditor::new()?, + handler, + }) + } +} + fn main() -> Result<()> { let args = Args::parse(); let lsm = MiniLsm::open( @@ -69,97 +344,13 @@ fn main() -> Result<()> { serializable: args.serializable, }, )?; - let mut epoch = 0; - loop { - let mut line = String::new(); - std::io::stdin().read_line(&mut line)?; - let line = line.trim().to_string(); - if line.starts_with("fill ") { - let Some((_, options)) = line.split_once(' ') else { - println!("invalid command"); - continue; - }; - let Some((begin, end)) = options.split_once(' ') else { - println!("invalid command"); - continue; - }; - let begin = begin.parse::()?; - let end = end.parse::()?; - - for i in begin..=end { - lsm.put( - format!("{}", i).as_bytes(), - format!("value{}@{}", i, epoch).as_bytes(), - )?; - } - println!("{} values filled with epoch {}", 
end - begin + 1, epoch); - } else if line.starts_with("del ") { - let Some((_, key)) = line.split_once(' ') else { - println!("invalid command"); - continue; - }; - lsm.delete(key.as_bytes())?; - } else if line.starts_with("get ") { - let Some((_, key)) = line.split_once(' ') else { - println!("invalid command"); - continue; - }; - if let Some(value) = lsm.get(key.as_bytes())? { - println!("{}={:?}", key, value); - } else { - println!("{} not exist", key); - } - } else if line == "scan" { - let mut iter = lsm.scan(std::ops::Bound::Unbounded, std::ops::Bound::Unbounded)?; - let mut cnt = 0; - while iter.is_valid() { - println!( - "{:?}={:?}", - Bytes::copy_from_slice(iter.key()), - Bytes::copy_from_slice(iter.value()), - ); - iter.next()?; - cnt += 1; - } - println!("{} keys scanned", cnt); - } else if line.starts_with("scan ") { - let Some((_, rest)) = line.split_once(' ') else { - println!("invalid command"); - continue; - }; - let Some((begin_key, end_key)) = rest.split_once(' ') else { - println!("invalid command"); - continue; - }; - let mut iter = lsm.scan( - std::ops::Bound::Included(begin_key.as_bytes()), - std::ops::Bound::Included(end_key.as_bytes()), - )?; - let mut cnt = 0; - while iter.is_valid() { - println!( - "{:?}={:?}", - Bytes::copy_from_slice(iter.key()), - Bytes::copy_from_slice(iter.value()), - ); - iter.next()?; - cnt += 1; - } - println!("{} keys scanned", cnt); - } else if line == "dump" { - lsm.dump_structure(); - } else if line == "flush" { - lsm.force_flush()?; - } else if line == "full_compaction" { - lsm.force_full_compaction()?; - } else if line == "quit" || line == "close" { - lsm.close()?; - break; - } else { - println!("invalid command: {}", line); - } - epoch += 1; - } + let repl = ReplBuilder::new() + .app_name("mini-lsm-cli") + .description("A CLI for mini-lsm") + .prompt("mini-lsm-cli> ") + .build(ReplHandler { epoch: 0, lsm })?; + + repl.run()?; Ok(()) } diff --git a/mini-lsm-starter/src/block/iterator.rs 
b/mini-lsm-starter/src/block/iterator.rs index 1dde5abd3..eea23fd62 100644 --- a/mini-lsm-starter/src/block/iterator.rs +++ b/mini-lsm-starter/src/block/iterator.rs @@ -1,6 +1,3 @@ -#![allow(unused_variables)] // TODO(you): remove this lint after implementing this mod -#![allow(dead_code)] // TODO(you): remove this lint after implementing this mod - use std::sync::Arc; use bytes::Buf; @@ -50,16 +47,17 @@ impl BlockIterator { /// Returns the key of the current entry. pub fn key(&self) -> KeySlice { + debug_assert!(!self.key.is_empty(), "invalid iterator"); self.key.as_key_slice() } /// Returns the value of the current entry. pub fn value(&self) -> &[u8] { + debug_assert!(!self.key.is_empty(), "invalid iterator"); &self.block.data[self.value_range.0..self.value_range.1] } /// Returns true if the iterator is valid. - /// Note: You may want to make use of `key` pub fn is_valid(&self) -> bool { !self.key.is_empty() } @@ -111,7 +109,7 @@ impl BlockIterator { } /// Seek to the first key that >= `key`. - /// Note: You should assume the key-value pairs in the block are sorted when being added by + /// Note: we should assume the key-value pairs in the block are sorted when being added by /// callers. 
pub fn seek_to_key(&mut self, key: KeySlice) { let mut low = 0; diff --git a/mini-lsm-starter/src/compact.rs b/mini-lsm-starter/src/compact.rs index d52cc135f..faa9f6b86 100644 --- a/mini-lsm-starter/src/compact.rs +++ b/mini-lsm-starter/src/compact.rs @@ -145,6 +145,14 @@ impl LsmStorageInner { } fn trigger_flush(&self) -> Result<()> { + let res = { + let state = self.state.read(); + state.imm_memtables.len() >= self.options.num_memtable_limit + }; + if res { + self.force_flush_next_imm_memtable()?; + } + Ok(()) } diff --git a/mini-lsm-starter/src/iterators/merge_iterator.rs b/mini-lsm-starter/src/iterators/merge_iterator.rs index 0a2269ec0..1e14a6723 100644 --- a/mini-lsm-starter/src/iterators/merge_iterator.rs +++ b/mini-lsm-starter/src/iterators/merge_iterator.rs @@ -120,4 +120,16 @@ impl StorageIterator = KeySlice<'a>>> StorageIt Ok(()) } + + fn num_active_iterators(&self) -> usize { + self.iters + .iter() + .map(|x| x.1.num_active_iterators()) + .sum::() + + self + .current + .as_ref() + .map(|x| x.1.num_active_iterators()) + .unwrap_or(0) + } } diff --git a/mini-lsm-starter/src/iterators/two_merge_iterator.rs b/mini-lsm-starter/src/iterators/two_merge_iterator.rs index d22ebbb18..77cb3b5be 100644 --- a/mini-lsm-starter/src/iterators/two_merge_iterator.rs +++ b/mini-lsm-starter/src/iterators/two_merge_iterator.rs @@ -1,6 +1,3 @@ -#![allow(unused_variables)] // TODO(you): remove this lint after implementing this mod -#![allow(dead_code)] // TODO(you): remove this lint after implementing this mod - use anyhow::{Ok, Result}; use super::StorageIterator; @@ -19,11 +16,21 @@ impl< > TwoMergeIterator { pub fn create(a: A, b: B) -> Result { - Ok(Self { - choose_a: Self::choose_a(&a, &b), + let mut iter = Self { + choose_a: false, a, b, - }) + }; + iter.skip_b()?; + iter.choose_a = Self::choose_a(&iter.a, &iter.b); + Ok(iter) + } + + fn skip_b(&mut self) -> Result<()> { + if self.a.is_valid() && self.b.is_valid() && self.b.key() == self.a.key() { + self.b.next()?; + } 
+ Ok(()) } fn choose_a(a: &A, b: &B) -> bool { @@ -33,7 +40,7 @@ impl< if !b.is_valid() { return true; } - a.key() <= b.key() + a.key() < b.key() } } @@ -61,26 +68,32 @@ impl< } fn is_valid(&self) -> bool { - self.a.is_valid() || self.b.is_valid() + if !self.a.is_valid() && !self.b.is_valid() { + false + } else if self.choose_a { + self.a.is_valid() + } else { + self.b.is_valid() + } } fn next(&mut self) -> Result<()> { - println!("next"); - // skip same key for the next + // skip same key for b if self.choose_a { - if self.a.is_valid() && self.b.is_valid() && self.a.key() == self.b.key() { - self.b.next()?; - } self.a.next()?; } else { - if self.a.is_valid() && self.b.is_valid() && self.a.key() == self.b.key() { - self.a.next()?; - } self.b.next()?; } + if self.a.is_valid() && self.b.is_valid() && self.a.key() == self.b.key() { + self.b.next()?; + } self.choose_a = Self::choose_a(&self.a, &self.b); Ok(()) } + + fn num_active_iterators(&self) -> usize { + self.a.num_active_iterators() + self.b.num_active_iterators() + } } diff --git a/mini-lsm-starter/src/lsm_iterator.rs b/mini-lsm-starter/src/lsm_iterator.rs index 4e3a55047..8b51e9062 100644 --- a/mini-lsm-starter/src/lsm_iterator.rs +++ b/mini-lsm-starter/src/lsm_iterator.rs @@ -1,4 +1,7 @@ +use std::ops::Bound; + use anyhow::{bail, Ok, Result}; +use bytes::Bytes; use crate::{ iterators::{ @@ -15,18 +18,38 @@ type LsmIteratorInner = pub struct LsmIterator { inner: LsmIteratorInner, + end_bound: Bound, + is_valid: bool, } impl LsmIterator { - pub(crate) fn new(iter: LsmIteratorInner) -> Result { - let mut iter = Self { inner: iter }; + pub(crate) fn new(iter: LsmIteratorInner, end_bound: Bound) -> Result { + let mut iter = Self { + is_valid: iter.is_valid(), + inner: iter, + end_bound, + }; iter.move_to_non_delete()?; Ok(iter) } fn move_to_non_delete(&mut self) -> Result<()> { - while self.is_valid() && self.value().is_empty() { - self.next()?; + while self.is_valid() && self.inner.value().is_empty() { + 
self.next_inner()?; + } + Ok(()) + } + + fn next_inner(&mut self) -> Result<()> { + self.inner.next()?; + if !self.inner.is_valid() { + self.is_valid = false; + return Ok(()); + } + match self.end_bound.as_ref() { + Bound::Unbounded => {} + Bound::Included(key) => self.is_valid = self.inner.key().raw_ref() <= key.as_ref(), + Bound::Excluded(key) => self.is_valid = self.inner.key().raw_ref() < key.as_ref(), } Ok(()) } @@ -36,7 +59,7 @@ impl StorageIterator for LsmIterator { type KeyType<'a> = &'a [u8]; fn is_valid(&self) -> bool { - self.inner.is_valid() + self.is_valid } fn key(&self) -> &[u8] { @@ -48,11 +71,15 @@ impl StorageIterator for LsmIterator { } fn next(&mut self) -> Result<()> { - self.inner.next()?; + self.next_inner()?; // move to the next non-delete entry self.move_to_non_delete()?; Ok(()) } + + fn num_active_iterators(&self) -> usize { + self.inner.num_active_iterators() + } } /// A wrapper around existing iterator, will prevent users from calling `next` when the iterator is @@ -97,4 +124,8 @@ impl StorageIterator for FusedIterator { } Ok(()) } + + fn num_active_iterators(&self) -> usize { + self.iter.num_active_iterators() + } } diff --git a/mini-lsm-starter/src/lsm_storage.rs b/mini-lsm-starter/src/lsm_storage.rs index ec8412339..827290910 100644 --- a/mini-lsm-starter/src/lsm_storage.rs +++ b/mini-lsm-starter/src/lsm_storage.rs @@ -21,9 +21,9 @@ use crate::iterators::StorageIterator; use crate::key::KeySlice; use crate::lsm_iterator::{FusedIterator, LsmIterator}; use crate::manifest::Manifest; -use crate::mem_table::MemTable; +use crate::mem_table::{map_bound, MemTable}; use crate::mvcc::LsmMvccInner; -use crate::table::{SsTable, SsTableIterator}; +use crate::table::{SsTable, SsTableBuilder, SsTableIterator}; pub type BlockCache = moka::sync::Cache<(usize, usize), Arc>; @@ -152,7 +152,8 @@ impl Drop for MiniLsm { impl MiniLsm { pub fn close(&self) -> Result<()> { - unimplemented!() + self.flush_notifier.send(()).ok(); + Ok(()) } /// Start the 
storage engine by either loading an existing directory or creating a new one if the directory does @@ -231,6 +232,9 @@ impl LsmStorageInner { /// not exist. pub(crate) fn open(path: impl AsRef, options: LsmStorageOptions) -> Result { let path = path.as_ref(); + if !path.exists() { + std::fs::create_dir_all(path)?; + } let state = LsmStorageState::create(&options); let compaction_controller = match &options.compaction_options { @@ -265,88 +269,14 @@ impl LsmStorageInner { unimplemented!() } - /// Get a key from the storage. In day 7, this can be further optimized by using a bloom filter. - pub fn get(&self, key: &[u8]) -> Result> { - let snapshot = self.state.read(); - // search memtable firstly - if let Some(value) = snapshot.memtable.get(key) { - if value.is_empty() { - return Ok(None); - } - return Ok(Some(value)); - } - - // traverse imm-memtable - for memtable in snapshot.imm_memtables.iter() { - if let Some(value) = memtable.get(key) { - if value.is_empty() { - return Ok(None); - } - return Ok(Some(value)); - } - } - - // create merge iterator for l0_sstables - let mut table_iters = Vec::with_capacity(snapshot.l0_sstables.len()); - for sst_id in snapshot.l0_sstables.iter() { - let sst = snapshot.sstables.get(sst_id).unwrap(); - let iter = - SsTableIterator::create_and_seek_to_key(sst.clone(), KeySlice::from_slice(key))?; - table_iters.push(Box::new(iter)); - } - - let merge_l0_sstable_iter = MergeIterator::create(table_iters); - if merge_l0_sstable_iter.is_valid() - && merge_l0_sstable_iter.key() == KeySlice::from_slice(key) - && !merge_l0_sstable_iter.value().is_empty() - { - return Ok(Some(Bytes::copy_from_slice(merge_l0_sstable_iter.value()))); - } - - Ok(None) - } - - /// Write a batch of data into the storage. Implement in week 2 day 7. - pub fn write_batch>(&self, _batch: &[WriteBatchRecord]) -> Result<()> { - unimplemented!() - } - - /// Put a key-value pair into the storage by writing into the current memtable. 
- pub fn put(&self, key: &[u8], value: &[u8]) -> Result<()> { - let size; - { - let guard = self.state.write(); - guard.memtable.put(key, value)?; - size = guard.memtable.approximate_size(); - } - - self.try_freeze(size)?; - - Ok(()) - } - - /// Remove a key from the storage by writing an empty value. - pub fn delete(&self, key: &[u8]) -> Result<()> { - let size; - { - let guard = self.state.write(); - guard.memtable.put(key, b"")?; - size = guard.memtable.approximate_size(); - } - - self.try_freeze(size)?; - - Ok(()) - } - pub fn try_freeze(&self, size: usize) -> Result<()> { // using double check for concurrency if size >= self.options.target_sst_size { let state_lock = self.state_lock.lock(); - let guard = self.state.read(); + let snapshot = self.state.read(); - if guard.memtable.approximate_size() >= self.options.target_sst_size { - drop(guard); + if snapshot.memtable.approximate_size() >= self.options.target_sst_size { + drop(snapshot); self.force_freeze_memtable(&state_lock)?; } } @@ -361,6 +291,10 @@ impl LsmStorageInner { Self::path_of_sst_static(&self.path, id) } + pub(crate) fn path(&self) -> &Path { + &self.path + } + pub(crate) fn path_of_wal_static(path: impl AsRef, id: usize) -> PathBuf { path.as_ref().join(format!("{:05}.wal", id)) } @@ -389,7 +323,39 @@ impl LsmStorageInner { /// Force flush the earliest-created immutable memtable to disk pub fn force_flush_next_imm_memtable(&self) -> Result<()> { - unimplemented!() + let _state_lock = self.state_lock.lock(); + + let last_imm_memtable; + { + let snapshot = self.state.read(); + last_imm_memtable = snapshot + .imm_memtables + .last() + .expect("no imm memtables!") + .clone(); + } + + // build new sstable + let mut builder = SsTableBuilder::new(self.options.block_size); + last_imm_memtable.flush(&mut builder)?; + let new_sst = Arc::new(builder.build( + last_imm_memtable.id(), + Some(self.block_cache.clone()), + self.path_of_sst(last_imm_memtable.id()), + )?); + + { + let mut guard = self.state.write(); 
+ let mut snapshot = guard.as_ref().clone(); + let sst_id = snapshot.imm_memtables.pop().unwrap().id(); + println!("flushed {}.sst with size={}", sst_id, new_sst.table_size()); + snapshot.sstables.insert(sst_id, new_sst); + // L0 SSTs are sorted by creation time, from latest to earliest. + snapshot.l0_sstables.insert(0, sst_id); + *guard = Arc::new(snapshot); + } + + Ok(()) } pub fn new_txn(&self) -> Result<()> { @@ -397,6 +363,51 @@ impl LsmStorageInner { Ok(()) } + /// Get a key from the storage. In day 7, this can be further optimized by using a bloom filter. + pub fn get(&self, key: &[u8]) -> Result> { + let snapshot = self.state.read(); + // search memtable firstly + if let Some(value) = snapshot.memtable.get(key) { + if value.is_empty() { + return Ok(None); + } + return Ok(Some(value)); + } + + // traverse imm-memtable + for memtable in snapshot.imm_memtables.iter() { + if let Some(value) = memtable.get(key) { + if value.is_empty() { + return Ok(None); + } + return Ok(Some(value)); + } + } + + // create merge iterator for l0_sstables + let mut table_iters = Vec::with_capacity(snapshot.l0_sstables.len()); + for sst_id in snapshot.l0_sstables.iter() { + let sst = snapshot.sstables.get(sst_id).unwrap(); + if key_within(key, sst.first_key().raw_ref(), sst.last_key().raw_ref()) { + let iter = SsTableIterator::create_and_seek_to_key( + sst.clone(), + KeySlice::from_slice(key), + )?; + table_iters.push(Box::new(iter)); + } + } + + let merge_l0_sstable_iter = MergeIterator::create(table_iters); + if merge_l0_sstable_iter.is_valid() + && merge_l0_sstable_iter.key() == KeySlice::from_slice(key) + && !merge_l0_sstable_iter.value().is_empty() + { + return Ok(Some(Bytes::copy_from_slice(merge_l0_sstable_iter.value()))); + } + + Ok(None) + } + /// Create an iterator over a range of keys. 
pub fn scan( &self, @@ -421,28 +432,115 @@ impl LsmStorageInner { let mut table_iters = Vec::with_capacity(snapshot.l0_sstables.len()); for sst_id in snapshot.l0_sstables.iter() { let sst = snapshot.sstables[sst_id].clone(); - // SST iterator does not support passing an end bound to it. - // Therefore, need to handle the end_bound manually in LsmIterator - let iter = match lower { - Bound::Included(key) => { - SsTableIterator::create_and_seek_to_key(sst, KeySlice::from_slice(key))? - } - Bound::Excluded(key) => { - let mut iter = - SsTableIterator::create_and_seek_to_key(sst, KeySlice::from_slice(key))?; - if iter.is_valid() && iter.key() == KeySlice::from_slice(key) { - iter.next()?; + println!("sst_id: {}, scan range: {:?} {:?}", sst_id, lower, upper); + // filter out some SSTs that do not contain the key range + if check_intersect_of_range( + lower, + upper, + sst.first_key().raw_ref(), + sst.last_key().raw_ref(), + ) { + // SST iterator does not support passing an end bound to it. + // Therefore, need to handle the end_bound manually in LsmIterator + let iter = match lower { + Bound::Included(key) => { + SsTableIterator::create_and_seek_to_key(sst, KeySlice::from_slice(key))? 
} - iter - } - Bound::Unbounded => SsTableIterator::create_and_seek_to_first(sst)?, - }; - table_iters.push(Box::new(iter)); + Bound::Excluded(key) => { + let mut iter = SsTableIterator::create_and_seek_to_key( + sst, + KeySlice::from_slice(key), + )?; + if iter.is_valid() && iter.key() == KeySlice::from_slice(key) { + iter.next()?; + } + iter + } + Bound::Unbounded => SsTableIterator::create_and_seek_to_first(sst)?, + }; + + table_iters.push(Box::new(iter)); + } } let merge_l0_sstable_iter = MergeIterator::create(table_iters); let two_merge_iter = TwoMergeIterator::create(merge_memtable_iter, merge_l0_sstable_iter)?; + Ok(FusedIterator::new(LsmIterator::new( + two_merge_iter, + map_bound(upper), + )?)) + } - Ok(FusedIterator::new(LsmIterator::new(two_merge_iter)?)) + /// Write a batch of data into the storage. Implement in week 2 day 7. + pub fn write_batch>(&self, _batch: &[WriteBatchRecord]) -> Result<()> { + unimplemented!() } + + /// Put a key-value pair into the storage by writing into the current memtable. + pub fn put(&self, key: &[u8], value: &[u8]) -> Result<()> { + let size; + { + let snapshot = self.state.write(); + snapshot.memtable.put(key, value)?; + size = snapshot.memtable.approximate_size(); + } + + self.try_freeze(size)?; + + Ok(()) + } + + /// Remove a key from the storage by writing an empty value. + pub fn delete(&self, key: &[u8]) -> Result<()> { + let size; + { + let snapshot = self.state.write(); + snapshot.memtable.put(key, b"")?; + size = snapshot.memtable.approximate_size(); + } + + self.try_freeze(size)?; + + Ok(()) + } +} + +// utils + +// key_within checks if the user's key is within the SST's key range. +fn key_within(key: &[u8], sst_begin: &[u8], sst_end: &[u8]) -> bool { + sst_begin <= key && key <= sst_end +} + +// Check if user's `key_range` intersects with the SST's key range. 
+fn check_intersect_of_range( + begin: Bound<&[u8]>, + end: Bound<&[u8]>, + sst_begin: &[u8], + sst_end: &[u8], +) -> bool { + println!( + "intersected: {:?} {:?}, sst: {:?} {:?}", + begin, end, sst_begin, sst_end + ); + match end { + Bound::Excluded(key) if key <= sst_begin => { + return false; + } + Bound::Included(key) if key < sst_begin => { + return false; + } + _ => {} + } + match begin { + Bound::Excluded(key) if sst_end <= key => { + return false; + } + Bound::Included(key) if sst_end < key => { + return false; + } + _ => {} + } + + true } diff --git a/mini-lsm-starter/src/mem_table.rs b/mini-lsm-starter/src/mem_table.rs index c303a3900..b92c60022 100644 --- a/mini-lsm-starter/src/mem_table.rs +++ b/mini-lsm-starter/src/mem_table.rs @@ -114,8 +114,11 @@ impl MemTable { } /// Flush the mem-table to SSTable. Implement in week 1 day 6. - pub fn flush(&self, _builder: &mut SsTableBuilder) -> Result<()> { - unimplemented!() + pub fn flush(&self, builder: &mut SsTableBuilder) -> Result<()> { + self.map + .iter() + .for_each(|entry| builder.add(KeySlice::from_slice(entry.key()), entry.value())); + Ok(()) } pub fn id(&self) -> usize { diff --git a/mini-lsm-starter/src/table/iterator.rs b/mini-lsm-starter/src/table/iterator.rs index f12b8eb38..ad8eff595 100644 --- a/mini-lsm-starter/src/table/iterator.rs +++ b/mini-lsm-starter/src/table/iterator.rs @@ -80,6 +80,7 @@ impl StorageIterator for SsTableIterator { /// Return the `key` that's held by the underlying block iterator. 
fn key(&self) -> KeySlice { + debug_assert!(!self.blk_iter.key().is_empty(), "invalid iterator"); self.blk_iter.key() } diff --git a/mini-lsm-starter/src/tests.rs b/mini-lsm-starter/src/tests.rs index 0eef03848..0afbac661 100644 --- a/mini-lsm-starter/src/tests.rs +++ b/mini-lsm-starter/src/tests.rs @@ -7,3 +7,4 @@ mod week1_day2; mod week1_day3; mod week1_day4; mod week1_day5; +mod week1_day6; diff --git a/mini-lsm-starter/src/tests/week1_day6.rs b/mini-lsm-starter/src/tests/week1_day6.rs new file mode 100644 index 000000000..12bda859f --- /dev/null +++ b/mini-lsm-starter/src/tests/week1_day6.rs @@ -0,0 +1,200 @@ +use std::{ops::Bound, sync::Arc, time::Duration}; + +use bytes::Bytes; +use tempfile::tempdir; + +use self::harness::{check_lsm_iter_result_by_key, sync}; + +use super::*; +use crate::{ + iterators::StorageIterator, + lsm_storage::{LsmStorageInner, LsmStorageOptions, MiniLsm}, +}; + +#[test] +fn test_task1_storage_scan() { + let dir = tempdir().unwrap(); + let storage = + Arc::new(LsmStorageInner::open(&dir, LsmStorageOptions::default_for_week1_test()).unwrap()); + storage.put(b"0", b"2333333").unwrap(); + storage.put(b"00", b"2333333").unwrap(); + storage.put(b"4", b"23").unwrap(); + sync(&storage); + + storage.delete(b"4").unwrap(); + sync(&storage); + + storage.put(b"1", b"233").unwrap(); + storage.put(b"2", b"2333").unwrap(); + storage + .force_freeze_memtable(&storage.state_lock.lock()) + .unwrap(); + storage.put(b"00", b"2333").unwrap(); + storage + .force_freeze_memtable(&storage.state_lock.lock()) + .unwrap(); + storage.put(b"3", b"23333").unwrap(); + storage.delete(b"1").unwrap(); + + { + let state = storage.state.read(); + assert_eq!(state.l0_sstables.len(), 2); + assert_eq!(state.imm_memtables.len(), 2); + } + + storage.dump_structure(); + check_lsm_iter_result_by_key( + &mut storage.scan(Bound::Unbounded, Bound::Unbounded).unwrap(), + vec![ + (Bytes::from("0"), Bytes::from("2333333")), + (Bytes::from("00"), Bytes::from("2333")), + 
(Bytes::from("2"), Bytes::from("2333")), + (Bytes::from("3"), Bytes::from("23333")), + ], + ); + check_lsm_iter_result_by_key( + &mut storage + .scan(Bound::Included(b"1"), Bound::Included(b"2")) + .unwrap(), + vec![(Bytes::from("2"), Bytes::from("2333"))], + ); + check_lsm_iter_result_by_key( + &mut storage + .scan(Bound::Excluded(b"1"), Bound::Excluded(b"3")) + .unwrap(), + vec![(Bytes::from("2"), Bytes::from("2333"))], + ); +} + +#[test] +fn test_task1_storage_get() { + let dir = tempdir().unwrap(); + let storage = + Arc::new(LsmStorageInner::open(&dir, LsmStorageOptions::default_for_week1_test()).unwrap()); + storage.put(b"0", b"2333333").unwrap(); + storage.put(b"00", b"2333333").unwrap(); + storage.put(b"4", b"23").unwrap(); + sync(&storage); + + storage.delete(b"4").unwrap(); + sync(&storage); + + storage.put(b"1", b"233").unwrap(); + storage.put(b"2", b"2333").unwrap(); + storage + .force_freeze_memtable(&storage.state_lock.lock()) + .unwrap(); + storage.put(b"00", b"2333").unwrap(); + storage + .force_freeze_memtable(&storage.state_lock.lock()) + .unwrap(); + storage.put(b"3", b"23333").unwrap(); + storage.delete(b"1").unwrap(); + + storage.dump_structure(); + + { + let state = storage.state.read(); + assert_eq!(state.l0_sstables.len(), 2); + assert_eq!(state.imm_memtables.len(), 2); + } + + assert_eq!( + storage.get(b"0").unwrap(), + Some(Bytes::from_static(b"2333333")) + ); + assert_eq!( + storage.get(b"00").unwrap(), + Some(Bytes::from_static(b"2333")) + ); + assert_eq!( + storage.get(b"2").unwrap(), + Some(Bytes::from_static(b"2333")) + ); + assert_eq!( + storage.get(b"3").unwrap(), + Some(Bytes::from_static(b"23333")) + ); + assert_eq!(storage.get(b"4").unwrap(), None); + assert_eq!(storage.get(b"--").unwrap(), None); + assert_eq!(storage.get(b"555").unwrap(), None); +} + +#[test] +fn test_task2_auto_flush() { + let dir = tempdir().unwrap(); + let storage = MiniLsm::open(&dir, LsmStorageOptions::default_for_week1_day6_test()).unwrap(); + + let value = 
"1".repeat(1024); // 1KB + + // approximately 6MB + for i in 0..6000 { + storage + .put(format!("{i}").as_bytes(), value.as_bytes()) + .unwrap(); + } + + std::thread::sleep(Duration::from_millis(500)); + + assert!(!storage.inner.state.read().l0_sstables.is_empty()); +} + +#[test] +fn test_task3_sst_filter() { + let dir = tempdir().unwrap(); + let storage = + Arc::new(LsmStorageInner::open(&dir, LsmStorageOptions::default_for_week1_test()).unwrap()); + + for i in 1..=10000 { + if i % 1000 == 0 { + sync(&storage); + } + storage + .put(format!("{:05}", i).as_bytes(), b"2333333") + .unwrap(); + } + + let iter = storage.scan(Bound::Unbounded, Bound::Unbounded).unwrap(); + assert!( + iter.num_active_iterators() >= 10, + "did you implement num_active_iterators? current active iterators = {}", + iter.num_active_iterators() + ); + let max_num = iter.num_active_iterators(); + let iter = storage + .scan( + Bound::Excluded(format!("{:05}", 10000).as_bytes()), + Bound::Unbounded, + ) + .unwrap(); + assert!(iter.num_active_iterators() < max_num); + let min_num = iter.num_active_iterators(); + let iter = storage + .scan( + Bound::Unbounded, + Bound::Excluded(format!("{:05}", 1).as_bytes()), + ) + .unwrap(); + assert_eq!(iter.num_active_iterators(), min_num); + let iter = storage + .scan( + Bound::Unbounded, + Bound::Included(format!("{:05}", 0).as_bytes()), + ) + .unwrap(); + assert_eq!(iter.num_active_iterators(), min_num); + let iter = storage + .scan( + Bound::Included(format!("{:05}", 10001).as_bytes()), + Bound::Unbounded, + ) + .unwrap(); + assert_eq!(iter.num_active_iterators(), min_num); + let iter = storage + .scan( + Bound::Included(format!("{:05}", 5000).as_bytes()), + Bound::Excluded(format!("{:05}", 6000).as_bytes()), + ) + .unwrap(); + assert!(min_num < iter.num_active_iterators() && iter.num_active_iterators() < max_num); +} From c43fd77cbee30198a240fc874d021d292c5bb4fe Mon Sep 17 00:00:00 2001 From: husharp Date: Thu, 8 Feb 2024 14:12:38 +0800 Subject: [PATCH 
07/22] week1_day7 Signed-off-by: husharp --- mini-lsm-starter/src/lsm_storage.rs | 22 ++++-- mini-lsm-starter/src/table.rs | 27 ++++--- mini-lsm-starter/src/table/bloom.rs | 61 ++++++++++++++-- mini-lsm-starter/src/table/builder.rs | 34 +++++---- mini-lsm-starter/src/tests.rs | 1 + mini-lsm-starter/src/tests/harness.rs | 2 +- mini-lsm-starter/src/tests/week1_day7.rs | 91 ++++++++++++++++++++++++ 7 files changed, 203 insertions(+), 35 deletions(-) create mode 100644 mini-lsm-starter/src/tests/week1_day7.rs diff --git a/mini-lsm-starter/src/lsm_storage.rs b/mini-lsm-starter/src/lsm_storage.rs index 827290910..639339e28 100644 --- a/mini-lsm-starter/src/lsm_storage.rs +++ b/mini-lsm-starter/src/lsm_storage.rs @@ -388,12 +388,24 @@ impl LsmStorageInner { let mut table_iters = Vec::with_capacity(snapshot.l0_sstables.len()); for sst_id in snapshot.l0_sstables.iter() { let sst = snapshot.sstables.get(sst_id).unwrap(); + // check if the key is within the SST's key range if key_within(key, sst.first_key().raw_ref(), sst.last_key().raw_ref()) { - let iter = SsTableIterator::create_and_seek_to_key( - sst.clone(), - KeySlice::from_slice(key), - )?; - table_iters.push(Box::new(iter)); + // bloom filter check + if let Some(bloom) = &sst.bloom { + if bloom.may_contain(farmhash::fingerprint32(key)) { + let iter = SsTableIterator::create_and_seek_to_key( + sst.clone(), + KeySlice::from_slice(key), + )?; + table_iters.push(Box::new(iter)); + } + } else { + let iter = SsTableIterator::create_and_seek_to_key( + sst.clone(), + KeySlice::from_slice(key), + )?; + table_iters.push(Box::new(iter)); + } } } diff --git a/mini-lsm-starter/src/table.rs b/mini-lsm-starter/src/table.rs index e792de605..497e03b34 100644 --- a/mini-lsm-starter/src/table.rs +++ b/mini-lsm-starter/src/table.rs @@ -132,26 +132,33 @@ impl SsTable { /// Open SSTable from a file. 
pub fn open(id: usize, block_cache: Option>, file: FileObject) -> Result { let len = file.size(); + // decode bloom filter // u32 for extra info - let raw_metadata_offset = file.read(len - 4, 4)?; - let metadat_offset = (&raw_metadata_offset[..]).get_u32() as u64; - let raw_metadata = file.read( - metadat_offset, - len - metadat_offset - 4, /* extra size */ - )?; - let block_metas = BlockMeta::decode_block_metas(raw_metadata.as_slice())?; + let raw_bloom_offset = file.read(len - 4, 4)?; + let bloom_offset = (&raw_bloom_offset[..]).get_u32() as u64; + let raw_bloom = file.read(bloom_offset, len - bloom_offset - 4 /* extra size */)?; + let bloom = Bloom::decode(raw_bloom.as_slice())?; + println!("decode bloom size: {}, k={}", bloom.filter.len(), bloom.k); - let raw_data = file.read(0, metadat_offset)?; + // decode block meta + // u32 for extra info + let raw_block_meta_offset = file.read(bloom_offset - 4, 4)?; + let block_meta_offset = (&raw_block_meta_offset[..]).get_u32() as u64; + let raw_block_meta = file.read(block_meta_offset, bloom_offset - 4 - block_meta_offset)?; + let block_metas = BlockMeta::decode_block_metas(raw_block_meta.as_slice())?; + + // decode data blocks + let raw_data = file.read(0, block_meta_offset)?; let sst_table = SsTable { id, file, - block_meta_offset: metadat_offset as usize, + block_meta_offset: block_meta_offset as usize, first_key: block_metas.first().unwrap().first_key.clone(), last_key: block_metas.last().unwrap().last_key.clone(), block_metas, block_cache, - bloom: None, + bloom: Some(bloom), max_ts: 0, }; Ok(sst_table) diff --git a/mini-lsm-starter/src/table/bloom.rs b/mini-lsm-starter/src/table/bloom.rs index 1f2f453f6..fbd46ec4a 100644 --- a/mini-lsm-starter/src/table/bloom.rs +++ b/mini-lsm-starter/src/table/bloom.rs @@ -3,7 +3,8 @@ use anyhow::Result; use bytes::{BufMut, Bytes, BytesMut}; -/// Implements a bloom filter +/// Bloom implements bloom filter functionalities over +/// a bit-slice of data. 
pub struct Bloom { /// data of filter in bits pub(crate) filter: Bytes, @@ -62,10 +63,10 @@ impl Bloom { } /// Get bloom filter bits per key from entries count and FPR - pub fn bloom_bits_per_key(entries: usize, false_positive_rate: f64) -> usize { + pub fn bloom_bits_per_key(hashs_count: usize, false_positive_rate: f64) -> usize { let size = - -1.0 * (entries as f64) * false_positive_rate.ln() / std::f64::consts::LN_2.powi(2); - let locs = (size / (entries as f64)).ceil(); + -1.0 * (hashs_count as f64) * false_positive_rate.ln() / std::f64::consts::LN_2.powi(2); + let locs = (std::f64::consts::LN_2 * size / (hashs_count as f64)).ceil(); locs as usize } @@ -79,7 +80,17 @@ impl Bloom { let mut filter = BytesMut::with_capacity(nbytes); filter.resize(nbytes, 0); - // TODO: build the bloom filter + keys.iter().for_each(|h| { + /* h is the key hash */ + let mut h = *h; + let delta = (h >> 17) | (h << 15); + for j in 0..k { + let bitpos = delta as usize % nbits; + filter.set_bit(bitpos, true); + // we can add delta to uniformly distribute the bits + h = h.wrapping_add(delta); + } + }); Self { filter: filter.freeze(), @@ -88,7 +99,7 @@ impl Bloom { } /// Check if a bloom filter may contain some data - pub fn may_contain(&self, h: u32) -> bool { + pub fn may_contain(&self, mut h: u32) -> bool { if self.k > 30 { // potential new encoding for short bloom filters true @@ -96,9 +107,45 @@ impl Bloom { let nbits = self.filter.bit_len(); let delta = (h >> 17) | (h << 15); - // TODO: probe the bloom filter + for j in 0..self.k { + let bitpos = delta as usize % nbits; + if !self.filter.get_bit(bitpos) { + return false; + } + // we can add delta to uniformly distribute the bits + h = h.wrapping_add(delta); + } true } } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_small_bloom_filter() { + let hash: Vec = vec![b"hello".to_vec(), b"world".to_vec()] + .into_iter() + .map(|x| farmhash::fingerprint32(&x)) + .collect(); + let bloom = 
Bloom::build_from_key_hashes(&hash, 10); + + let check_hash: Vec = vec![ + b"hello".to_vec(), + b"world".to_vec(), + b"x".to_vec(), + b"fool".to_vec(), + ] + .into_iter() + .map(|x| farmhash::fingerprint32(&x)) + .collect(); + + assert!(bloom.may_contain(check_hash[0])); + assert!(bloom.may_contain(check_hash[1])); + assert!(!bloom.may_contain(check_hash[2])); + assert!(!bloom.may_contain(check_hash[3])); + } +} diff --git a/mini-lsm-starter/src/table/builder.rs b/mini-lsm-starter/src/table/builder.rs index 3f1264fc6..4cb042a7a 100644 --- a/mini-lsm-starter/src/table/builder.rs +++ b/mini-lsm-starter/src/table/builder.rs @@ -1,13 +1,10 @@ -#![allow(unused_variables)] // TODO(you): remove this lint after implementing this mod -#![allow(dead_code)] // TODO(you): remove this lint after implementing this mod - use std::path::Path; use std::sync::Arc; use anyhow::Result; use bytes::BufMut; -use super::{BlockMeta, FileObject, SsTable}; +use super::{bloom::Bloom, BlockMeta, FileObject, SsTable}; use crate::{ block::BlockBuilder, key::{KeySlice, KeyVec}, @@ -22,14 +19,16 @@ pub struct SsTableBuilder { data: Vec, pub(crate) metas: Vec, block_size: usize, + key_hashes: Vec, } /* -------------------------------------------------------------------------------------------- -| Block Section | Meta Section | Extra | -------------------------------------------------------------------------------------------- -| data block | ... | data block | vec | meta block offset (u32) | -------------------------------------------------------------------------------------------- +----------------------------------------------------------------------------------------------------- +| Block Section | Meta Section | +----------------------------------------------------------------------------------------------------- +| data block | ... 
| data block | metadata | meta block offset | bloom filter | bloom filter offset | +| | varlen | u32 | varlen | u32 | +----------------------------------------------------------------------------------------------------- */ impl SsTableBuilder { /// Create a builder based on target block size. @@ -41,19 +40,21 @@ impl SsTableBuilder { data: Vec::new(), metas: Vec::new(), block_size, + key_hashes: Vec::new(), } } /// Adds a key-value pair to SSTable. /// - /// Note: You should split a new block when the current block is full.(`std::mem::replace` may - /// be helpful here) + /// we should split a new block when the current block is full. pub fn add(&mut self, key: KeySlice, value: &[u8]) { // if the first time add to this block, set the first key if self.first_key.is_empty() { self.first_key.set_from_slice(key); } + // add the key hash to the bloom filter + self.key_hashes.push(farmhash::fingerprint32(key.raw_ref())); // block builder returns false when the block is full. if self.block_builder.add(key, value) { self.last_key.set_from_slice(key); @@ -104,6 +105,15 @@ impl SsTableBuilder { BlockMeta::encode_block_metas(&self.metas, &mut buf); // extra info for the meta block offset buf.put_u32(block_meta_offset as u32); + // create bloom filter and encode it + let bits_per_key = Bloom::bloom_bits_per_key(self.key_hashes.len(), 0.01); + println!("encode bits per key: {}", bits_per_key); + let bloom = Bloom::build_from_key_hashes(&self.key_hashes, bits_per_key); + println!("encode bloom size: {}, k={}", bloom.filter.len(), bloom.k); + let bloom_offset = buf.len(); + bloom.encode(&mut buf); + buf.put_u32(bloom_offset as u32); + let file = FileObject::create(path.as_ref(), buf)?; let sst_table = SsTable { id, @@ -113,7 +123,7 @@ impl SsTableBuilder { block_metas: self.metas, block_meta_offset, block_cache, - bloom: None, + bloom: Some(bloom), max_ts: 0, }; Ok(sst_table) diff --git a/mini-lsm-starter/src/tests.rs b/mini-lsm-starter/src/tests.rs index 0afbac661..dd1f4a075 
100644 --- a/mini-lsm-starter/src/tests.rs +++ b/mini-lsm-starter/src/tests.rs @@ -8,3 +8,4 @@ mod week1_day3; mod week1_day4; mod week1_day5; mod week1_day6; +mod week1_day7; diff --git a/mini-lsm-starter/src/tests/harness.rs b/mini-lsm-starter/src/tests/harness.rs index 104cfdaec..4b0d7e30d 100644 --- a/mini-lsm-starter/src/tests/harness.rs +++ b/mini-lsm-starter/src/tests/harness.rs @@ -210,7 +210,7 @@ pub fn generate_sst_with_ts( builder.build(id, block_cache, path.as_ref()).unwrap() } -pub(crate) fn sync(storage: &LsmStorageInner) { +pub fn sync(storage: &LsmStorageInner) { storage .force_freeze_memtable(&storage.state_lock.lock()) .unwrap(); diff --git a/mini-lsm-starter/src/tests/week1_day7.rs b/mini-lsm-starter/src/tests/week1_day7.rs new file mode 100644 index 000000000..ce51a66d8 --- /dev/null +++ b/mini-lsm-starter/src/tests/week1_day7.rs @@ -0,0 +1,91 @@ +use tempfile::tempdir; + +use crate::{ + key::{KeySlice, TS_ENABLED}, + table::{bloom::Bloom, FileObject, SsTable, SsTableBuilder}, +}; + +fn key_of(idx: usize) -> Vec { + format!("key_{:010}", idx * 5).into_bytes() +} + +fn value_of(idx: usize) -> Vec { + format!("value_{:010}", idx).into_bytes() +} + +fn num_of_keys() -> usize { + 100 +} + +#[test] +fn test_task1_bloom_filter() { + let mut key_hashes = Vec::new(); + for idx in 0..num_of_keys() { + let key = key_of(idx); + key_hashes.push(farmhash::fingerprint32(&key)); + } + let bits_per_key = Bloom::bloom_bits_per_key(key_hashes.len(), 0.01); + println!("bits per key: {}", bits_per_key); + let bloom = Bloom::build_from_key_hashes(&key_hashes, bits_per_key); + println!("bloom size: {}, k={}", bloom.filter.len(), bloom.k); + assert!(bloom.k < 30); + for idx in 0..num_of_keys() { + let key = key_of(idx); + assert!(bloom.may_contain(farmhash::fingerprint32(&key))); + } + let mut x = 0; + let mut cnt = 0; + for idx in num_of_keys()..(num_of_keys() * 10) { + let key = key_of(idx); + if bloom.may_contain(farmhash::fingerprint32(&key)) { + x += 1; + } + cnt 
+= 1; + } + assert_ne!(x, cnt, "bloom filter not taking effect?"); + assert_ne!(x, 0, "bloom filter not taking effect?"); +} + +#[test] +fn test_task2_sst_decode() { + let mut builder = SsTableBuilder::new(128); + for idx in 0..num_of_keys() { + let key = key_of(idx); + let value = value_of(idx); + builder.add(KeySlice::for_testing_from_slice_no_ts(&key[..]), &value[..]); + } + let dir = tempdir().unwrap(); + let path = dir.path().join("1.sst"); + let sst = builder.build_for_test(&path).unwrap(); + let sst2 = SsTable::open(0, None, FileObject::open(&path).unwrap()).unwrap(); + let bloom_1 = sst.bloom.as_ref().unwrap(); + let bloom_2 = sst2.bloom.as_ref().unwrap(); + assert_eq!(bloom_1.k, bloom_2.k); + assert_eq!(bloom_1.filter, bloom_2.filter); +} + +#[test] +fn test_task3_block_key_compression() { + let mut builder = SsTableBuilder::new(128); + for idx in 0..num_of_keys() { + let key = key_of(idx); + let value = value_of(idx); + builder.add(KeySlice::for_testing_from_slice_no_ts(&key[..]), &value[..]); + } + let dir = tempdir().unwrap(); + let path = dir.path().join("1.sst"); + let sst = builder.build_for_test(path).unwrap(); + if TS_ENABLED { + assert!( + sst.block_metas.len() <= 34, + "you have {} blocks, expect 34", + sst.block_metas.len() + ); + } else { + assert!( + sst.block_metas.len() <= 25, + "you have {} blocks, expect 25", + sst.block_metas.len() + ); + } +} From f197e02ddbb10e80d23524e44589798529621b3f Mon Sep 17 00:00:00 2001 From: husharp Date: Sat, 10 Feb 2024 12:22:51 +0800 Subject: [PATCH 08/22] week2_day1 full compact Signed-off-by: husharp --- mini-lsm-starter/src/compact.rs | 130 ++++++++- .../src/iterators/concat_iterator.rs | 96 ++++++- mini-lsm-starter/src/lsm_iterator.rs | 9 +- mini-lsm-starter/src/lsm_storage.rs | 61 ++++- mini-lsm-starter/src/tests.rs | 1 + mini-lsm-starter/src/tests/harness.rs | 2 + mini-lsm-starter/src/tests/week2_day1.rs | 252 ++++++++++++++++++ 7 files changed, 524 insertions(+), 27 deletions(-) create mode 100644 
mini-lsm-starter/src/tests/week2_day1.rs diff --git a/mini-lsm-starter/src/compact.rs b/mini-lsm-starter/src/compact.rs index faa9f6b86..8ed0df27a 100644 --- a/mini-lsm-starter/src/compact.rs +++ b/mini-lsm-starter/src/compact.rs @@ -4,6 +4,7 @@ mod leveled; mod simple_leveled; mod tiered; +use std::collections::HashSet; use std::sync::Arc; use std::time::Duration; @@ -15,8 +16,12 @@ pub use simple_leveled::{ }; pub use tiered::{TieredCompactionController, TieredCompactionOptions, TieredCompactionTask}; +use crate::iterators::merge_iterator::MergeIterator; +use crate::iterators::two_merge_iterator::TwoMergeIterator; +use crate::iterators::StorageIterator; +use crate::lsm_iterator::FusedIterator; use crate::lsm_storage::{LsmStorageInner, LsmStorageState}; -use crate::table::SsTable; +use crate::table::{SsTable, SsTableBuilder, SsTableIterator}; #[derive(Debug, Serialize, Deserialize)] pub enum CompactionTask { @@ -107,12 +112,129 @@ pub enum CompactionOptions { } impl LsmStorageInner { - fn compact(&self, _task: &CompactionTask) -> Result>> { - unimplemented!() + fn compact(&self, task: &CompactionTask) -> Result>> { + let snapshot = { + let state = self.state.read(); + state.clone() + }; + + let mut iter = match task { + CompactionTask::ForceFullCompaction { + l0_sstables, + l1_sstables, + } => { + // create l0_sstables + let mut table_iters = Vec::with_capacity(l0_sstables.len()); + for sst_id in l0_sstables.iter() { + let sst = snapshot.sstables.get(sst_id).unwrap(); + let iter = SsTableIterator::create_and_seek_to_first(sst.clone())?; + table_iters.push(Box::new(iter)); + } + let merge_l0_sstable_iter = MergeIterator::create(table_iters); + + // create l1_sstables + let mut table_iters = Vec::with_capacity(l1_sstables.len()); + for sst_id in l1_sstables.iter() { + let sst = snapshot.sstables.get(sst_id).unwrap(); + let iter = SsTableIterator::create_and_seek_to_first(sst.clone())?; + table_iters.push(Box::new(iter)); + } + let merge_l1_sstable_iter = 
MergeIterator::create(table_iters); + + // merge l0_sstables and l1_sstables + let merge_iter = + TwoMergeIterator::create(merge_l0_sstable_iter, merge_l1_sstable_iter)?; + FusedIterator::new(merge_iter) + } + _ => unimplemented!(), + }; + + let mut new_ssts = vec![]; + // compact the iterators + let mut builder = None; + while iter.is_valid() { + if builder.is_none() { + builder = Some(SsTableBuilder::new(self.options.target_sst_size)); + } + let builder_inner = builder.as_mut().unwrap(); + if !iter.value().is_empty() { + println!("[compact] key: {:?}, value: {:?}", iter.key(), iter.value()); + builder_inner.add(iter.key(), iter.value()); + } + iter.next()?; + + if builder_inner.estimated_size() > self.options.target_sst_size { + let sst_id = self.next_sst_id(); + let builder = builder.take().unwrap(); + let new_sst = Arc::new(builder.build( + sst_id, + Some(self.block_cache.clone()), + self.path_of_sst(sst_id), + )?); + new_ssts.push(new_sst); + } + } + + // put last sst if exists builder + if let Some(builder) = builder { + let sst_id = self.next_sst_id(); // lock dropped here + let sst = Arc::new(builder.build( + sst_id, + Some(self.block_cache.clone()), + self.path_of_sst(sst_id), + )?); + new_ssts.push(sst); + } + Ok(new_ssts) } pub fn force_full_compaction(&self) -> Result<()> { - unimplemented!() + let snapshot = { + let state = self.state.read(); + state.clone() + }; + + let l0_sstables = snapshot.l0_sstables.clone(); + let l1_sstables = snapshot.levels[0].1.clone(); + // compact the l0_sstables and l1_sstables to get compacted SSTs + let new_ssts = self.compact(&CompactionTask::ForceFullCompaction { + l0_sstables: l0_sstables.clone(), + l1_sstables: l1_sstables.clone(), + })?; + + // update the state + { + let _state_lock = self.state_lock.lock(); + let mut state = self.state.read().as_ref().clone(); + + // remove all participants of the compaction from the state + for sst in l0_sstables.iter().chain(l1_sstables.iter()) { + state.sstables.remove(sst); + } 
+ + // remove old l0_sstables from the state + let mut l0_sstables_map = l0_sstables.iter().copied().collect::>(); + state.l0_sstables = state + .l0_sstables + .iter() + .filter(|x| !l0_sstables_map.remove(x)) + .copied() + .collect::>(); + assert!(l0_sstables_map.is_empty()); + + let ids = new_ssts.iter().map(|x| x.sst_id()).collect::>(); + state.levels[0].1 = ids; + // insert new SSTs to sstables + for sst in new_ssts.iter() { + state.sstables.insert(sst.sst_id(), sst.clone()); + } + *self.state.write() = Arc::new(state); + }; + + for sst in l0_sstables.iter().chain(l1_sstables.iter()) { + std::fs::remove_file(self.path_of_sst(*sst))?; + } + Ok(()) } fn trigger_compaction(&self) -> Result<()> { diff --git a/mini-lsm-starter/src/iterators/concat_iterator.rs b/mini-lsm-starter/src/iterators/concat_iterator.rs index 8cef71528..c010dee55 100644 --- a/mini-lsm-starter/src/iterators/concat_iterator.rs +++ b/mini-lsm-starter/src/iterators/concat_iterator.rs @@ -3,7 +3,7 @@ use std::sync::Arc; -use anyhow::Result; +use anyhow::{Ok, Result}; use super::StorageIterator; use crate::{ @@ -19,13 +19,85 @@ pub struct SstConcatIterator { sstables: Vec>, } +// check ssts vaild +fn check_valid(sstables: &[Arc]) { + for sst in sstables { + assert!(sst.first_key() <= sst.last_key()); + } + if !sstables.is_empty() { + for i in 1..sstables.len() { + assert!(sstables[i - 1].last_key() < sstables[i].first_key()); + } + } +} + impl SstConcatIterator { pub fn create_and_seek_to_first(sstables: Vec>) -> Result { - unimplemented!() + if sstables.is_empty() { + return Ok(Self { + current: None, + next_sst_idx: 0, + sstables, + }); + } + check_valid(&sstables); + let mut iter = Self { + current: Some(SsTableIterator::create_and_seek_to_first( + sstables[0].clone(), + )?), + next_sst_idx: 1, + sstables, + }; + + iter.move_iter_until_valid()?; + Ok(iter) } pub fn create_and_seek_to_key(sstables: Vec>, key: KeySlice) -> Result { - unimplemented!() + if sstables.is_empty() { + return Ok(Self { + 
current: None, + next_sst_idx: 0, + sstables, + }); + } + check_valid(&sstables); + // get key from sstables + let idx: usize = sstables + .partition_point(|table| table.first_key().as_key_slice() <= key) + .saturating_sub(1); + + let mut iter = Self { + current: Some(SsTableIterator::create_and_seek_to_key( + sstables[idx].clone(), + key, + )?), + next_sst_idx: idx + 1, + sstables, + }; + iter.move_iter_until_valid()?; + Ok(iter) + } + + fn move_iter_until_valid(&mut self) -> Result<()> { + // check if the current iterator is valid + loop { + if let Some(current) = &self.current { + if current.is_valid() { + return Ok(()); + } + if self.next_sst_idx >= self.sstables.len() { + self.current = None; + return Ok(()); + } + self.current = Some(SsTableIterator::create_and_seek_to_first( + self.sstables[self.next_sst_idx].clone(), + )?); + self.next_sst_idx += 1; + } else { + return Ok(()); + } + } } } @@ -33,19 +105,29 @@ impl StorageIterator for SstConcatIterator { type KeyType<'a> = KeySlice<'a>; fn key(&self) -> KeySlice { - unimplemented!() + self.current.as_ref().unwrap().key() } fn value(&self) -> &[u8] { - unimplemented!() + self.current.as_ref().unwrap().value() } fn is_valid(&self) -> bool { - unimplemented!() + if let Some(current) = &self.current { + assert!(current.is_valid()); + true + } else { + false + } } fn next(&mut self) -> Result<()> { - unimplemented!() + if self.current.is_none() { + return Ok(()); + } + self.current.as_mut().unwrap().next()?; + self.move_iter_until_valid()?; + Ok(()) } fn num_active_iterators(&self) -> usize { diff --git a/mini-lsm-starter/src/lsm_iterator.rs b/mini-lsm-starter/src/lsm_iterator.rs index 8b51e9062..236c8f349 100644 --- a/mini-lsm-starter/src/lsm_iterator.rs +++ b/mini-lsm-starter/src/lsm_iterator.rs @@ -5,7 +5,8 @@ use bytes::Bytes; use crate::{ iterators::{ - merge_iterator::MergeIterator, two_merge_iterator::TwoMergeIterator, StorageIterator, + concat_iterator::SstConcatIterator, merge_iterator::MergeIterator, + 
two_merge_iterator::TwoMergeIterator, StorageIterator, }, mem_table::MemTableIterator, table::SsTableIterator, @@ -13,8 +14,10 @@ use crate::{ /// Represents the internal type for an LSM iterator. This type will be changed across the tutorial for multiple times. /// choosing memtable firstly -type LsmIteratorInner = - TwoMergeIterator, MergeIterator>; +type LsmIteratorInner = TwoMergeIterator< + TwoMergeIterator, MergeIterator>, + SstConcatIterator, +>; pub struct LsmIterator { inner: LsmIteratorInner, diff --git a/mini-lsm-starter/src/lsm_storage.rs b/mini-lsm-starter/src/lsm_storage.rs index 639339e28..cb207f8f1 100644 --- a/mini-lsm-starter/src/lsm_storage.rs +++ b/mini-lsm-starter/src/lsm_storage.rs @@ -15,6 +15,7 @@ use crate::compact::{ CompactionController, CompactionOptions, LeveledCompactionController, LeveledCompactionOptions, SimpleLeveledCompactionController, SimpleLeveledCompactionOptions, TieredCompactionController, }; +use crate::iterators::concat_iterator::SstConcatIterator; use crate::iterators::merge_iterator::MergeIterator; use crate::iterators::two_merge_iterator::TwoMergeIterator; use crate::iterators::StorageIterator; @@ -385,7 +386,7 @@ impl LsmStorageInner { } // create merge iterator for l0_sstables - let mut table_iters = Vec::with_capacity(snapshot.l0_sstables.len()); + let mut l0_iters = Vec::with_capacity(snapshot.l0_sstables.len()); for sst_id in snapshot.l0_sstables.iter() { let sst = snapshot.sstables.get(sst_id).unwrap(); // check if the key is within the SST's key range @@ -397,24 +398,34 @@ impl LsmStorageInner { sst.clone(), KeySlice::from_slice(key), )?; - table_iters.push(Box::new(iter)); + l0_iters.push(Box::new(iter)); } } else { let iter = SsTableIterator::create_and_seek_to_key( sst.clone(), KeySlice::from_slice(key), )?; - table_iters.push(Box::new(iter)); + l0_iters.push(Box::new(iter)); } } } + let merge_l0_sstable_iter = MergeIterator::create(l0_iters); - let merge_l0_sstable_iter = MergeIterator::create(table_iters); - 
if merge_l0_sstable_iter.is_valid() - && merge_l0_sstable_iter.key() == KeySlice::from_slice(key) - && !merge_l0_sstable_iter.value().is_empty() + // create merge iterator for l1_sstables + let mut l1_ssts = Vec::with_capacity(snapshot.levels[0].1.len()); + for sst_id in snapshot.levels[0].1.iter() { + l1_ssts.push(snapshot.sstables[sst_id].clone()); + } + let l1_concat_iter = + SstConcatIterator::create_and_seek_to_key(l1_ssts, KeySlice::from_slice(key))?; + + let two_merge_iterator = TwoMergeIterator::create(merge_l0_sstable_iter, l1_concat_iter)?; + + if two_merge_iterator.is_valid() + && two_merge_iterator.key() == KeySlice::from_slice(key) + && !two_merge_iterator.value().is_empty() { - return Ok(Some(Bytes::copy_from_slice(merge_l0_sstable_iter.value()))); + return Ok(Some(Bytes::copy_from_slice(two_merge_iterator.value()))); } Ok(None) @@ -437,11 +448,11 @@ impl LsmStorageInner { for imm_memtable in snapshot.imm_memtables.iter() { memtable_iters.push(Box::new(imm_memtable.scan(lower, upper))); } - // using merge iterator to merge all iterators + // using merge iterator to merge all memtables and imm_memtables iters let merge_memtable_iter = MergeIterator::create(memtable_iters); - // using merge iterator to merge all sstables - let mut table_iters = Vec::with_capacity(snapshot.l0_sstables.len()); + // using merge iterator to merge all sstables iters + let mut l0_iters = Vec::with_capacity(snapshot.l0_sstables.len()); for sst_id in snapshot.l0_sstables.iter() { let sst = snapshot.sstables[sst_id].clone(); println!("sst_id: {}, scan range: {:?} {:?}", sst_id, lower, upper); @@ -471,12 +482,36 @@ impl LsmStorageInner { Bound::Unbounded => SsTableIterator::create_and_seek_to_first(sst)?, }; - table_iters.push(Box::new(iter)); + l0_iters.push(Box::new(iter)); } } - let merge_l0_sstable_iter = MergeIterator::create(table_iters); + let merge_l0_sstable_iter = MergeIterator::create(l0_iters); + + // concat l1 sstables + let mut l1_ssts = 
Vec::with_capacity(snapshot.levels[0].1.len()); + for sst_id in snapshot.levels[0].1.iter() { + l1_ssts.push(snapshot.sstables[sst_id].clone()); + } + let l1_concat_iter = match lower { + Bound::Included(key) => { + SstConcatIterator::create_and_seek_to_key(l1_ssts, KeySlice::from_slice(key))? + } + Bound::Excluded(key) => { + let mut iter = + SstConcatIterator::create_and_seek_to_key(l1_ssts, KeySlice::from_slice(key))?; + if iter.is_valid() && iter.key() == KeySlice::from_slice(key) { + iter.next()?; + } + iter + } + Bound::Unbounded => SstConcatIterator::create_and_seek_to_first(l1_ssts)?, + }; + // memtables and imm_memtables are merged first, then the result is merged with L0 SSTs let two_merge_iter = TwoMergeIterator::create(merge_memtable_iter, merge_l0_sstable_iter)?; + // finally, the result is merged with L1 SSTs + let two_merge_iter = TwoMergeIterator::create(two_merge_iter, l1_concat_iter)?; + Ok(FusedIterator::new(LsmIterator::new( two_merge_iter, map_bound(upper), diff --git a/mini-lsm-starter/src/tests.rs b/mini-lsm-starter/src/tests.rs index dd1f4a075..738e5a94e 100644 --- a/mini-lsm-starter/src/tests.rs +++ b/mini-lsm-starter/src/tests.rs @@ -9,3 +9,4 @@ mod week1_day4; mod week1_day5; mod week1_day6; mod week1_day7; +mod week2_day1; diff --git a/mini-lsm-starter/src/tests/harness.rs b/mini-lsm-starter/src/tests/harness.rs index 4b0d7e30d..9f81c3eae 100644 --- a/mini-lsm-starter/src/tests/harness.rs +++ b/mini-lsm-starter/src/tests/harness.rs @@ -1,3 +1,5 @@ +#![allow(dead_code)] + use std::{ collections::BTreeMap, ops::Bound, os::unix::fs::MetadataExt, path::Path, sync::Arc, time::Duration, diff --git a/mini-lsm-starter/src/tests/week2_day1.rs b/mini-lsm-starter/src/tests/week2_day1.rs new file mode 100644 index 000000000..b681c1b41 --- /dev/null +++ b/mini-lsm-starter/src/tests/week2_day1.rs @@ -0,0 +1,252 @@ +use std::{ops::Bound, path::Path, sync::Arc}; + +use self::harness::{check_iter_result_by_key, check_lsm_iter_result_by_key, sync}; +use 
bytes::Bytes; +use tempfile::tempdir; +use week2_day1::harness::construct_merge_iterator_over_storage; + +use super::*; +use crate::{ + iterators::{concat_iterator::SstConcatIterator, StorageIterator}, + key::{KeySlice, TS_ENABLED}, + lsm_storage::{LsmStorageInner, LsmStorageOptions}, + table::{SsTable, SsTableBuilder}, +}; + +#[test] +fn test_task1_full_compaction() { + // We do not use LSM iterator in this test because it's implemented as part of task 3 + let dir = tempdir().unwrap(); + let storage = + Arc::new(LsmStorageInner::open(&dir, LsmStorageOptions::default_for_week1_test()).unwrap()); + #[allow(clippy::let_unit_value)] + let _txn = storage.new_txn().unwrap(); + storage.put(b"0", b"v1").unwrap(); + sync(&storage); + storage.put(b"0", b"v2").unwrap(); + storage.put(b"1", b"v2").unwrap(); + storage.put(b"2", b"v2").unwrap(); + sync(&storage); + storage.delete(b"0").unwrap(); + storage.delete(b"2").unwrap(); + sync(&storage); + assert_eq!(storage.state.read().l0_sstables.len(), 3); + let mut iter = construct_merge_iterator_over_storage(&storage.state.read()); + if TS_ENABLED { + check_iter_result_by_key( + &mut iter, + vec![ + (Bytes::from_static(b"0"), Bytes::from_static(b"")), + (Bytes::from_static(b"0"), Bytes::from_static(b"v2")), + (Bytes::from_static(b"0"), Bytes::from_static(b"v1")), + (Bytes::from_static(b"1"), Bytes::from_static(b"v2")), + (Bytes::from_static(b"2"), Bytes::from_static(b"")), + (Bytes::from_static(b"2"), Bytes::from_static(b"v2")), + ], + ); + } else { + check_iter_result_by_key( + &mut iter, + vec![ + (Bytes::from_static(b"0"), Bytes::from_static(b"")), + (Bytes::from_static(b"1"), Bytes::from_static(b"v2")), + (Bytes::from_static(b"2"), Bytes::from_static(b"")), + ], + ); + } + storage.force_full_compaction().unwrap(); + storage.dump_structure(); + assert!(storage.state.read().l0_sstables.is_empty()); + let mut iter = construct_merge_iterator_over_storage(&storage.state.read()); + if TS_ENABLED { +
check_iter_result_by_key( + &mut iter, + vec![ + (Bytes::from_static(b"0"), Bytes::from_static(b"")), + (Bytes::from_static(b"0"), Bytes::from_static(b"v2")), + (Bytes::from_static(b"0"), Bytes::from_static(b"v1")), + (Bytes::from_static(b"1"), Bytes::from_static(b"v2")), + (Bytes::from_static(b"2"), Bytes::from_static(b"")), + (Bytes::from_static(b"2"), Bytes::from_static(b"v2")), + ], + ); + } else { + check_iter_result_by_key( + &mut iter, + vec![(Bytes::from_static(b"1"), Bytes::from_static(b"v2"))], + ); + } + storage.put(b"0", b"v3").unwrap(); + storage.put(b"2", b"v3").unwrap(); + sync(&storage); + storage.delete(b"1").unwrap(); + sync(&storage); + let mut iter = construct_merge_iterator_over_storage(&storage.state.read()); + if TS_ENABLED { + check_iter_result_by_key( + &mut iter, + vec![ + (Bytes::from_static(b"0"), Bytes::from_static(b"v3")), + (Bytes::from_static(b"0"), Bytes::from_static(b"")), + (Bytes::from_static(b"0"), Bytes::from_static(b"v2")), + (Bytes::from_static(b"0"), Bytes::from_static(b"v1")), + (Bytes::from_static(b"1"), Bytes::from_static(b"")), + (Bytes::from_static(b"1"), Bytes::from_static(b"v2")), + (Bytes::from_static(b"2"), Bytes::from_static(b"v3")), + (Bytes::from_static(b"2"), Bytes::from_static(b"")), + (Bytes::from_static(b"2"), Bytes::from_static(b"v2")), + ], + ); + } else { + check_iter_result_by_key( + &mut iter, + vec![ + (Bytes::from_static(b"0"), Bytes::from_static(b"v3")), + (Bytes::from_static(b"1"), Bytes::from_static(b"")), + (Bytes::from_static(b"2"), Bytes::from_static(b"v3")), + ], + ); + } + storage.force_full_compaction().unwrap(); + assert!(storage.state.read().l0_sstables.is_empty()); + let mut iter = construct_merge_iterator_over_storage(&storage.state.read()); + if TS_ENABLED { + check_iter_result_by_key( + &mut iter, + vec![ + (Bytes::from_static(b"0"), Bytes::from_static(b"v3")), + (Bytes::from_static(b"0"), Bytes::from_static(b"")), + (Bytes::from_static(b"0"), Bytes::from_static(b"v2")), + 
(Bytes::from_static(b"0"), Bytes::from_static(b"v1")), + (Bytes::from_static(b"1"), Bytes::from_static(b"")), + (Bytes::from_static(b"1"), Bytes::from_static(b"v2")), + (Bytes::from_static(b"2"), Bytes::from_static(b"v3")), + (Bytes::from_static(b"2"), Bytes::from_static(b"")), + (Bytes::from_static(b"2"), Bytes::from_static(b"v2")), + ], + ); + } else { + check_iter_result_by_key( + &mut iter, + vec![ + (Bytes::from_static(b"0"), Bytes::from_static(b"v3")), + (Bytes::from_static(b"2"), Bytes::from_static(b"v3")), + ], + ); + } +} + +fn generate_concat_sst( + start_key: usize, + end_key: usize, + dir: impl AsRef, + id: usize, +) -> SsTable { + let mut builder = SsTableBuilder::new(128); + for idx in start_key..end_key { + let key = format!("{:05}", idx); + builder.add( + KeySlice::for_testing_from_slice_no_ts(key.as_bytes()), + b"test", + ); + } + let path = dir.as_ref().join(format!("{id}.sst")); + builder.build_for_test(path).unwrap() +} + +#[test] +fn test_task2_concat_iterator() { + let dir = tempdir().unwrap(); + let mut sstables = Vec::new(); + for i in 1..=10 { + sstables.push(Arc::new(generate_concat_sst( + i * 10, + (i + 1) * 10, + dir.path(), + i, + ))); + } + for key in 0..120 { + let iter = SstConcatIterator::create_and_seek_to_key( + sstables.clone(), + KeySlice::for_testing_from_slice_no_ts(format!("{:05}", key).as_bytes()), + ) + .unwrap(); + if key < 10 { + assert!(iter.is_valid()); + assert_eq!(iter.key().for_testing_key_ref(), b"00010"); + } else if key >= 110 { + assert!(!iter.is_valid()); + } else { + assert!(iter.is_valid()); + assert_eq!( + iter.key().for_testing_key_ref(), + format!("{:05}", key).as_bytes() + ); + } + } + let iter = SstConcatIterator::create_and_seek_to_first(sstables.clone()).unwrap(); + assert!(iter.is_valid()); + assert_eq!(iter.key().for_testing_key_ref(), b"00010"); +} + +#[test] +fn test_task3_integration() { + let dir = tempdir().unwrap(); + let storage = + Arc::new(LsmStorageInner::open(&dir, 
LsmStorageOptions::default_for_week1_test()).unwrap()); + storage.put(b"0", b"2333333").unwrap(); + storage.put(b"00", b"2333333").unwrap(); + storage.put(b"4", b"23").unwrap(); + sync(&storage); + + storage.delete(b"4").unwrap(); + sync(&storage); + + storage.force_full_compaction().unwrap(); + assert!(storage.state.read().l0_sstables.is_empty()); + assert!(!storage.state.read().levels[0].1.is_empty()); + + storage.put(b"1", b"233").unwrap(); + storage.put(b"2", b"2333").unwrap(); + sync(&storage); + + storage.put(b"00", b"2333").unwrap(); + storage.put(b"3", b"23333").unwrap(); + storage.delete(b"1").unwrap(); + sync(&storage); + storage.force_full_compaction().unwrap(); + + assert!(storage.state.read().l0_sstables.is_empty()); + assert!(!storage.state.read().levels[0].1.is_empty()); + + check_lsm_iter_result_by_key( + &mut storage.scan(Bound::Unbounded, Bound::Unbounded).unwrap(), + vec![ + (Bytes::from("0"), Bytes::from("2333333")), + (Bytes::from("00"), Bytes::from("2333")), + (Bytes::from("2"), Bytes::from("2333")), + (Bytes::from("3"), Bytes::from("23333")), + ], + ); + + assert_eq!( + storage.get(b"0").unwrap(), + Some(Bytes::from_static(b"2333333")) + ); + assert_eq!( + storage.get(b"00").unwrap(), + Some(Bytes::from_static(b"2333")) + ); + assert_eq!( + storage.get(b"2").unwrap(), + Some(Bytes::from_static(b"2333")) + ); + assert_eq!( + storage.get(b"3").unwrap(), + Some(Bytes::from_static(b"23333")) + ); + assert_eq!(storage.get(b"4").unwrap(), None); + assert_eq!(storage.get(b"--").unwrap(), None); + assert_eq!(storage.get(b"555").unwrap(), None); +} From 4914f45e54c0477e6b41565af9e8574074a8dce3 Mon Sep 17 00:00:00 2001 From: husharp Date: Mon, 12 Feb 2024 11:30:49 +0800 Subject: [PATCH 09/22] week2_day1 simple compact Signed-off-by: husharp --- .../src/bin/compaction-simulator.rs | 2 +- mini-lsm-starter/src/compact.rs | 197 ++++++++++++++---- .../src/compact/simple_leveled.rs | 84 +++++++- mini-lsm-starter/src/debug.rs | 9 + 
mini-lsm-starter/src/lsm_iterator.rs | 2 +- mini-lsm-starter/src/lsm_storage.rs | 103 +++++---- mini-lsm-starter/src/mem_table.rs | 1 + mini-lsm-starter/src/table/builder.rs | 2 - mini-lsm-starter/src/tests.rs | 1 + mini-lsm-starter/src/tests/harness.rs | 9 +- mini-lsm-starter/src/tests/week2_day2.rs | 27 +++ mini-lsm-starter/src/wal.rs | 1 + 12 files changed, 336 insertions(+), 102 deletions(-) create mode 100644 mini-lsm-starter/src/tests/week2_day2.rs diff --git a/mini-lsm-starter/src/bin/compaction-simulator.rs b/mini-lsm-starter/src/bin/compaction-simulator.rs index 18eba53d4..978fedeca 100644 --- a/mini-lsm-starter/src/bin/compaction-simulator.rs +++ b/mini-lsm-starter/src/bin/compaction-simulator.rs @@ -289,7 +289,7 @@ fn main() { storage.snapshot = snapshot; storage.remove(&del); println!("--- After Compaction ---"); - if dump_real_id { + if !dump_real_id { storage.dump_real_id(true, false); } else { storage.dump_original_id(true, false); diff --git a/mini-lsm-starter/src/compact.rs b/mini-lsm-starter/src/compact.rs index 8ed0df27a..1643eb887 100644 --- a/mini-lsm-starter/src/compact.rs +++ b/mini-lsm-starter/src/compact.rs @@ -8,7 +8,7 @@ use std::collections::HashSet; use std::sync::Arc; use std::time::Duration; -use anyhow::Result; +use anyhow::{Ok, Result}; pub use leveled::{LeveledCompactionController, LeveledCompactionOptions, LeveledCompactionTask}; use serde::{Deserialize, Serialize}; pub use simple_leveled::{ @@ -16,9 +16,11 @@ pub use simple_leveled::{ }; pub use tiered::{TieredCompactionController, TieredCompactionOptions, TieredCompactionTask}; +use crate::iterators::concat_iterator::SstConcatIterator; use crate::iterators::merge_iterator::MergeIterator; use crate::iterators::two_merge_iterator::TwoMergeIterator; use crate::iterators::StorageIterator; +use crate::key::KeySlice; use crate::lsm_iterator::FusedIterator; use crate::lsm_storage::{LsmStorageInner, LsmStorageState}; use crate::table::{SsTable, SsTableBuilder, SsTableIterator}; @@ 
-112,58 +114,31 @@ pub enum CompactionOptions { } impl LsmStorageInner { - fn compact(&self, task: &CompactionTask) -> Result>> { - let snapshot = { - let state = self.state.read(); - state.clone() - }; - - let mut iter = match task { - CompactionTask::ForceFullCompaction { - l0_sstables, - l1_sstables, - } => { - // create l0_sstables - let mut table_iters = Vec::with_capacity(l0_sstables.len()); - for sst_id in l0_sstables.iter() { - let sst = snapshot.sstables.get(sst_id).unwrap(); - let iter = SsTableIterator::create_and_seek_to_first(sst.clone())?; - table_iters.push(Box::new(iter)); - } - let merge_l0_sstable_iter = MergeIterator::create(table_iters); - - // create l1_sstables - let mut table_iters = Vec::with_capacity(l1_sstables.len()); - for sst_id in l1_sstables.iter() { - let sst = snapshot.sstables.get(sst_id).unwrap(); - let iter = SsTableIterator::create_and_seek_to_first(sst.clone())?; - table_iters.push(Box::new(iter)); - } - let merge_l1_sstable_iter = MergeIterator::create(table_iters); - - // merge l0_sstables and l1_sstables - let merge_iter = - TwoMergeIterator::create(merge_l0_sstable_iter, merge_l1_sstable_iter)?; - FusedIterator::new(merge_iter) - } - _ => unimplemented!(), - }; - - let mut new_ssts = vec![]; + fn compact_generate_sst_from_iter( + &self, + mut iter: impl for<'a> StorageIterator = KeySlice<'a>>, + compact_to_bottom_level: bool, + ) -> Result>> { + let mut new_ssts = Vec::new(); // compact the iterators let mut builder = None; while iter.is_valid() { if builder.is_none() { - builder = Some(SsTableBuilder::new(self.options.target_sst_size)); + builder = Some(SsTableBuilder::new(self.options.block_size)); } let builder_inner = builder.as_mut().unwrap(); - if !iter.value().is_empty() { - println!("[compact] key: {:?}, value: {:?}", iter.key(), iter.value()); + if compact_to_bottom_level { + if !iter.value().is_empty() { + builder_inner.add(iter.key(), iter.value()); + } + } else { builder_inner.add(iter.key(), iter.value()); } + 
iter.next()?; - if builder_inner.estimated_size() > self.options.target_sst_size { + if builder_inner.estimated_size() >= self.options.target_sst_size { + println!("compact_generate_sst_from_iter"); let sst_id = self.next_sst_id(); let builder = builder.take().unwrap(); let new_sst = Arc::new(builder.build( @@ -177,6 +152,7 @@ impl LsmStorageInner { // put last sst if exists builder if let Some(builder) = builder { + println!("compact_generate_sst_from_iter put last"); let sst_id = self.next_sst_id(); // lock dropped here let sst = Arc::new(builder.build( sst_id, @@ -188,6 +164,86 @@ impl LsmStorageInner { Ok(new_ssts) } + fn compact(&self, task: &CompactionTask) -> Result>> { + let snapshot = { + let state = self.state.read(); + state.clone() + }; + + match task { + CompactionTask::ForceFullCompaction { + l0_sstables, + l1_sstables, + } => { + // create l0_sstables + let mut l0_iters = Vec::with_capacity(l0_sstables.len()); + for sst_id in l0_sstables.iter() { + let sst = snapshot.sstables.get(sst_id).unwrap(); + let iter = SsTableIterator::create_and_seek_to_first(sst.clone())?; + l0_iters.push(Box::new(iter)); + } + + // create l1_sstables + let mut l1_iters = Vec::with_capacity(l1_sstables.len()); + for sst_id in l1_sstables.iter() { + let sst = snapshot.sstables.get(sst_id).unwrap(); + l1_iters.push(sst.clone()); + } + + // merge l0_sstables and l1_sstables + let iter = FusedIterator::new(TwoMergeIterator::create( + MergeIterator::create(l0_iters), + SstConcatIterator::create_and_seek_to_first(l1_iters)?, + )?); + self.compact_generate_sst_from_iter(iter, task.compact_to_bottom_level()) + } + CompactionTask::Simple(SimpleLeveledCompactionTask { + upper_level, + upper_level_sst_ids, + lower_level_sst_ids, + .. 
+ }) => { + match upper_level { + Some(_) => { + // create iterators for upper and lower level sstables + let mut upper_ssts = Vec::with_capacity(upper_level_sst_ids.len()); + for id in upper_level_sst_ids.iter() { + upper_ssts.push(snapshot.sstables.get(id).unwrap().clone()); + } + let upper_iter = SstConcatIterator::create_and_seek_to_first(upper_ssts)?; + let mut lower_ssts = Vec::with_capacity(upper_level_sst_ids.len()); + for id in lower_level_sst_ids.iter() { + lower_ssts.push(snapshot.sstables.get(id).unwrap().clone()); + } + let lower_iter = SstConcatIterator::create_and_seek_to_first(lower_ssts)?; + let iter = TwoMergeIterator::create(upper_iter, lower_iter)?; + self.compact_generate_sst_from_iter(iter, task.compact_to_bottom_level()) + } + // because it is L0 compaction, we can not use concat iterator which is for ordered sstables + None => { + // create iterators for upper and lower level sstables + let mut upper_iters = Vec::with_capacity(upper_level_sst_ids.len()); + for id in upper_level_sst_ids.iter() { + let iter = SsTableIterator::create_and_seek_to_first( + snapshot.sstables.get(id).unwrap().clone(), + )?; + upper_iters.push(Box::new(iter)); + } + let upper_merge_iter = MergeIterator::create(upper_iters); + let mut lower_ssts = Vec::with_capacity(upper_level_sst_ids.len()); + for id in lower_level_sst_ids.iter() { + lower_ssts.push(snapshot.sstables.get(id).unwrap().clone()); + } + let lower_iter = SstConcatIterator::create_and_seek_to_first(lower_ssts)?; + let iter = TwoMergeIterator::create(upper_merge_iter, lower_iter)?; + self.compact_generate_sst_from_iter(iter, task.compact_to_bottom_level()) + } + } + } + _ => unimplemented!(), + } + } + pub fn force_full_compaction(&self) -> Result<()> { let snapshot = { let state = self.state.read(); @@ -197,12 +253,17 @@ impl LsmStorageInner { let l0_sstables = snapshot.l0_sstables.clone(); let l1_sstables = snapshot.levels[0].1.clone(); // compact the l0_sstables and l1_sstables to get compacted SSTs + 
println!( + "force full compaction with l0_sstables: {:?}, l1_sstables: {:?}", + l0_sstables, l1_sstables + ); let new_ssts = self.compact(&CompactionTask::ForceFullCompaction { l0_sstables: l0_sstables.clone(), l1_sstables: l1_sstables.clone(), })?; // update the state + let ids; { let _state_lock = self.state_lock.lock(); let mut state = self.state.read().as_ref().clone(); @@ -222,8 +283,8 @@ impl LsmStorageInner { .collect::>(); assert!(l0_sstables_map.is_empty()); - let ids = new_ssts.iter().map(|x| x.sst_id()).collect::>(); - state.levels[0].1 = ids; + ids = new_ssts.iter().map(|x| x.sst_id()).collect::>(); + state.levels[0].1 = ids.clone(); // insert new SSTs to sstables for sst in new_ssts.iter() { state.sstables.insert(sst.sst_id(), sst.clone()); @@ -234,11 +295,55 @@ impl LsmStorageInner { for sst in l0_sstables.iter().chain(l1_sstables.iter()) { std::fs::remove_file(self.path_of_sst(*sst))?; } + println!("force full compaction done, new SSTs: {:?}", ids); Ok(()) } fn trigger_compaction(&self) -> Result<()> { - unimplemented!() + let snapshot = { + let state = self.state.read(); + state.clone() + }; + + let task = self + .compaction_controller + .generate_compaction_task(&snapshot); + if let Some(task) = task { + self.dump_structure(); + println!("running compaction task: {:?}", task); + let new_ssts = self.compact(&task)?; + let output = new_ssts.iter().map(|x| x.sst_id()).collect::>(); + let mut snapshot = self.state.read().as_ref().clone(); + // insert new SSTs to sstables + for ssts_to_add in new_ssts { + let result = snapshot.sstables.insert(ssts_to_add.sst_id(), ssts_to_add); + assert!(result.is_none()); + } + let (mut new_snapshot, files_to_remove) = self + .compaction_controller + .apply_compaction_result(&snapshot, &task, &output); + // remove old SSTs from sstables + let mut ssts_to_remove = Vec::with_capacity(files_to_remove.len()); + for file_to_remove in &files_to_remove { + let result = new_snapshot.sstables.remove(file_to_remove); + 
assert!(result.is_some(), "cannot remove {}.sst", file_to_remove); + ssts_to_remove.push(result.unwrap()); + } + let mut state = self.state.write(); + *state = Arc::new(new_snapshot); + drop(state); + + println!( + "compaction finished: {} files removed, {} files added, output={:?}", + ssts_to_remove.len(), + output.len(), + output + ); + for sst in ssts_to_remove.iter() { + std::fs::remove_file(self.path_of_sst(sst.sst_id()))?; + } + } + Ok(()) } pub(crate) fn spawn_compaction_thread( diff --git a/mini-lsm-starter/src/compact/simple_leveled.rs b/mini-lsm-starter/src/compact/simple_leveled.rs index 1c008ce16..9c9d292a5 100644 --- a/mini-lsm-starter/src/compact/simple_leveled.rs +++ b/mini-lsm-starter/src/compact/simple_leveled.rs @@ -1,3 +1,5 @@ +use std::collections::HashSet; + use serde::{Deserialize, Serialize}; use crate::lsm_storage::LsmStorageState; @@ -33,9 +35,48 @@ impl SimpleLeveledCompactionController { /// Returns `None` if no compaction needs to be scheduled. The order of SSTs in the compaction task id vector matters. 
pub fn generate_compaction_task( &self, - _snapshot: &LsmStorageState, + snapshot: &LsmStorageState, ) -> Option { - unimplemented!() + // check if the size ratio is satisfied + for i in 0..self.options.max_levels { + // trigger a compaction of L0 and L1 + if i == 0 + && snapshot.l0_sstables.len() < self.options.level0_file_num_compaction_trigger + { + continue; + } + let upper_level_size = if i == 0 { + snapshot.l0_sstables.len() + } else { + // levels start from 0 + snapshot.levels[i - 1].1.len() + }; + let lower_level_size = snapshot.levels[i].1.len(); + let size_ratio = lower_level_size as f64 / upper_level_size as f64; + if size_ratio < self.options.size_ratio_percent as f64 / 100.0 { + let message = if i == 0 { + "L0 to L1".to_string() + } else { + format!("L{} to L{}", i, i + 1) + }; + println!( + "compaction triggered at {} with size ratio {}", + message, size_ratio, + ); + return Some(SimpleLeveledCompactionTask { + upper_level: if i == 0 { None } else { Some(i - 1) }, + upper_level_sst_ids: if i == 0 { + snapshot.l0_sstables.clone() + } else { + snapshot.levels[i - 1].1.clone() + }, + lower_level: i, + lower_level_sst_ids: snapshot.levels[i].1.clone(), + is_lower_level_bottom_level: i == self.options.max_levels - 1, + }); + } + } + None } /// Apply the compaction result. @@ -47,10 +88,41 @@ impl SimpleLeveledCompactionController { /// in your implementation. 
pub fn apply_compaction_result( &self, - _snapshot: &LsmStorageState, - _task: &SimpleLeveledCompactionTask, - _output: &[usize], + snapshot: &LsmStorageState, + task: &SimpleLeveledCompactionTask, + output: &[usize], ) -> (LsmStorageState, Vec) { - unimplemented!() + let mut files_to_remove = Vec::new(); + let mut new_snapshot = snapshot.clone(); + if let Some(upper_level) = task.upper_level { + // L1+ compaction + assert_eq!( + task.upper_level_sst_ids, snapshot.levels[upper_level].1, + "sst mismatched" + ); + println!("L{} compaction output: {:?}", upper_level + 1, output); + files_to_remove.extend(&task.upper_level_sst_ids); + new_snapshot.levels[upper_level].1.clear(); + } else { + // L0 compaction + println!("L0 compaction output: {:?}", output); + files_to_remove.extend(&task.upper_level_sst_ids); + let mut l0_ssts_compacted = task + .upper_level_sst_ids + .iter() + .copied() + .collect::>(); + let new_l0_sstables = snapshot + .l0_sstables + .iter() + .copied() + .filter(|x| !l0_ssts_compacted.remove(x)) + .collect::>(); + assert!(l0_ssts_compacted.is_empty()); + new_snapshot.l0_sstables = new_l0_sstables; + } + files_to_remove.extend(&new_snapshot.levels[task.lower_level].1); + new_snapshot.levels[task.lower_level].1 = output.to_vec(); + (new_snapshot, files_to_remove) } } diff --git a/mini-lsm-starter/src/debug.rs b/mini-lsm-starter/src/debug.rs index c9eab3dd9..2b05d6c91 100644 --- a/mini-lsm-starter/src/debug.rs +++ b/mini-lsm-starter/src/debug.rs @@ -2,7 +2,15 @@ use crate::lsm_storage::{LsmStorageInner, MiniLsm}; impl LsmStorageInner { pub fn dump_structure(&self) { + println!("----------------- Dump Structure -----------------"); let snapshot = self.state.read(); + // print mem-table + if !snapshot.memtable.is_empty() { + println!("Mem-table: {:?}", snapshot.memtable); + } + if !snapshot.imm_memtables.is_empty() { + println!("Imm-mem-tables: {:?}", snapshot.imm_memtables); + } if !snapshot.l0_sstables.is_empty() { println!( "L0 ({}): {:?}", @@ -13,6 
+21,7 @@ impl LsmStorageInner { for (level, files) in &snapshot.levels { println!("L{level} ({}): {:?}", files.len(), files); } + println!("----------------- Dump Structure finished -----------------"); } } diff --git a/mini-lsm-starter/src/lsm_iterator.rs b/mini-lsm-starter/src/lsm_iterator.rs index 236c8f349..5c4e68087 100644 --- a/mini-lsm-starter/src/lsm_iterator.rs +++ b/mini-lsm-starter/src/lsm_iterator.rs @@ -16,7 +16,7 @@ use crate::{ /// choosing memtable firstly type LsmIteratorInner = TwoMergeIterator< TwoMergeIterator, MergeIterator>, - SstConcatIterator, + MergeIterator, >; pub struct LsmIterator { diff --git a/mini-lsm-starter/src/lsm_storage.rs b/mini-lsm-starter/src/lsm_storage.rs index cb207f8f1..ae8da53ec 100644 --- a/mini-lsm-starter/src/lsm_storage.rs +++ b/mini-lsm-starter/src/lsm_storage.rs @@ -385,42 +385,52 @@ impl LsmStorageInner { } } - // create merge iterator for l0_sstables - let mut l0_iters = Vec::with_capacity(snapshot.l0_sstables.len()); - for sst_id in snapshot.l0_sstables.iter() { - let sst = snapshot.sstables.get(sst_id).unwrap(); + let check_sst = |key: &[u8], sst: &SsTable| { // check if the key is within the SST's key range if key_within(key, sst.first_key().raw_ref(), sst.last_key().raw_ref()) { // bloom filter check if let Some(bloom) = &sst.bloom { if bloom.may_contain(farmhash::fingerprint32(key)) { - let iter = SsTableIterator::create_and_seek_to_key( - sst.clone(), - KeySlice::from_slice(key), - )?; - l0_iters.push(Box::new(iter)); + return true; } } else { - let iter = SsTableIterator::create_and_seek_to_key( - sst.clone(), - KeySlice::from_slice(key), - )?; - l0_iters.push(Box::new(iter)); + return true; } } + false + }; + + // create merge iterator for l0_sstables + let mut l0_iters = Vec::with_capacity(snapshot.l0_sstables.len()); + for sst_id in snapshot.l0_sstables.iter() { + let sst = snapshot.sstables.get(sst_id).unwrap(); + if check_sst(key, sst) { + let iter = SsTableIterator::create_and_seek_to_key( + 
sst.clone(), + KeySlice::from_slice(key), + )?; + l0_iters.push(Box::new(iter)); + } } let merge_l0_sstable_iter = MergeIterator::create(l0_iters); - // create merge iterator for l1_sstables - let mut l1_ssts = Vec::with_capacity(snapshot.levels[0].1.len()); - for sst_id in snapshot.levels[0].1.iter() { - l1_ssts.push(snapshot.sstables[sst_id].clone()); + // create merge iterator for multi-level sstables + let mut level_iters = Vec::with_capacity(snapshot.levels.len()); + for (_, level_sst_ids) in &snapshot.levels { + let mut ssts = Vec::with_capacity(level_sst_ids.len()); + for sst_id in level_sst_ids { + let sst = snapshot.sstables.get(sst_id).unwrap(); + if check_sst(key, sst) { + ssts.push(sst.clone()); + } + } + let level_iter = + SstConcatIterator::create_and_seek_to_key(ssts, KeySlice::from_slice(key))?; + level_iters.push(Box::new(level_iter)); } - let l1_concat_iter = - SstConcatIterator::create_and_seek_to_key(l1_ssts, KeySlice::from_slice(key))?; - - let two_merge_iterator = TwoMergeIterator::create(merge_l0_sstable_iter, l1_concat_iter)?; + let merge_iter = MergeIterator::create(level_iters); + let two_merge_iterator = TwoMergeIterator::create(merge_l0_sstable_iter, merge_iter)?; if two_merge_iterator.is_valid() && two_merge_iterator.key() == KeySlice::from_slice(key) && !two_merge_iterator.value().is_empty() @@ -486,31 +496,38 @@ impl LsmStorageInner { } } let merge_l0_sstable_iter = MergeIterator::create(l0_iters); - - // concat l1 sstables - let mut l1_ssts = Vec::with_capacity(snapshot.levels[0].1.len()); - for sst_id in snapshot.levels[0].1.iter() { - l1_ssts.push(snapshot.sstables[sst_id].clone()); - } - let l1_concat_iter = match lower { - Bound::Included(key) => { - SstConcatIterator::create_and_seek_to_key(l1_ssts, KeySlice::from_slice(key))? 
+ // memtables and imm_memtables are merged first, then the result is merged with L0 SSTs + let two_merge_sst_iter = + TwoMergeIterator::create(merge_memtable_iter, merge_l0_sstable_iter)?; + + // concat multi-level sstables + let mut sst_concat_iters = Vec::with_capacity(snapshot.levels.len()); + for (level, level_sst_ids) in &snapshot.levels { + println!("level: {}", level); + let mut ssts = Vec::with_capacity(level_sst_ids.len()); + for sst_id in level_sst_ids.iter() { + ssts.push(snapshot.sstables[sst_id].clone()); } - Bound::Excluded(key) => { - let mut iter = - SstConcatIterator::create_and_seek_to_key(l1_ssts, KeySlice::from_slice(key))?; - if iter.is_valid() && iter.key() == KeySlice::from_slice(key) { - iter.next()?; + let concat_iter = match lower { + Bound::Included(key) => { + SstConcatIterator::create_and_seek_to_key(ssts, KeySlice::from_slice(key))? } - iter - } - Bound::Unbounded => SstConcatIterator::create_and_seek_to_first(l1_ssts)?, - }; + Bound::Excluded(key) => { + let mut iter = + SstConcatIterator::create_and_seek_to_key(ssts, KeySlice::from_slice(key))?; + if iter.is_valid() && iter.key() == KeySlice::from_slice(key) { + iter.next()?; + } + iter + } + Bound::Unbounded => SstConcatIterator::create_and_seek_to_first(ssts)?, + }; + sst_concat_iters.push(Box::new(concat_iter)); + } + let merge_sst_iter = MergeIterator::create(sst_concat_iters); - // memtables and imm_memtables are merged first, then the result is merged with L0 SSTs - let two_merge_iter = TwoMergeIterator::create(merge_memtable_iter, merge_l0_sstable_iter)?; // finally, the result is merged with L1 SSTs - let two_merge_iter = TwoMergeIterator::create(two_merge_iter, l1_concat_iter)?; + let two_merge_iter = TwoMergeIterator::create(two_merge_sst_iter, merge_sst_iter)?; Ok(FusedIterator::new(LsmIterator::new( two_merge_iter, diff --git a/mini-lsm-starter/src/mem_table.rs b/mini-lsm-starter/src/mem_table.rs index b92c60022..2f3d8d1cc 100644 --- a/mini-lsm-starter/src/mem_table.rs 
+++ b/mini-lsm-starter/src/mem_table.rs @@ -20,6 +20,7 @@ use crate::wal::Wal; /// /// An initial implementation of memtable is part of week 1, day 1. It will be incrementally implemented in other /// chapters of week 1 and week 2. +#[derive(Debug)] pub struct MemTable { map: Arc>, wal: Option, diff --git a/mini-lsm-starter/src/table/builder.rs b/mini-lsm-starter/src/table/builder.rs index 4cb042a7a..b545dfd96 100644 --- a/mini-lsm-starter/src/table/builder.rs +++ b/mini-lsm-starter/src/table/builder.rs @@ -107,9 +107,7 @@ impl SsTableBuilder { buf.put_u32(block_meta_offset as u32); // create bloom filter and encode it let bits_per_key = Bloom::bloom_bits_per_key(self.key_hashes.len(), 0.01); - println!("encode bits per key: {}", bits_per_key); let bloom = Bloom::build_from_key_hashes(&self.key_hashes, bits_per_key); - println!("encode bloom size: {}, k={}", bloom.filter.len(), bloom.k); let bloom_offset = buf.len(); bloom.encode(&mut buf); buf.put_u32(bloom_offset as u32); diff --git a/mini-lsm-starter/src/tests.rs b/mini-lsm-starter/src/tests.rs index 738e5a94e..cfd9055c4 100644 --- a/mini-lsm-starter/src/tests.rs +++ b/mini-lsm-starter/src/tests.rs @@ -10,3 +10,4 @@ mod week1_day5; mod week1_day6; mod week1_day7; mod week2_day1; +mod week2_day2; diff --git a/mini-lsm-starter/src/tests/harness.rs b/mini-lsm-starter/src/tests/harness.rs index 9f81c3eae..280fd0ff3 100644 --- a/mini-lsm-starter/src/tests/harness.rs +++ b/mini-lsm-starter/src/tests/harness.rs @@ -1,5 +1,3 @@ -#![allow(dead_code)] - use std::{ collections::BTreeMap, ops::Bound, os::unix::fs::MetadataExt, path::Path, sync::Arc, time::Duration, @@ -264,7 +262,12 @@ pub fn compaction_bench(storage: Arc) { let value = storage.get(key.as_bytes()).unwrap(); if let Some(val) = key_map.get(&i) { let expected_value = gen_value(*val); - assert_eq!(value, Some(Bytes::from(expected_value.clone()))); + assert_eq!( + value, + Some(Bytes::from(expected_value.clone())), + "key: {}", + key + ); 
expected_key_value_pairs.push((Bytes::from(key), Bytes::from(expected_value))); } else { assert!(value.is_none()); diff --git a/mini-lsm-starter/src/tests/week2_day2.rs b/mini-lsm-starter/src/tests/week2_day2.rs new file mode 100644 index 000000000..5064afc5e --- /dev/null +++ b/mini-lsm-starter/src/tests/week2_day2.rs @@ -0,0 +1,27 @@ +use tempfile::tempdir; + +use crate::{ + compact::{CompactionOptions, SimpleLeveledCompactionOptions}, + lsm_storage::{LsmStorageOptions, MiniLsm}, +}; + +use super::harness::{check_compaction_ratio, compaction_bench}; + +#[test] +fn test_integration() { + let dir = tempdir().unwrap(); + let storage = MiniLsm::open( + &dir, + LsmStorageOptions::default_for_week2_test(CompactionOptions::Simple( + SimpleLeveledCompactionOptions { + level0_file_num_compaction_trigger: 2, + max_levels: 3, + size_ratio_percent: 200, + }, + )), + ) + .unwrap(); + + compaction_bench(storage.clone()); + check_compaction_ratio(storage.clone()); +} diff --git a/mini-lsm-starter/src/wal.rs b/mini-lsm-starter/src/wal.rs index 2b31d436a..43870b57e 100644 --- a/mini-lsm-starter/src/wal.rs +++ b/mini-lsm-starter/src/wal.rs @@ -10,6 +10,7 @@ use bytes::Bytes; use crossbeam_skiplist::SkipMap; use parking_lot::Mutex; +#[derive(Debug)] pub struct Wal { file: Arc>>, } From 10b633348b360a7a38984a500332ce8f9eff6629 Mon Sep 17 00:00:00 2001 From: husharp Date: Mon, 12 Feb 2024 19:51:51 +0800 Subject: [PATCH 10/22] week2_day1 tiered compact Signed-off-by: husharp --- mini-lsm-starter/src/compact.rs | 28 ++++- .../src/compact/simple_leveled.rs | 1 + mini-lsm-starter/src/compact/tiered.rs | 114 +++++++++++++++++- mini-lsm-starter/src/lsm_storage.rs | 10 +- mini-lsm-starter/src/tests.rs | 1 + mini-lsm-starter/src/tests/harness.rs | 7 +- mini-lsm-starter/src/tests/week2_day3.rs | 28 +++++ 7 files changed, 170 insertions(+), 19 deletions(-) create mode 100644 mini-lsm-starter/src/tests/week2_day3.rs diff --git a/mini-lsm-starter/src/compact.rs b/mini-lsm-starter/src/compact.rs 
index 1643eb887..827ed3667 100644 --- a/mini-lsm-starter/src/compact.rs +++ b/mini-lsm-starter/src/compact.rs @@ -138,7 +138,6 @@ impl LsmStorageInner { iter.next()?; if builder_inner.estimated_size() >= self.options.target_sst_size { - println!("compact_generate_sst_from_iter"); let sst_id = self.next_sst_id(); let builder = builder.take().unwrap(); let new_sst = Arc::new(builder.build( @@ -216,8 +215,10 @@ impl LsmStorageInner { lower_ssts.push(snapshot.sstables.get(id).unwrap().clone()); } let lower_iter = SstConcatIterator::create_and_seek_to_first(lower_ssts)?; - let iter = TwoMergeIterator::create(upper_iter, lower_iter)?; - self.compact_generate_sst_from_iter(iter, task.compact_to_bottom_level()) + self.compact_generate_sst_from_iter( + TwoMergeIterator::create(upper_iter, lower_iter)?, + task.compact_to_bottom_level(), + ) } // because it is L0 compaction, we can not use concat iterator which is for ordered sstables None => { @@ -235,11 +236,28 @@ impl LsmStorageInner { lower_ssts.push(snapshot.sstables.get(id).unwrap().clone()); } let lower_iter = SstConcatIterator::create_and_seek_to_first(lower_ssts)?; - let iter = TwoMergeIterator::create(upper_merge_iter, lower_iter)?; - self.compact_generate_sst_from_iter(iter, task.compact_to_bottom_level()) + self.compact_generate_sst_from_iter( + TwoMergeIterator::create(upper_merge_iter, lower_iter)?, + task.compact_to_bottom_level(), + ) } } } + CompactionTask::Tiered(TieredCompactionTask { tiers, .. 
}) => { + let mut iters = Vec::with_capacity(tiers.len()); + for (_, tier_sst_ids) in tiers { + let mut ssts = Vec::with_capacity(tier_sst_ids.len()); + for id in tier_sst_ids.iter() { + ssts.push(snapshot.sstables.get(id).unwrap().clone()); + } + let iter = SstConcatIterator::create_and_seek_to_first(ssts)?; + iters.push(Box::new(iter)); + } + self.compact_generate_sst_from_iter( + MergeIterator::create(iters), + task.compact_to_bottom_level(), + ) + } _ => unimplemented!(), } } diff --git a/mini-lsm-starter/src/compact/simple_leveled.rs b/mini-lsm-starter/src/compact/simple_leveled.rs index 9c9d292a5..88915fb82 100644 --- a/mini-lsm-starter/src/compact/simple_leveled.rs +++ b/mini-lsm-starter/src/compact/simple_leveled.rs @@ -121,6 +121,7 @@ impl SimpleLeveledCompactionController { assert!(l0_ssts_compacted.is_empty()); new_snapshot.l0_sstables = new_l0_sstables; } + files_to_remove.extend(&new_snapshot.levels[task.lower_level].1); new_snapshot.levels[task.lower_level].1 = output.to_vec(); (new_snapshot, files_to_remove) diff --git a/mini-lsm-starter/src/compact/tiered.rs b/mini-lsm-starter/src/compact/tiered.rs index 25f300e15..2805a5af3 100644 --- a/mini-lsm-starter/src/compact/tiered.rs +++ b/mini-lsm-starter/src/compact/tiered.rs @@ -1,3 +1,5 @@ +use std::collections::HashMap; + use serde::{Deserialize, Serialize}; use crate::lsm_storage::LsmStorageState; @@ -25,19 +27,119 @@ impl TieredCompactionController { Self { options } } + // rely on https://github.com/facebook/rocksdb/wiki/Universal-Compaction pub fn generate_compaction_task( &self, - _snapshot: &LsmStorageState, + snapshot: &LsmStorageState, ) -> Option { - unimplemented!() + assert!( + snapshot.l0_sstables.is_empty(), + "should not add l0 ssts in tiered compaction" + ); + if snapshot.levels.len() < self.options.num_tiers { + return None; + } + // compaction triggered by space amplification ratio + // all levels except last level size / last level size + let mut size = 0; + for id in 
0..(snapshot.levels.len() - 1) { + size += snapshot.levels[id].1.len(); + } + let space_amp_ratio = + (size as f64) / (snapshot.levels.last().unwrap().1.len() as f64) * 100.0; + if space_amp_ratio >= self.options.max_size_amplification_percent as f64 { + println!( + "compaction triggered by space amplification ratio: {}", + space_amp_ratio + ); + return Some(TieredCompactionTask { + tiers: snapshot.levels.clone(), + bottom_tier_included: true, + }); + } + + // size of all previous tiers / this tier >= (1 + size_ratio) * 100% + // compaction triggered by size ratio(number of sorted runs) + let size_ratio_trigger = (100.0 + self.options.size_ratio as f64) / 100.0; + let mut size = 0; + for id in 0..(snapshot.levels.len() - 1) { + size += snapshot.levels[id].1.len(); + let cur_tier = snapshot.levels[id + 1].1.len(); + let cur_size_ratio = size as f64 / cur_tier as f64; + // compaction num need exceed `self.options.min_merge_width` + if cur_size_ratio >= size_ratio_trigger && id + 2 >= self.options.min_merge_width { + println!( + "compaction triggered by size ratio: {}", + cur_size_ratio * 100.0 + ); + return Some(TieredCompactionTask { + tiers: snapshot + .levels + .iter() + .take(id + 2) + .cloned() + .collect::>(), + bottom_tier_included: id + 2 >= snapshot.levels.len(), + }); + } + } + // trying to reduce sorted runs without respecting size ratio + // to make sure we have exactly `num_tiers` tiers + println!("compaction triggered by reducing sorted runs"); + let num_tiers_to_take = snapshot.levels.len() - self.options.num_tiers + 2; + Some(TieredCompactionTask { + tiers: snapshot + .levels + .iter() + .take(num_tiers_to_take) + .cloned() + .collect::>(), + bottom_tier_included: snapshot.levels.len() >= num_tiers_to_take, + }) } pub fn apply_compaction_result( &self, - _snapshot: &LsmStorageState, - _task: &TieredCompactionTask, - _output: &[usize], + snapshot: &LsmStorageState, + task: &TieredCompactionTask, + output: &[usize], ) -> (LsmStorageState, Vec) { - 
unimplemented!() + assert!( + snapshot.l0_sstables.is_empty(), + "should not add l0 ssts in tiered compaction" + ); + let mut new_snapshot = snapshot.clone(); + let mut tier_to_remove = task + .tiers + .iter() + .map(|(x, y)| (*x, y)) + .collect::>(); + let mut files_to_remove = Vec::new(); + let mut levels = Vec::new(); + let mut new_tier_added = false; + for (tier_id, files) in &new_snapshot.levels { + if let Some(remove_files) = tier_to_remove.remove(tier_id) { + // the tier should be removed + assert_eq!( + remove_files, files, + "file changed after issuing compaction task" + ); + files_to_remove.extend(files.iter().cloned()); + } else { + // retain the tier + levels.push((*tier_id, files.clone())); + } + // once every compacted tier has been consumed, insert the new tier at that position (exactly once) + if tier_to_remove.is_empty() && !new_tier_added { + // add the compacted tier to the LSM tree + new_tier_added = true; + levels.push((output[0], output.to_vec())); + } + } + if !tier_to_remove.is_empty() { + unreachable!("some tiers not found??"); + } + new_snapshot.levels = levels; + (new_snapshot, files_to_remove) } } diff --git a/mini-lsm-starter/src/lsm_storage.rs b/mini-lsm-starter/src/lsm_storage.rs index ae8da53ec..c9906cd82 100644 --- a/mini-lsm-starter/src/lsm_storage.rs +++ b/mini-lsm-starter/src/lsm_storage.rs @@ -351,8 +351,14 @@ impl LsmStorageInner { let sst_id = snapshot.imm_memtables.pop().unwrap().id(); println!("flushed {}.sst with size={}", sst_id, new_sst.table_size()); snapshot.sstables.insert(sst_id, new_sst); - // L0 SSTs are sorted by creation time, from latest to earliest. - snapshot.l0_sstables.insert(0, sst_id); + if self.compaction_controller.flush_to_l0() { + // In leveled compaction or no compaction, simply flush to L0 + // L0 SSTs are sorted by creation time, from latest to earliest.
+ snapshot.l0_sstables.insert(0, sst_id); + } else { + // In tiered compaction, create a new tier + snapshot.levels.insert(0, (sst_id, vec![sst_id])); + } *guard = Arc::new(snapshot); } diff --git a/mini-lsm-starter/src/tests.rs b/mini-lsm-starter/src/tests.rs index cfd9055c4..155e04455 100644 --- a/mini-lsm-starter/src/tests.rs +++ b/mini-lsm-starter/src/tests.rs @@ -11,3 +11,4 @@ mod week1_day6; mod week1_day7; mod week2_day1; mod week2_day2; +mod week2_day3; diff --git a/mini-lsm-starter/src/tests/harness.rs b/mini-lsm-starter/src/tests/harness.rs index 280fd0ff3..4b0d7e30d 100644 --- a/mini-lsm-starter/src/tests/harness.rs +++ b/mini-lsm-starter/src/tests/harness.rs @@ -262,12 +262,7 @@ pub fn compaction_bench(storage: Arc) { let value = storage.get(key.as_bytes()).unwrap(); if let Some(val) = key_map.get(&i) { let expected_value = gen_value(*val); - assert_eq!( - value, - Some(Bytes::from(expected_value.clone())), - "key: {}", - key - ); + assert_eq!(value, Some(Bytes::from(expected_value.clone()))); expected_key_value_pairs.push((Bytes::from(key), Bytes::from(expected_value))); } else { assert!(value.is_none()); diff --git a/mini-lsm-starter/src/tests/week2_day3.rs b/mini-lsm-starter/src/tests/week2_day3.rs new file mode 100644 index 000000000..6e124db0b --- /dev/null +++ b/mini-lsm-starter/src/tests/week2_day3.rs @@ -0,0 +1,28 @@ +use tempfile::tempdir; + +use crate::{ + compact::{CompactionOptions, TieredCompactionOptions}, + lsm_storage::{LsmStorageOptions, MiniLsm}, +}; + +use super::harness::{check_compaction_ratio, compaction_bench}; + +#[test] +fn test_integration() { + let dir = tempdir().unwrap(); + let storage = MiniLsm::open( + &dir, + LsmStorageOptions::default_for_week2_test(CompactionOptions::Tiered( + TieredCompactionOptions { + num_tiers: 3, + max_size_amplification_percent: 200, + size_ratio: 1, + min_merge_width: 2, + }, + )), + ) + .unwrap(); + + compaction_bench(storage.clone()); + check_compaction_ratio(storage.clone()); +} From 
cb57157c5b4c6614e5e44748df807b401b238208 Mon Sep 17 00:00:00 2001 From: husharp Date: Thu, 15 Feb 2024 10:10:39 +0800 Subject: [PATCH 11/22] week2_day4 leveled compaction Signed-off-by: husharp --- mini-lsm-starter/src/compact.rs | 15 +- mini-lsm-starter/src/compact/leveled.rs | 201 +++++++++++++++++++++-- mini-lsm-starter/src/tests.rs | 1 + mini-lsm-starter/src/tests/week2_day4.rs | 28 ++++ 4 files changed, 230 insertions(+), 15 deletions(-) create mode 100644 mini-lsm-starter/src/tests/week2_day4.rs diff --git a/mini-lsm-starter/src/compact.rs b/mini-lsm-starter/src/compact.rs index 827ed3667..3c21360e9 100644 --- a/mini-lsm-starter/src/compact.rs +++ b/mini-lsm-starter/src/compact.rs @@ -74,17 +74,17 @@ impl CompactionController { &self, snapshot: &LsmStorageState, task: &CompactionTask, - output: &[usize], + new_ssts: &[usize], ) -> (LsmStorageState, Vec) { match (self, task) { (CompactionController::Leveled(ctrl), CompactionTask::Leveled(task)) => { - ctrl.apply_compaction_result(snapshot, task, output) + ctrl.apply_compaction_result(snapshot, task, new_ssts) } (CompactionController::Simple(ctrl), CompactionTask::Simple(task)) => { - ctrl.apply_compaction_result(snapshot, task, output) + ctrl.apply_compaction_result(snapshot, task, new_ssts) } (CompactionController::Tiered(ctrl), CompactionTask::Tiered(task)) => { - ctrl.apply_compaction_result(snapshot, task, output) + ctrl.apply_compaction_result(snapshot, task, new_ssts) } _ => unreachable!(), } @@ -201,6 +201,12 @@ impl LsmStorageInner { upper_level_sst_ids, lower_level_sst_ids, .. + }) + | CompactionTask::Leveled(LeveledCompactionTask { + upper_level, + upper_level_sst_ids, + lower_level_sst_ids, + .. 
}) => { match upper_level { Some(_) => { @@ -258,7 +264,6 @@ impl LsmStorageInner { task.compact_to_bottom_level(), ) } - _ => unimplemented!(), } } diff --git a/mini-lsm-starter/src/compact/leveled.rs b/mini-lsm-starter/src/compact/leveled.rs index 50db7b024..2c49436fd 100644 --- a/mini-lsm-starter/src/compact/leveled.rs +++ b/mini-lsm-starter/src/compact/leveled.rs @@ -1,3 +1,5 @@ +use std::collections::HashSet; + use serde::{Deserialize, Serialize}; use crate::lsm_storage::LsmStorageState; @@ -29,28 +31,207 @@ impl LeveledCompactionController { Self { options } } + // contain exactly one SST in the upper level and overlapping SSTs in the lower level. fn find_overlapping_ssts( &self, - _snapshot: &LsmStorageState, - _sst_ids: &[usize], - _in_level: usize, + snapshot: &LsmStorageState, + sst_ids: &[usize], + in_level: usize, ) -> Vec { - unimplemented!() + let begin_key = sst_ids + .iter() + .map(|id| snapshot.sstables[id].first_key()) + .min() + .cloned() + .unwrap(); + let end_key = sst_ids + .iter() + .map(|id| snapshot.sstables[id].last_key()) + .max() + .cloned() + .unwrap(); + let mut overlap_ssts = Vec::new(); + for sst_id in &snapshot.levels[in_level - 1].1 { + let sst = &snapshot.sstables[sst_id]; + let first_key = sst.first_key(); + let last_key = sst.last_key(); + if !(last_key < &begin_key || first_key > &end_key) { + overlap_ssts.push(*sst_id); + } + } + overlap_ssts } + // rely on https://github.com/facebook/rocksdb/wiki/Leveled-Compaction pub fn generate_compaction_task( &self, - _snapshot: &LsmStorageState, + snapshot: &LsmStorageState, ) -> Option { - unimplemented!() + // compute target level size + // such as [0 0 30MB 300MB 3GB 30GB] when level_size_multiplier=10 and base_level_size_mb=200 + // because only keep at most one level below `base_level_size_mb` + let mut target_level_size = (0..self.options.max_levels).map(|_| 0).collect::>(); // exclude level 0 + let mut real_level_size = Vec::with_capacity(self.options.max_levels); + for i in 
0..self.options.max_levels { + real_level_size.push( + snapshot.levels[i] + .1 + .iter() + .map(|x| snapshot.sstables.get(x).unwrap().table_size()) + .sum::() as usize, + ); + } + + // select base level and compute target level size + let mut base_level = self.options.max_levels; + let base_level_size_bytes = self.options.base_level_size_mb * 1024 * 1024; + target_level_size[self.options.max_levels - 1] = + real_level_size[self.options.max_levels - 1].max(base_level_size_bytes); + for i in (0..(self.options.max_levels - 1)).rev() { + let next_level_size = target_level_size[i + 1]; + let this_level_size = next_level_size / self.options.level_size_multiplier; + if next_level_size > base_level_size_bytes { + target_level_size[i] = this_level_size; + } + if target_level_size[i] > 0 { + base_level = i + 1; + } + } + + // directly place the SST from L0 to the lowest level possible + if snapshot.l0_sstables.len() >= self.options.level0_file_num_compaction_trigger { + println!("flush L0 SST to base level {}", base_level); + return Some(LeveledCompactionTask { + upper_level: None, + upper_level_sst_ids: snapshot.l0_sstables.clone(), + lower_level: base_level, + lower_level_sst_ids: self.find_overlapping_ssts( + snapshot, + &snapshot.l0_sstables, + base_level, + ), + is_lower_level_bottom_level: false, + }); + } + + // compute the compaction priorities of each level by `current_size / target_size` + let mut compaction_priorities = Vec::with_capacity(self.options.max_levels); + for level in 0..self.options.max_levels { + let prio = real_level_size[level] as f64 / target_level_size[level] as f64; + if prio > 1.0 { + compaction_priorities.push((prio, level + 1)); + } + } + + // select the compaction task with the highest priority + compaction_priorities.sort_by(|a, b| a.partial_cmp(b).unwrap().reverse()); + if let Some((_, level)) = compaction_priorities.first() { + println!( + "target level sizes: {:?}, real level sizes: {:?}, base_level: {}", + target_level_size + .iter() + 
.map(|x| format!("{}MB", x / 1024 / 1024)) + .collect::>(), + real_level_size + .iter() + .map(|x| format!("{}MB", x / 1024 / 1024)) + .collect::>(), + base_level, + ); + let level = *level; + // select the oldest sst to compact + let selected_sst = snapshot.levels[level - 1].1.iter().min().copied().unwrap(); + println!( + "compaction triggered by priority: {level} out of {:?}, select {selected_sst} for compaction", + compaction_priorities + ); + return Some(LeveledCompactionTask { + upper_level: Some(level), + upper_level_sst_ids: vec![selected_sst], + lower_level: level + 1, + lower_level_sst_ids: self.find_overlapping_ssts( + snapshot, + &[selected_sst], + level + 1, + ), + is_lower_level_bottom_level: level + 1 == self.options.max_levels, + }); + } + + None } pub fn apply_compaction_result( &self, - _snapshot: &LsmStorageState, - _task: &LeveledCompactionTask, - _output: &[usize], + snapshot: &LsmStorageState, + task: &LeveledCompactionTask, + new_ssts: &[usize], ) -> (LsmStorageState, Vec) { - unimplemented!() + let mut snapshot = snapshot.clone(); + let mut files_to_remove = Vec::new(); + let mut upper_level_sst_ids_set = task + .upper_level_sst_ids + .iter() + .copied() + .collect::>(); + let mut lower_level_sst_ids_set = task + .lower_level_sst_ids + .iter() + .copied() + .collect::>(); + // None is for l0 compaction + if let Some(upper_level) = task.upper_level { + // remove the compacted SSTs from the upper level + let new_upper_level_ssts = snapshot.levels[upper_level - 1] + .1 + .iter() + .filter_map(|x| { + if upper_level_sst_ids_set.remove(x) { + return None; + } + Some(*x) + }) + .collect::>(); + assert!(upper_level_sst_ids_set.is_empty()); + snapshot.levels[upper_level - 1].1 = new_upper_level_ssts; + } else { + let new_l0_ssts = snapshot + .l0_sstables + .iter() + .filter_map(|x| { + if upper_level_sst_ids_set.remove(x) { + return None; + } + Some(*x) + }) + .collect::>(); + assert!(upper_level_sst_ids_set.is_empty()); + snapshot.l0_sstables = 
new_l0_ssts; + } + files_to_remove.extend(&task.upper_level_sst_ids); + + let mut new_lower_level_ssts = snapshot.levels[task.lower_level - 1] + .1 + .iter() + .filter_map(|x| { + if lower_level_sst_ids_set.remove(x) { + return None; + } + Some(*x) + }) + .collect::>(); + assert!(lower_level_sst_ids_set.is_empty()); + files_to_remove.extend(&task.lower_level_sst_ids); + // add the new SSTs to the lower level + new_lower_level_ssts.extend(new_ssts); + // need to be sorted + new_lower_level_ssts.sort_by(|a, b| { + snapshot.sstables[a] + .first_key() + .cmp(snapshot.sstables[b].first_key()) + }); + snapshot.levels[task.lower_level - 1].1 = new_lower_level_ssts; + + (snapshot, files_to_remove) } } diff --git a/mini-lsm-starter/src/tests.rs b/mini-lsm-starter/src/tests.rs index 155e04455..23f677960 100644 --- a/mini-lsm-starter/src/tests.rs +++ b/mini-lsm-starter/src/tests.rs @@ -12,3 +12,4 @@ mod week1_day7; mod week2_day1; mod week2_day2; mod week2_day3; +mod week2_day4; diff --git a/mini-lsm-starter/src/tests/week2_day4.rs b/mini-lsm-starter/src/tests/week2_day4.rs new file mode 100644 index 000000000..a2fd9007e --- /dev/null +++ b/mini-lsm-starter/src/tests/week2_day4.rs @@ -0,0 +1,28 @@ +use tempfile::tempdir; + +use crate::{ + compact::{CompactionOptions, LeveledCompactionOptions}, + lsm_storage::{LsmStorageOptions, MiniLsm}, +}; + +use super::harness::{check_compaction_ratio, compaction_bench}; + +#[test] +fn test_integration() { + let dir = tempdir().unwrap(); + let storage = MiniLsm::open( + &dir, + LsmStorageOptions::default_for_week2_test(CompactionOptions::Leveled( + LeveledCompactionOptions { + level0_file_num_compaction_trigger: 2, + level_size_multiplier: 2, + base_level_size_mb: 1, + max_levels: 4, + }, + )), + ) + .unwrap(); + + compaction_bench(storage.clone()); + check_compaction_ratio(storage.clone()); +} From da57ce54ca4dae59dcec60cabbc1f8f45bfade95 Mon Sep 17 00:00:00 2001 From: husharp Date: Thu, 15 Feb 2024 12:45:11 +0800 Subject: [PATCH 12/22] 
week2_day5 add manifest Signed-off-by: husharp --- mini-lsm-starter/src/compact.rs | 19 ++-- mini-lsm-starter/src/lsm_storage.rs | 110 ++++++++++++++++++++--- mini-lsm-starter/src/manifest.rs | 46 ++++++++-- mini-lsm-starter/src/tests.rs | 1 + mini-lsm-starter/src/tests/week2_day5.rs | 81 +++++++++++++++++ 5 files changed, 234 insertions(+), 23 deletions(-) create mode 100644 mini-lsm-starter/src/tests/week2_day5.rs diff --git a/mini-lsm-starter/src/compact.rs b/mini-lsm-starter/src/compact.rs index 3c21360e9..5624fd883 100644 --- a/mini-lsm-starter/src/compact.rs +++ b/mini-lsm-starter/src/compact.rs @@ -23,6 +23,7 @@ use crate::iterators::StorageIterator; use crate::key::KeySlice; use crate::lsm_iterator::FusedIterator; use crate::lsm_storage::{LsmStorageInner, LsmStorageState}; +use crate::manifest::ManifestRecord; use crate::table::{SsTable, SsTableBuilder, SsTableIterator}; #[derive(Debug, Serialize, Deserialize)] @@ -332,10 +333,11 @@ impl LsmStorageInner { .compaction_controller .generate_compaction_task(&snapshot); if let Some(task) = task { + let state_lock = self.state_lock.lock(); self.dump_structure(); println!("running compaction task: {:?}", task); let new_ssts = self.compact(&task)?; - let output = new_ssts.iter().map(|x| x.sst_id()).collect::>(); + let new_ssts_to_add = new_ssts.iter().map(|x| x.sst_id()).collect::>(); let mut snapshot = self.state.read().as_ref().clone(); // insert new SSTs to sstables for ssts_to_add in new_ssts { @@ -344,7 +346,7 @@ impl LsmStorageInner { } let (mut new_snapshot, files_to_remove) = self .compaction_controller - .apply_compaction_result(&snapshot, &task, &output); + .apply_compaction_result(&snapshot, &task, &new_ssts_to_add); // remove old SSTs from sstables let mut ssts_to_remove = Vec::with_capacity(files_to_remove.len()); for file_to_remove in &files_to_remove { @@ -357,14 +359,21 @@ impl LsmStorageInner { drop(state); println!( - "compaction finished: {} files removed, {} files added, output={:?}", + 
"compaction finished: {} files removed, {} files added, ssts={:?}", ssts_to_remove.len(), - output.len(), - output + new_ssts_to_add.len(), + new_ssts_to_add ); for sst in ssts_to_remove.iter() { std::fs::remove_file(self.path_of_sst(sst.sst_id()))?; } + + // update manifest + self.manifest.as_ref().unwrap().add_record( + &state_lock, + ManifestRecord::Compaction(task, new_ssts_to_add), + )?; + self.sync_dir()?; } Ok(()) } diff --git a/mini-lsm-starter/src/lsm_storage.rs b/mini-lsm-starter/src/lsm_storage.rs index c9906cd82..384a1baa0 100644 --- a/mini-lsm-starter/src/lsm_storage.rs +++ b/mini-lsm-starter/src/lsm_storage.rs @@ -1,12 +1,13 @@ #![allow(dead_code)] // REMOVE THIS LINE after fully implementing this functionality use std::collections::HashMap; +use std::fs::File; use std::ops::Bound; use std::path::{Path, PathBuf}; use std::sync::atomic::AtomicUsize; use std::sync::Arc; -use anyhow::{Ok, Result}; +use anyhow::{Context, Ok, Result}; use bytes::Bytes; use parking_lot::{Mutex, MutexGuard, RwLock}; @@ -21,10 +22,10 @@ use crate::iterators::two_merge_iterator::TwoMergeIterator; use crate::iterators::StorageIterator; use crate::key::KeySlice; use crate::lsm_iterator::{FusedIterator, LsmIterator}; -use crate::manifest::Manifest; +use crate::manifest::{Manifest, ManifestRecord}; use crate::mem_table::{map_bound, MemTable}; use crate::mvcc::LsmMvccInner; -use crate::table::{SsTable, SsTableBuilder, SsTableIterator}; +use crate::table::{FileObject, SsTable, SsTableBuilder, SsTableIterator}; pub type BlockCache = moka::sync::Cache<(usize, usize), Arc>; @@ -153,7 +154,31 @@ impl Drop for MiniLsm { impl MiniLsm { pub fn close(&self) -> Result<()> { + self.inner.sync_dir()?; + // flush and compaction threads should be stopped self.flush_notifier.send(()).ok(); + self.compaction_notifier.send(()).ok(); + + // close the compaction thread and flush thread + if let Some(handle) = self.flush_thread.lock().take() { + handle.join().ok(); + } + if let Some(handle) = 
self.compaction_thread.lock().take() { + handle.join().ok(); + } + + // flush memtable and imm_memtables + if !self.inner.state.read().memtable.is_empty() { + self.inner + .force_freeze_memtable(&self.inner.state_lock.lock())?; + } + + while !self.inner.state.read().imm_memtables.is_empty() { + self.inner.force_flush_next_imm_memtable()?; + } + + self.inner.sync_dir()?; + Ok(()) } @@ -234,9 +259,10 @@ impl LsmStorageInner { pub(crate) fn open(path: impl AsRef, options: LsmStorageOptions) -> Result { let path = path.as_ref(); if !path.exists() { - std::fs::create_dir_all(path)?; + std::fs::create_dir_all(path).context("failed to create DB directory")?; } - let state = LsmStorageState::create(&options); + + let block_cache = Arc::new(BlockCache::new(1 << 20)); // 4GB block cache, let compaction_controller = match &options.compaction_options { CompactionOptions::Leveled(options) => { @@ -251,18 +277,75 @@ impl LsmStorageInner { CompactionOptions::NoCompaction => CompactionController::NoCompaction, }; + let mut state = LsmStorageState::create(&options); + let manifest; + let mut next_sst_id = 1; + // recover from MANIFEST, `/MANIFEST` + let manifest_path = path.join("MANIFEST"); + if !manifest_path.exists() { + manifest = Manifest::create(&manifest_path).context("failed to create manifest")?; + } else { + let (m, records) = Manifest::recover(manifest_path)?; + for record in records { + match record { + ManifestRecord::Flush(sst_id) => { + if compaction_controller.flush_to_l0() { + state.l0_sstables.insert(0, sst_id); + } else { + state.levels.insert(0, (sst_id, vec![sst_id])); + } + next_sst_id = next_sst_id.max(sst_id); + } + ManifestRecord::NewMemtable(_) => unimplemented!(), + ManifestRecord::Compaction(task, new_ssts) => { + let (new_state, _) = + compaction_controller.apply_compaction_result(&state, &task, &new_ssts); + state = new_state; + next_sst_id = next_sst_id.max(new_ssts.iter().max().copied().unwrap()); + } + } + } + // recover SSTs + for &sst_id in 
state.l0_sstables.iter() { + let sst = SsTable::open( + sst_id, + Some(block_cache.clone()), + FileObject::open(&Self::path_of_sst_static(path, sst_id)) + .context("failed to open SST")?, + )?; + state.sstables.insert(sst_id, Arc::new(sst)); + next_sst_id = next_sst_id.max(sst_id); + } + for &sst_id in state.levels.iter().flat_map(|(_, ssts)| ssts.iter()) { + let sst = SsTable::open( + sst_id, + Some(block_cache.clone()), + FileObject::open(&Self::path_of_sst_static(path, sst_id)) + .context("failed to open SST")?, + )?; + state.sstables.insert(sst_id, Arc::new(sst)); + next_sst_id = next_sst_id.max(sst_id); + } + state.memtable = Arc::new(MemTable::create(next_sst_id)); + + next_sst_id += 1; + manifest = m; + } + let storage = Self { state: Arc::new(RwLock::new(Arc::new(state))), state_lock: Mutex::new(()), path: path.to_path_buf(), - block_cache: Arc::new(BlockCache::new(1024)), - next_sst_id: AtomicUsize::new(1), + block_cache, + next_sst_id: AtomicUsize::new(next_sst_id), compaction_controller, - manifest: None, + manifest: Some(manifest), options: options.into(), mvcc: None, }; + storage.sync_dir()?; + Ok(storage) } @@ -305,7 +388,8 @@ impl LsmStorageInner { } pub(super) fn sync_dir(&self) -> Result<()> { - unimplemented!() + File::open(&self.path)?.sync_all()?; + Ok(()) } /// Force freeze the current memtable to an immutable memtable @@ -324,7 +408,7 @@ impl LsmStorageInner { /// Force flush the earliest-created immutable memtable to disk pub fn force_flush_next_imm_memtable(&self) -> Result<()> { - let _state_lock = self.state_lock.lock(); + let state_lock = self.state_lock.lock(); let last_imm_memtable; { @@ -362,6 +446,12 @@ impl LsmStorageInner { *guard = Arc::new(snapshot); } + // update manifest + self.manifest + .as_ref() + .unwrap() + .add_record(&state_lock, ManifestRecord::Flush(last_imm_memtable.id()))?; + Ok(()) } diff --git a/mini-lsm-starter/src/manifest.rs b/mini-lsm-starter/src/manifest.rs index e9b005926..a8428b5df 100644 --- 
a/mini-lsm-starter/src/manifest.rs +++ b/mini-lsm-starter/src/manifest.rs @@ -1,15 +1,18 @@ #![allow(dead_code)] // REMOVE THIS LINE after fully implementing this functionality -use std::fs::File; +use std::fs::OpenOptions; +use std::io::Read; use std::path::Path; use std::sync::Arc; +use std::{fs::File, io::Write}; -use anyhow::Result; +use anyhow::{Context, Result}; use parking_lot::{Mutex, MutexGuard}; use serde::{Deserialize, Serialize}; use crate::compact::CompactionTask; +// | JSON record | JSON record | JSON record | JSON record | pub struct Manifest { file: Arc>, } @@ -22,12 +25,35 @@ pub enum ManifestRecord { } impl Manifest { - pub fn create(_path: impl AsRef) -> Result { - unimplemented!() + pub fn create(path: impl AsRef) -> Result { + Ok(Self { + file: Arc::new(Mutex::new( + OpenOptions::new() + .read(true) + .create_new(true) + .write(true) + .open(path) + .context("failed to create manifest")?, + )), + }) } - pub fn recover(_path: impl AsRef) -> Result<(Self, Vec)> { - unimplemented!() + pub fn recover(path: impl AsRef) -> Result<(Self, Vec)> { + let mut file = OpenOptions::new() + .read(true) + .append(true) + .open(path) + .context("failed to recover manifest")?; + let mut buf = Vec::new(); + file.read_to_end(&mut buf)?; + let stream = serde_json::Deserializer::from_slice(&buf).into_iter::(); + let records = stream.collect::, _>>()?; + Ok(( + Self { + file: Arc::new(Mutex::new(file)), + }, + records, + )) } pub fn add_record( @@ -38,7 +64,11 @@ impl Manifest { self.add_record_when_init(record) } - pub fn add_record_when_init(&self, _record: ManifestRecord) -> Result<()> { - unimplemented!() + pub fn add_record_when_init(&self, record: ManifestRecord) -> Result<()> { + let mut file = self.file.lock(); + let buf = serde_json::to_vec(&record)?; + file.write_all(&buf)?; + file.sync_all()?; + Ok(()) } } diff --git a/mini-lsm-starter/src/tests.rs b/mini-lsm-starter/src/tests.rs index 23f677960..7c6ba8e96 100644 --- a/mini-lsm-starter/src/tests.rs +++ 
b/mini-lsm-starter/src/tests.rs @@ -13,3 +13,4 @@ mod week2_day1; mod week2_day2; mod week2_day3; mod week2_day4; +mod week2_day5; diff --git a/mini-lsm-starter/src/tests/week2_day5.rs b/mini-lsm-starter/src/tests/week2_day5.rs new file mode 100644 index 000000000..9ceedb33f --- /dev/null +++ b/mini-lsm-starter/src/tests/week2_day5.rs @@ -0,0 +1,81 @@ +use tempfile::tempdir; + +use crate::{ + compact::{ + CompactionOptions, LeveledCompactionOptions, SimpleLeveledCompactionOptions, + TieredCompactionOptions, + }, + lsm_storage::{LsmStorageOptions, MiniLsm}, + tests::harness::dump_files_in_dir, +}; + +#[test] +fn test_integration_leveled() { + test_integration(CompactionOptions::Leveled(LeveledCompactionOptions { + level_size_multiplier: 2, + level0_file_num_compaction_trigger: 2, + max_levels: 3, + base_level_size_mb: 1, + })) +} + +#[test] +fn test_integration_tiered() { + test_integration(CompactionOptions::Tiered(TieredCompactionOptions { + num_tiers: 3, + max_size_amplification_percent: 200, + size_ratio: 1, + min_merge_width: 3, + })) +} + +#[test] +fn test_integration_simple() { + test_integration(CompactionOptions::Simple(SimpleLeveledCompactionOptions { + size_ratio_percent: 200, + level0_file_num_compaction_trigger: 2, + max_levels: 3, + })); +} + +fn test_integration(compaction_options: CompactionOptions) { + let dir = tempdir().unwrap(); + let storage = MiniLsm::open( + &dir, + LsmStorageOptions::default_for_week2_test(compaction_options.clone()), + ) + .unwrap(); + for i in 0..=20 { + storage.put(b"0", format!("v{}", i).as_bytes()).unwrap(); + if i % 2 == 0 { + storage.put(b"1", format!("v{}", i).as_bytes()).unwrap(); + } else { + storage.delete(b"1").unwrap(); + } + if i % 2 == 1 { + storage.put(b"2", format!("v{}", i).as_bytes()).unwrap(); + } else { + storage.delete(b"2").unwrap(); + } + storage + .inner + .force_freeze_memtable(&storage.inner.state_lock.lock()) + .unwrap(); + } + storage.close().unwrap(); + // ensure all SSTs are flushed + 
assert!(storage.inner.state.read().memtable.is_empty()); + assert!(storage.inner.state.read().imm_memtables.is_empty()); + storage.dump_structure(); + drop(storage); + dump_files_in_dir(&dir); + + let storage = MiniLsm::open( + &dir, + LsmStorageOptions::default_for_week2_test(compaction_options.clone()), + ) + .unwrap(); + assert_eq!(&storage.get(b"0").unwrap().unwrap()[..], b"v20".as_slice()); + assert_eq!(&storage.get(b"1").unwrap().unwrap()[..], b"v20".as_slice()); + assert_eq!(storage.get(b"2").unwrap(), None); +} From a83162edf90bfbfe1d50d2510a4b6ff012c0bb7e Mon Sep 17 00:00:00 2001 From: husharp Date: Thu, 15 Feb 2024 21:01:13 +0800 Subject: [PATCH 13/22] week2_day6 add wal Signed-off-by: husharp --- mini-lsm-starter/src/compact.rs | 2 +- mini-lsm-starter/src/lsm_storage.rs | 119 ++++++++++++++++++----- mini-lsm-starter/src/manifest.rs | 4 +- mini-lsm-starter/src/mem_table.rs | 26 +++-- mini-lsm-starter/src/tests.rs | 1 + mini-lsm-starter/src/tests/week2_day6.rs | 77 +++++++++++++++ mini-lsm-starter/src/wal.rs | 64 +++++++++--- 7 files changed, 245 insertions(+), 48 deletions(-) create mode 100644 mini-lsm-starter/src/tests/week2_day6.rs diff --git a/mini-lsm-starter/src/compact.rs b/mini-lsm-starter/src/compact.rs index 5624fd883..008622336 100644 --- a/mini-lsm-starter/src/compact.rs +++ b/mini-lsm-starter/src/compact.rs @@ -255,7 +255,7 @@ impl LsmStorageInner { for (_, tier_sst_ids) in tiers { let mut ssts = Vec::with_capacity(tier_sst_ids.len()); for id in tier_sst_ids.iter() { - ssts.push(snapshot.sstables.get(id).unwrap().clone()); + ssts.push(snapshot.sstables[id].clone()); } let iter = SstConcatIterator::create_and_seek_to_first(ssts)?; iters.push(Box::new(iter)); diff --git a/mini-lsm-starter/src/lsm_storage.rs b/mini-lsm-starter/src/lsm_storage.rs index 384a1baa0..8fadf4ef9 100644 --- a/mini-lsm-starter/src/lsm_storage.rs +++ b/mini-lsm-starter/src/lsm_storage.rs @@ -1,6 +1,6 @@ #![allow(dead_code)] // REMOVE THIS LINE after fully implementing 
this functionality -use std::collections::HashMap; +use std::collections::{BTreeSet, HashMap}; use std::fs::File; use std::ops::Bound; use std::path::{Path, PathBuf}; @@ -167,6 +167,13 @@ impl MiniLsm { handle.join().ok(); } + // sync wal + if self.inner.options.enable_wal { + self.inner.sync()?; + self.inner.sync_dir()?; + return Ok(()); + } + // flush memtable and imm_memtables if !self.inner.state.read().memtable.is_empty() { self.inner @@ -259,7 +266,8 @@ impl LsmStorageInner { pub(crate) fn open(path: impl AsRef, options: LsmStorageOptions) -> Result { let path = path.as_ref(); if !path.exists() { - std::fs::create_dir_all(path).context("failed to create DB directory")?; + std::fs::create_dir_all(path) + .context("[LsmStorageInner.open] failed to create DB directory")?; } let block_cache = Arc::new(BlockCache::new(1 << 20)); // 4GB block cache, @@ -283,12 +291,24 @@ impl LsmStorageInner { // recover from MANIFEST, `/MANIFEST` let manifest_path = path.join("MANIFEST"); if !manifest_path.exists() { - manifest = Manifest::create(&manifest_path).context("failed to create manifest")?; + if options.enable_wal { + let memtable = Arc::new(MemTable::create_with_wal( + state.memtable.id(), + Self::path_of_wal_static(path, state.memtable.id()), + )?); + state.memtable = memtable; + } + manifest = Manifest::create(&manifest_path) + .context("[LsmStorageInner.open] failed to create manifest")?; + manifest.add_record_when_init(ManifestRecord::NewMemtable(state.memtable.id()))?; } else { let (m, records) = Manifest::recover(manifest_path)?; + let mut memtables = BTreeSet::new(); for record in records { match record { ManifestRecord::Flush(sst_id) => { + let res = memtables.remove(&sst_id); + assert!(res, "memtable not exist?"); if compaction_controller.flush_to_l0() { state.l0_sstables.insert(0, sst_id); } else { @@ -296,7 +316,10 @@ impl LsmStorageInner { } next_sst_id = next_sst_id.max(sst_id); } - ManifestRecord::NewMemtable(_) => unimplemented!(), + 
ManifestRecord::NewMemtable(x) => { + next_sst_id = next_sst_id.max(x); + memtables.insert(x); + } ManifestRecord::Compaction(task, new_ssts) => { let (new_state, _) = compaction_controller.apply_compaction_result(&state, &task, &new_ssts); @@ -305,8 +328,14 @@ impl LsmStorageInner { } } } + // recover SSTs - for &sst_id in state.l0_sstables.iter() { + let mut sst_cnt = 0; + for &sst_id in state + .l0_sstables + .iter() + .chain(state.levels.iter().flat_map(|(_, files)| files)) + { let sst = SsTable::open( sst_id, Some(block_cache.clone()), @@ -315,18 +344,31 @@ impl LsmStorageInner { )?; state.sstables.insert(sst_id, Arc::new(sst)); next_sst_id = next_sst_id.max(sst_id); + sst_cnt += 1; } - for &sst_id in state.levels.iter().flat_map(|(_, ssts)| ssts.iter()) { - let sst = SsTable::open( - sst_id, - Some(block_cache.clone()), - FileObject::open(&Self::path_of_sst_static(path, sst_id)) - .context("failed to open SST")?, - )?; - state.sstables.insert(sst_id, Arc::new(sst)); - next_sst_id = next_sst_id.max(sst_id); + println!("recovered {} SSTs", sst_cnt); + next_sst_id += 1; + + // recover memtables + if options.enable_wal { + let mut wal_cnt = 0; + for id in memtables { + let memtable = + MemTable::recover_from_wal(id, Self::path_of_wal_static(path, id))?; + if !memtable.is_empty() { + state.imm_memtables.insert(0, Arc::new(memtable)); + wal_cnt += 1; + } + } + println!("recovered {} memtables from WAL", wal_cnt); + state.memtable = Arc::new(MemTable::create_with_wal( + next_sst_id, + Self::path_of_wal_static(path, next_sst_id), + )?); + } else { + state.memtable = Arc::new(MemTable::create(next_sst_id)); } - state.memtable = Arc::new(MemTable::create(next_sst_id)); + m.add_record_when_init(ManifestRecord::NewMemtable(next_sst_id))?; next_sst_id += 1; manifest = m; @@ -350,7 +392,7 @@ impl LsmStorageInner { } pub fn sync(&self) -> Result<()> { - unimplemented!() + self.state.read().memtable.sync_wal() } pub fn try_freeze(&self, size: usize) -> Result<()> { @@ 
-393,16 +435,33 @@ impl LsmStorageInner { } /// Force freeze the current memtable to an immutable memtable - pub fn force_freeze_memtable(&self, _state_lock_observer: &MutexGuard<'_, ()>) -> Result<()> { - let new_memtable = Arc::new(MemTable::create(self.next_sst_id())); + pub fn force_freeze_memtable(&self, state_lock_observer: &MutexGuard<'_, ()>) -> Result<()> { + let new_memtable_id = self.next_sst_id(); + let new_memtable = if self.options.enable_wal { + Arc::new(MemTable::create_with_wal( + new_memtable_id, + self.path_of_wal(new_memtable_id), + )?) + } else { + Arc::new(MemTable::create(new_memtable_id)) + }; + { let mut guard = self.state.write(); let mut snapshot = guard.as_ref().clone(); let old_memtable = std::mem::replace(&mut snapshot.memtable, new_memtable); // imm_memtables.first() should be the last frozen memtable - snapshot.imm_memtables.insert(0, old_memtable); + snapshot.imm_memtables.insert(0, old_memtable.clone()); *guard = Arc::new(snapshot); + drop(guard); + old_memtable.sync_wal()?; } + + // add NewMemtable record to manifest + self.manifest.as_ref().unwrap().add_record( + state_lock_observer, + ManifestRecord::NewMemtable(new_memtable_id), + )?; Ok(()) } @@ -423,12 +482,14 @@ impl LsmStorageInner { // build new sstable let mut builder = SsTableBuilder::new(self.options.block_size); last_imm_memtable.flush(&mut builder)?; + let sst_id = last_imm_memtable.id(); let new_sst = Arc::new(builder.build( - last_imm_memtable.id(), + sst_id, Some(self.block_cache.clone()), - self.path_of_sst(last_imm_memtable.id()), + self.path_of_sst(sst_id), )?); + // Add the new SST to the storage { let mut guard = self.state.write(); let mut snapshot = guard.as_ref().clone(); @@ -446,11 +507,15 @@ impl LsmStorageInner { *guard = Arc::new(snapshot); } + if self.options.enable_wal { + std::fs::remove_file(self.path_of_wal(sst_id))?; + } + // update manifest self.manifest .as_ref() .unwrap() - .add_record(&state_lock, 
ManifestRecord::Flush(last_imm_memtable.id()))?; + .add_record(&state_lock, ManifestRecord::Flush(sst_id))?; Ok(()) } @@ -460,7 +525,7 @@ impl LsmStorageInner { Ok(()) } - /// Get a key from the storage. In day 7, this can be further optimized by using a bloom filter. + /// Get a key from the storage. In week1 day7, this can be further optimized by using a bloom filter. pub fn get(&self, key: &[u8]) -> Result> { let snapshot = self.state.read(); // search memtable firstly @@ -499,8 +564,8 @@ impl LsmStorageInner { // create merge iterator for l0_sstables let mut l0_iters = Vec::with_capacity(snapshot.l0_sstables.len()); for sst_id in snapshot.l0_sstables.iter() { - let sst = snapshot.sstables.get(sst_id).unwrap(); - if check_sst(key, sst) { + let sst = snapshot.sstables[sst_id].clone(); + if check_sst(key, &sst) { let iter = SsTableIterator::create_and_seek_to_key( sst.clone(), KeySlice::from_slice(key), @@ -515,8 +580,8 @@ impl LsmStorageInner { for (_, level_sst_ids) in &snapshot.levels { let mut ssts = Vec::with_capacity(level_sst_ids.len()); for sst_id in level_sst_ids { - let sst = snapshot.sstables.get(sst_id).unwrap(); - if check_sst(key, sst) { + let sst = snapshot.sstables[sst_id].clone(); + if check_sst(key, &sst) { ssts.push(sst.clone()); } } diff --git a/mini-lsm-starter/src/manifest.rs b/mini-lsm-starter/src/manifest.rs index a8428b5df..15f759c96 100644 --- a/mini-lsm-starter/src/manifest.rs +++ b/mini-lsm-starter/src/manifest.rs @@ -1,5 +1,3 @@ -#![allow(dead_code)] // REMOVE THIS LINE after fully implementing this functionality - use std::fs::OpenOptions; use std::io::Read; use std::path::Path; @@ -33,7 +31,7 @@ impl Manifest { .create_new(true) .write(true) .open(path) - .context("failed to create manifest")?, + .context("[manifest.create] failed to create manifest")?, )), }) } diff --git a/mini-lsm-starter/src/mem_table.rs b/mini-lsm-starter/src/mem_table.rs index 2f3d8d1cc..0c8a6be30 100644 --- a/mini-lsm-starter/src/mem_table.rs +++ 
b/mini-lsm-starter/src/mem_table.rs @@ -39,23 +39,34 @@ pub(crate) fn map_bound(bound: Bound<&[u8]>) -> Bound { impl MemTable { /// Create a new mem-table. - pub fn create(_id: usize) -> Self { + pub fn create(id: usize) -> Self { Self { map: Arc::new(SkipMap::new()), wal: None, - id: _id, + id, approximate_size: Arc::new(AtomicUsize::new(0)), } } /// Create a new mem-table with WAL - pub fn create_with_wal(_id: usize, _path: impl AsRef) -> Result { - unimplemented!() + pub fn create_with_wal(id: usize, path: impl AsRef) -> Result { + Ok(Self { + map: Arc::new(SkipMap::new()), + wal: Some(Wal::create(path)?), + id, + approximate_size: Arc::new(AtomicUsize::new(0)), + }) } /// Create a memtable from WAL - pub fn recover_from_wal(_id: usize, _path: impl AsRef) -> Result { - unimplemented!() + pub fn recover_from_wal(id: usize, path: impl AsRef) -> Result { + let map = Arc::new(SkipMap::new()); + Ok(Self { + wal: Some(Wal::recover(path, &map)?), + map, + id, + approximate_size: Arc::new(AtomicUsize::new(0)), + }) } pub fn for_testing_put_slice(&self, key: &[u8], value: &[u8]) -> Result<()> { @@ -90,6 +101,9 @@ impl MemTable { ); self.map .insert(Bytes::copy_from_slice(key), Bytes::copy_from_slice(value)); + if let Some(ref wal) = self.wal { + wal.put(key, value)?; + } Ok(()) } diff --git a/mini-lsm-starter/src/tests.rs b/mini-lsm-starter/src/tests.rs index 7c6ba8e96..0a30b3207 100644 --- a/mini-lsm-starter/src/tests.rs +++ b/mini-lsm-starter/src/tests.rs @@ -14,3 +14,4 @@ mod week2_day2; mod week2_day3; mod week2_day4; mod week2_day5; +mod week2_day6; diff --git a/mini-lsm-starter/src/tests/week2_day6.rs b/mini-lsm-starter/src/tests/week2_day6.rs new file mode 100644 index 000000000..befd10059 --- /dev/null +++ b/mini-lsm-starter/src/tests/week2_day6.rs @@ -0,0 +1,77 @@ +use tempfile::tempdir; + +use crate::{ + compact::{ + CompactionOptions, LeveledCompactionOptions, SimpleLeveledCompactionOptions, + TieredCompactionOptions, + }, + lsm_storage::{LsmStorageOptions, 
MiniLsm}, + tests::harness::dump_files_in_dir, +}; + +#[test] +fn test_integration_leveled() { + test_integration(CompactionOptions::Leveled(LeveledCompactionOptions { + level_size_multiplier: 2, + level0_file_num_compaction_trigger: 2, + max_levels: 3, + base_level_size_mb: 1, + })) +} + +#[test] +fn test_integration_tiered() { + test_integration(CompactionOptions::Tiered(TieredCompactionOptions { + num_tiers: 3, + max_size_amplification_percent: 200, + size_ratio: 1, + min_merge_width: 3, + })) +} + +#[test] +fn test_integration_simple() { + test_integration(CompactionOptions::Simple(SimpleLeveledCompactionOptions { + size_ratio_percent: 200, + level0_file_num_compaction_trigger: 2, + max_levels: 3, + })); +} + +fn test_integration(compaction_options: CompactionOptions) { + let dir = tempdir().unwrap(); + let mut options = LsmStorageOptions::default_for_week2_test(compaction_options); + options.enable_wal = true; + let storage = MiniLsm::open(&dir, options.clone()).unwrap(); + for i in 0..=20 { + storage.put(b"0", format!("v{}", i).as_bytes()).unwrap(); + if i % 2 == 0 { + storage.put(b"1", format!("v{}", i).as_bytes()).unwrap(); + } else { + storage.delete(b"1").unwrap(); + } + if i % 2 == 1 { + storage.put(b"2", format!("v{}", i).as_bytes()).unwrap(); + } else { + storage.delete(b"2").unwrap(); + } + storage + .inner + .force_freeze_memtable(&storage.inner.state_lock.lock()) + .unwrap(); + } + storage.close().unwrap(); + // ensure some SSTs are not flushed + assert!( + !storage.inner.state.read().memtable.is_empty() + || !storage.inner.state.read().imm_memtables.is_empty() + ); + storage.dump_structure(); + drop(storage); + dump_files_in_dir(&dir); + + let storage = MiniLsm::open(&dir, options).unwrap(); + assert_eq!(&storage.get(b"0").unwrap().unwrap()[..], b"v20".as_slice()); + assert_eq!(&storage.get(b"1").unwrap().unwrap()[..], b"v20".as_slice()); + assert_eq!(storage.get(b"2").unwrap(), None); +} diff --git a/mini-lsm-starter/src/wal.rs 
b/mini-lsm-starter/src/wal.rs index 43870b57e..baa1aab45 100644 --- a/mini-lsm-starter/src/wal.rs +++ b/mini-lsm-starter/src/wal.rs @@ -1,34 +1,76 @@ #![allow(dead_code)] // REMOVE THIS LINE after fully implementing this functionality -use std::fs::File; -use std::io::BufWriter; +use std::fs::{File, OpenOptions}; +use std::io::{BufWriter, Read, Write}; use std::path::Path; use std::sync::Arc; -use anyhow::Result; -use bytes::Bytes; +use anyhow::{Context, Result}; +use bytes::{Buf, BufMut, Bytes}; use crossbeam_skiplist::SkipMap; use parking_lot::Mutex; +// | key_len | key | value_len | value | #[derive(Debug)] pub struct Wal { file: Arc>>, } impl Wal { - pub fn create(_path: impl AsRef) -> Result { - unimplemented!() + pub fn create(path: impl AsRef) -> Result { + Ok(Self { + file: Arc::new(Mutex::new(BufWriter::new( + OpenOptions::new() + .read(true) + .create_new(true) + .write(true) + .open(path) + .context("[WAL.create] failed to create WAL")?, + ))), + }) } - pub fn recover(_path: impl AsRef, _skiplist: &SkipMap) -> Result { - unimplemented!() + pub fn recover(path: impl AsRef, skiplist: &SkipMap) -> Result { + let path = path.as_ref(); + let mut file = OpenOptions::new() + .read(true) + .append(true) + .open(path) + .context("[WAL.recover] failed to recover from WAL")?; + let mut buf = Vec::new(); + file.read_to_end(&mut buf)?; + // read buf to insert into skiplist + let mut buf = buf.as_slice(); + while !buf.is_empty() { + let key_len = buf.get_u16() as usize; + let key = Bytes::copy_from_slice(&buf[..key_len]); + buf.advance(key_len); + let value_len = buf.get_u16() as usize; + let value = Bytes::copy_from_slice(&buf[..value_len]); + buf.advance(value_len); + skiplist.insert(key, value); + } + Ok(Self { + file: Arc::new(Mutex::new(BufWriter::new(file))), + }) } - pub fn put(&self, _key: &[u8], _value: &[u8]) -> Result<()> { - unimplemented!() + pub fn put(&self, key: &[u8], value: &[u8]) -> Result<()> { + let mut file = self.file.lock(); + let mut buf: Vec 
= + Vec::with_capacity(key.len() + value.len() + std::mem::size_of::()); + buf.put_u16(key.len() as u16); + buf.put_slice(key); + buf.put_u16(value.len() as u16); + buf.put_slice(value); + file.write_all(&buf)?; + Ok(()) } pub fn sync(&self) -> Result<()> { - unimplemented!() + let mut file = self.file.lock(); + file.flush()?; + file.get_mut().sync_all()?; + Ok(()) } } From 8482cb6bded0c226e17714013af2b2d219644156 Mon Sep 17 00:00:00 2001 From: husharp Date: Fri, 23 Feb 2024 17:40:55 +0800 Subject: [PATCH 14/22] add check sum Signed-off-by: husharp --- Cargo.lock | 1 + mini-lsm-starter/Cargo.toml | 1 + mini-lsm-starter/src/lsm_storage.rs | 55 ++++++++++++++++----------- mini-lsm-starter/src/manifest.rs | 23 +++++++++-- mini-lsm-starter/src/table.rs | 32 +++++++++++++--- mini-lsm-starter/src/table/bloom.rs | 16 ++++++-- mini-lsm-starter/src/table/builder.rs | 19 +++++---- mini-lsm-starter/src/wal.rs | 24 ++++++++++-- 8 files changed, 127 insertions(+), 44 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index fa83e9465..1a761d952 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -527,6 +527,7 @@ dependencies = [ "arc-swap", "bytes", "clap", + "crc32fast", "crossbeam-channel", "crossbeam-epoch", "crossbeam-skiplist", diff --git a/mini-lsm-starter/Cargo.toml b/mini-lsm-starter/Cargo.toml index e84ac93ff..d824d0300 100644 --- a/mini-lsm-starter/Cargo.toml +++ b/mini-lsm-starter/Cargo.toml @@ -21,6 +21,7 @@ serde = { version = "1.0", features = ["derive"] } farmhash = "1" nom = "7.1.3" rustyline = "13.0.0" +crc32fast = "1.3.2" [dev-dependencies] tempfile = "3" diff --git a/mini-lsm-starter/src/lsm_storage.rs b/mini-lsm-starter/src/lsm_storage.rs index 8fadf4ef9..edcb1b1fb 100644 --- a/mini-lsm-starter/src/lsm_storage.rs +++ b/mini-lsm-starter/src/lsm_storage.rs @@ -697,36 +697,47 @@ impl LsmStorageInner { } /// Write a batch of data into the storage. Implement in week 2 day 7. 
- pub fn write_batch>(&self, _batch: &[WriteBatchRecord]) -> Result<()> { - unimplemented!() + pub fn write_batch>(&self, batch: &[WriteBatchRecord]) -> Result<()> { + for record in batch { + match record { + WriteBatchRecord::Put(key, value) => { + let size; + { + let key = key.as_ref(); + let value = value.as_ref(); + assert!(!key.is_empty(), "key cannot be empty"); + assert!(!value.is_empty(), "value cannot be empty"); + let snapshot = self.state.write(); + snapshot.memtable.put(key, value)?; + size = snapshot.memtable.approximate_size(); + } + + self.try_freeze(size)?; + } + WriteBatchRecord::Del(key) => { + let size; + { + let key = key.as_ref(); + let snapshot = self.state.write(); + snapshot.memtable.put(key, b"")?; + size = snapshot.memtable.approximate_size(); + } + + self.try_freeze(size)?; + } + } + } + Ok(()) } /// Put a key-value pair into the storage by writing into the current memtable. pub fn put(&self, key: &[u8], value: &[u8]) -> Result<()> { - let size; - { - let snapshot = self.state.write(); - snapshot.memtable.put(key, value)?; - size = snapshot.memtable.approximate_size(); - } - - self.try_freeze(size)?; - - Ok(()) + self.write_batch(&[WriteBatchRecord::Put(key, value)]) } /// Remove a key from the storage by writing an empty value. 
pub fn delete(&self, key: &[u8]) -> Result<()> { - let size; - { - let snapshot = self.state.write(); - snapshot.memtable.put(key, b"")?; - size = snapshot.memtable.approximate_size(); - } - - self.try_freeze(size)?; - - Ok(()) + self.write_batch(&[WriteBatchRecord::Del(key)]) } } diff --git a/mini-lsm-starter/src/manifest.rs b/mini-lsm-starter/src/manifest.rs index 15f759c96..891fae727 100644 --- a/mini-lsm-starter/src/manifest.rs +++ b/mini-lsm-starter/src/manifest.rs @@ -5,12 +5,13 @@ use std::sync::Arc; use std::{fs::File, io::Write}; use anyhow::{Context, Result}; +use bytes::{Buf, BufMut}; use parking_lot::{Mutex, MutexGuard}; use serde::{Deserialize, Serialize}; use crate::compact::CompactionTask; -// | JSON record | JSON record | JSON record | JSON record | +// | len | JSON record | checksum | len | JSON record | checksum | len | JSON record | checksum | pub struct Manifest { file: Arc>, } @@ -44,8 +45,18 @@ impl Manifest { .context("failed to recover manifest")?; let mut buf = Vec::new(); file.read_to_end(&mut buf)?; - let stream = serde_json::Deserializer::from_slice(&buf).into_iter::(); - let records = stream.collect::, _>>()?; + // need to check all records checksum + let mut stream = buf.as_slice(); + let mut records = Vec::new(); + while stream.has_remaining() { + let len = stream.get_u64() as usize; + let record = stream.copy_to_bytes(len); + let checksum = stream.get_u32(); + if crc32fast::hash(&record) != checksum { + return Err(anyhow::anyhow!("Manifest record checksum mismatch")); + } + records.push(serde_json::from_slice(&record)?); + } Ok(( Self { file: Arc::new(Mutex::new(file)), @@ -64,7 +75,11 @@ impl Manifest { pub fn add_record_when_init(&self, record: ManifestRecord) -> Result<()> { let mut file = self.file.lock(); - let buf = serde_json::to_vec(&record)?; + let mut buf = serde_json::to_vec(&record)?; + file.write_all(&(buf.len() as u64).to_be_bytes())?; + // add checksum + let checksum = crc32fast::hash(&buf); + buf.put_u32(checksum); 
file.write_all(&buf)?; file.sync_all()?; Ok(()) diff --git a/mini-lsm-starter/src/table.rs b/mini-lsm-starter/src/table.rs index 497e03b34..937017959 100644 --- a/mini-lsm-starter/src/table.rs +++ b/mini-lsm-starter/src/table.rs @@ -43,6 +43,8 @@ impl BlockMeta { /// in order to help keep track of `first_key` when decoding from the same buffer in the future. pub fn encode_block_metas(block_meta: &[BlockMeta], buf: &mut Vec) { let original_len = buf.len(); + let meta_len = block_meta.len(); + buf.put_u32(block_meta.len() as u32); for meta in block_meta { buf.put_u32(meta.offset as u32); buf.put_u16(meta.first_key.len() as u16); @@ -50,12 +52,19 @@ impl BlockMeta { buf.put_u16(meta.last_key.len() as u16); buf.put_slice(meta.last_key.raw_ref()); } + // add checksum + let checksum = crc32fast::hash(&buf[original_len + 4..]); + buf.put_u32(checksum); } /// Decode block meta from a buffer. - pub fn decode_block_metas(mut buf: impl Buf) -> Result> { + pub fn decode_block_metas(mut buf: &[u8]) -> Result> { + // get meta data len + let meta_len = buf.get_u32() as usize; + // cal checksum + let checksum = crc32fast::hash(&buf[..buf.remaining() - 4]); let mut block_meta = vec![]; - while buf.has_remaining() { + for _ in 0..meta_len { let offset = buf.get_u32() as usize; let first_key_len = buf.get_u16() as usize; let first_key = buf.copy_to_bytes(first_key_len); @@ -67,6 +76,10 @@ impl BlockMeta { last_key: KeyBytes::from_bytes(last_key), }); } + // checksum + if checksum != buf.get_u32() { + return Err(anyhow::anyhow!("BlockMeta checksum mismatch")); + } Ok(block_meta) } } @@ -187,12 +200,21 @@ impl SsTable { /// Read a block from the disk. 
pub fn read_block(&self, block_idx: usize) -> Result> { let offset = self.block_metas[block_idx].offset as u64; - let next_offset = self + let block_end = self .block_metas .get(block_idx + 1) .map_or(self.block_meta_offset, |x| x.offset) as u64; - let data = self.file.read(offset, next_offset - offset)?; - Ok(Arc::new(Block::decode(&data))) + // get all data including checksum + let block_len = block_end - offset - 4; + let data_including_checksum = self.file.read(offset, block_end - offset)?; + let checksum = (&data_including_checksum[block_len as usize..]).get_u32(); + let block_data = &data_including_checksum[..block_len as usize]; + // check checksum + if checksum != crc32fast::hash(block_data) { + return Err(anyhow::anyhow!("checksum mismatch")); + } + + Ok(Arc::new(Block::decode(block_data))) } /// Read a block from disk, with block cache. (Day 4) diff --git a/mini-lsm-starter/src/table/bloom.rs b/mini-lsm-starter/src/table/bloom.rs index fbd46ec4a..262ffd09f 100644 --- a/mini-lsm-starter/src/table/bloom.rs +++ b/mini-lsm-starter/src/table/bloom.rs @@ -1,7 +1,7 @@ // Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. use anyhow::Result; -use bytes::{BufMut, Bytes, BytesMut}; +use bytes::{Buf, BufMut, Bytes, BytesMut}; /// Bloom implements bloom filter functionalities over /// a bit-slice of data. 
@@ -48,8 +48,14 @@ impl> BitSliceMut for T { impl Bloom { /// Decode a bloom filter pub fn decode(buf: &[u8]) -> Result { - let filter = &buf[..buf.len() - 1]; - let k = buf[buf.len() - 1]; + // get checksum as 4 bytes + let checksum = (&buf[buf.len() - 4..buf.len()]).get_u32(); + let all_data = &buf[..buf.len() - 4]; + if checksum != crc32fast::hash(all_data) { + return Err(anyhow::anyhow!("Bloom filter checksum mismatch")); + } + let filter = &buf[..buf.len() - 1 - 4]; + let k = buf[buf.len() - 1 - 4]; Ok(Self { filter: filter.to_vec().into(), k, @@ -58,8 +64,12 @@ impl Bloom { /// Encode a bloom filter pub fn encode(&self, buf: &mut Vec) { + let offset = buf.len(); buf.extend(&self.filter); buf.put_u8(self.k); + // add checksum + let checksum = crc32fast::hash(buf[offset..].as_ref()); + buf.put_u32(checksum); } /// Get bloom filter bits per key from entries count and FPR diff --git a/mini-lsm-starter/src/table/builder.rs b/mini-lsm-starter/src/table/builder.rs index b545dfd96..065038325 100644 --- a/mini-lsm-starter/src/table/builder.rs +++ b/mini-lsm-starter/src/table/builder.rs @@ -23,12 +23,12 @@ pub struct SsTableBuilder { } /* ------------------------------------------------------------------------------------------------------ -| Block Section | Meta Section | ------------------------------------------------------------------------------------------------------ -| data block | ... 
| data block | metadata | meta block offset | bloom filter | bloom filter offset | -| | varlen | u32 | varlen | u32 | ------------------------------------------------------------------------------------------------------ +--------------------------------------------------------------------------------------------------------------------------------------------------------------- +| Block Section | Meta Section | +--------------------------------------------------------------------------------------------------------------------------------------------------------------- +| data block | checksum | ... | data block | checksum | no. of block | metadata | checksum | meta block offset | bloom filter | checksum | bloom filter offset | +| varlen | u32 | | varlen | u32 | u32 | varlen | u32 | u32 | varlen | u32 | u32 | +--------------------------------------------------------------------------------------------------------------------------------------------------------------- */ impl SsTableBuilder { /// Create a builder based on target block size. @@ -78,7 +78,12 @@ impl SsTableBuilder { first_key: std::mem::take(&mut self.first_key).into_key_bytes(), last_key: std::mem::take(&mut self.last_key).into_key_bytes(), }); - self.data.extend_from_slice(&block.build().encode()); + + let encode_data = block.build().encode(); + self.data.extend_from_slice(&encode_data); + // add checksum + let checksum = crc32fast::hash(&encode_data); + self.data.put_u32(checksum); } /// Get the estimated size of the SSTable. 
diff --git a/mini-lsm-starter/src/wal.rs b/mini-lsm-starter/src/wal.rs index baa1aab45..bd043328b 100644 --- a/mini-lsm-starter/src/wal.rs +++ b/mini-lsm-starter/src/wal.rs @@ -1,6 +1,7 @@ #![allow(dead_code)] // REMOVE THIS LINE after fully implementing this functionality use std::fs::{File, OpenOptions}; +use std::hash::Hasher; use std::io::{BufWriter, Read, Write}; use std::path::Path; use std::sync::Arc; @@ -10,7 +11,7 @@ use bytes::{Buf, BufMut, Bytes}; use crossbeam_skiplist::SkipMap; use parking_lot::Mutex; -// | key_len | key | value_len | value | +// | key_len | key | value_len | value | checksum | #[derive(Debug)] pub struct Wal { file: Arc>>, @@ -42,12 +43,21 @@ impl Wal { // read buf to insert into skiplist let mut buf = buf.as_slice(); while !buf.is_empty() { + let mut hasher = crc32fast::Hasher::new(); let key_len = buf.get_u16() as usize; + hasher.write_u16(key_len as u16); let key = Bytes::copy_from_slice(&buf[..key_len]); + hasher.write(&key); buf.advance(key_len); let value_len = buf.get_u16() as usize; + hasher.write_u16(value_len as u16); let value = Bytes::copy_from_slice(&buf[..value_len]); + hasher.write(&value); buf.advance(value_len); + let checksum = buf.get_u32(); + if checksum != hasher.finalize() { + return Err(anyhow::anyhow!("WAL checksum mismatch")); + } skiplist.insert(key, value); } Ok(Self { @@ -57,12 +67,20 @@ impl Wal { pub fn put(&self, key: &[u8], value: &[u8]) -> Result<()> { let mut file = self.file.lock(); - let mut buf: Vec = - Vec::with_capacity(key.len() + value.len() + std::mem::size_of::()); + let mut buf: Vec = Vec::with_capacity( + key.len() + value.len() + std::mem::size_of::() + std::mem::size_of::(), + ); + // use a crc32fast::Hasher to compute the checksum incrementally on each field. 
+ let mut hasher = crc32fast::Hasher::new(); buf.put_u16(key.len() as u16); + hasher.write_u16(key.len() as u16); buf.put_slice(key); + hasher.write(key); buf.put_u16(value.len() as u16); + hasher.write_u16(value.len() as u16); buf.put_slice(value); + hasher.write(value); + buf.put_u32(hasher.finalize()); file.write_all(&buf)?; Ok(()) } From 81ab3b07fa64ac9fa4f3b66843feed719e4a923f Mon Sep 17 00:00:00 2001 From: husharp Date: Sat, 24 Feb 2024 11:05:50 +0800 Subject: [PATCH 15/22] add ts for key Signed-off-by: husharp --- mini-lsm-starter/src/block.rs | 10 +-- mini-lsm-starter/src/block/builder.rs | 23 +++-- mini-lsm-starter/src/block/iterator.rs | 6 +- mini-lsm-starter/src/key.rs | 103 +++++++++++++++-------- mini-lsm-starter/src/lsm_iterator.rs | 6 +- mini-lsm-starter/src/lsm_storage.rs | 44 +++++----- mini-lsm-starter/src/mem_table.rs | 10 +-- mini-lsm-starter/src/table.rs | 22 +++-- mini-lsm-starter/src/table/builder.rs | 2 +- mini-lsm-starter/src/tests.rs | 1 + mini-lsm-starter/src/tests/week3_day1.rs | 54 ++++++++++++ 11 files changed, 193 insertions(+), 88 deletions(-) create mode 100644 mini-lsm-starter/src/tests/week3_day1.rs diff --git a/mini-lsm-starter/src/block.rs b/mini-lsm-starter/src/block.rs index 0dd37d730..0f907adc7 100644 --- a/mini-lsm-starter/src/block.rs +++ b/mini-lsm-starter/src/block.rs @@ -1,6 +1,3 @@ -#![allow(unused_variables)] // TODO(you): remove this lint after implementing this mod -#![allow(dead_code)] // TODO(you): remove this lint after implementing this mod - mod builder; mod iterator; @@ -52,8 +49,9 @@ impl Block { // redundant key is 0 let mut entry = &self.data[0..]; entry.get_u16(); - let key_len = entry.get_u16(); - let key = &entry[..key_len as usize]; - KeyVec::from_vec(key.to_vec()) + let key_len = entry.get_u16() as usize; + let key = &entry[..key_len]; + entry.advance(key_len); + KeyVec::from_vec_with_ts(key.to_vec(), entry.get_u64()) } } diff --git a/mini-lsm-starter/src/block/builder.rs 
b/mini-lsm-starter/src/block/builder.rs index b20662fa0..96698cb4d 100644 --- a/mini-lsm-starter/src/block/builder.rs +++ b/mini-lsm-starter/src/block/builder.rs @@ -25,8 +25,8 @@ pub struct BlockBuilder { fn compute_redundant_key(first_key: KeySlice, cur_key: KeySlice) -> usize { let mut i = 0; - while i < first_key.len() && i < cur_key.len() { - if first_key.raw_ref()[i] == cur_key.raw_ref()[i] { + while i < first_key.key_len() && i < cur_key.key_len() { + if first_key.key_ref()[i] == cur_key.key_ref()[i] { i += 1; } else { break; @@ -46,18 +46,23 @@ impl BlockBuilder { } } + fn estimated_size(&self) -> usize { + SIZEOF_U16 /* number of key-value pairs in the block */ + self.offsets.len() * SIZEOF_U16 /* offsets */ + self.data.len() + // key-value pairs + } + /* Each entry is a key-value pair. ----------------------------------------------------------------------- | Entry #1 | ... | ----------------------------------------------------------------------- - | redundant_key(first_key_index) | key_len (2B) | key (keylen) | value_len (2B) | value (varlen) | ... | + | redundant_key(first_key_index) | key_len (2B) | key (keylen) | timestamp (u64) | value_len (2B) | value (varlen) | ... | ----------------------------------------------------------------------- */ /// Adds a key-value pair to the block. Returns false when the block is full. 
#[must_use] pub fn add(&mut self, key: KeySlice, value: &[u8]) -> bool { - if self.data.len() + self.offsets.len() + SIZEOF_U16 /* num of elements */ - + key.len() + value.len() + SIZEOF_U16 /* key len */ + SIZEOF_U16 /* value len */ + if self.estimated_size() + SIZEOF_U16 /* num of elements */ + + key.raw_len() + value.len() + SIZEOF_U16 /* key len */ + SIZEOF_U16 /* value len */ > self.block_size && !self.is_empty() { @@ -70,9 +75,11 @@ impl BlockBuilder { let redundant_index = compute_redundant_key(self.first_key.as_key_slice(), key); self.data.put_u16(redundant_index as u16); // rest of the key - self.data.put_u16((key.len() - redundant_index) as u16); - self.data.put(&key.raw_ref()[redundant_index..]); - + self.data.put_u16((key.key_len() - redundant_index) as u16); + self.data.put(&key.key_ref()[redundant_index..]); + // timestamp + self.data.put_u64(key.ts()); + // value self.data.put_u16(value.len() as u16); self.data.put(value); diff --git a/mini-lsm-starter/src/block/iterator.rs b/mini-lsm-starter/src/block/iterator.rs index eea23fd62..20acd8fc1 100644 --- a/mini-lsm-starter/src/block/iterator.rs +++ b/mini-lsm-starter/src/block/iterator.rs @@ -90,13 +90,17 @@ impl BlockIterator { let key_len = entry.get_u16() as usize; let key = &entry[..key_len]; self.key.clear(); - self.key.append(&self.first_key.raw_ref()[..redundant_len]); + self.key.append(&self.first_key.key_ref()[..redundant_len]); self.key.append(key); // move the entry ptr to the begin of the value entry.advance(key_len); + // set timestamp + let ts = entry.get_u64(); + self.key.set_ts(ts); let value_len = entry.get_u16() as usize; let value_offset_begin = offset + SIZEOF_U16*2 /* redundant + key_len(2B) */ + + std::mem::size_of::() /* timestamp(u64) */ + key_len + SIZEOF_U16 /* value_len(2B) */; let value_offset_end = value_offset_begin + value_len; diff --git a/mini-lsm-starter/src/key.rs b/mini-lsm-starter/src/key.rs index f459ee745..b383282ce 100644 --- a/mini-lsm-starter/src/key.rs +++ 
b/mini-lsm-starter/src/key.rs @@ -1,41 +1,54 @@ -use std::fmt::Debug; +use std::{cmp::Reverse, fmt::Debug}; use bytes::Bytes; -pub const TS_ENABLED: bool = false; - -pub struct Key>(T); +pub struct Key>(T, u64); pub type KeySlice<'a> = Key<&'a [u8]>; pub type KeyVec = Key>; pub type KeyBytes = Key; +/// For testing purpose, should not use anywhere in your implementation. +pub const TS_ENABLED: bool = true; + +/// Temporary, should remove after implementing full week 3 day 1 + 2. +pub const TS_DEFAULT: u64 = 0; + +pub const TS_MAX: u64 = std::u64::MAX; +pub const TS_MIN: u64 = std::u64::MIN; +pub const TS_RANGE_BEGIN: u64 = std::u64::MAX; +pub const TS_RANGE_END: u64 = std::u64::MIN; + impl> Key { pub fn into_inner(self) -> T { self.0 } - pub fn len(&self) -> usize { + pub fn key_len(&self) -> usize { self.0.as_ref().len() } + pub fn raw_len(&self) -> usize { + self.0.as_ref().len() + std::mem::size_of::() + } + pub fn is_empty(&self) -> bool { self.0.as_ref().is_empty() } pub fn for_testing_ts(self) -> u64 { - 0 + self.1 } } impl Key> { pub fn new() -> Self { - Self(Vec::new()) + Self(Vec::new(), TS_DEFAULT) } - /// Create a `KeyVec` from a `Vec`. Will be removed in week 3. - pub fn from_vec(key: Vec) -> Self { - Self(key) + /// Create a `KeyVec` from a `Vec` and a ts. Will be removed in week 3. + pub fn from_vec_with_ts(key: Vec, ts: u64) -> Self { + Self(key, ts) } /// Clears the key and set ts to 0. @@ -48,51 +61,66 @@ impl Key> { self.0.extend(data) } - /// Set the key from a slice without re-allocating. The signature will change in week 3. + pub fn set_ts(&mut self, ts: u64) { + self.1 = ts; + } + + /// Set the key from a slice without re-allocating. 
pub fn set_from_slice(&mut self, key_slice: KeySlice) { self.0.clear(); self.0.extend(key_slice.0); + self.1 = key_slice.1; } pub fn as_key_slice(&self) -> KeySlice { - Key(self.0.as_slice()) + Key(self.0.as_slice(), self.1) } pub fn into_key_bytes(self) -> KeyBytes { - Key(self.0.into()) + Key(self.0.into(), self.1) } - /// Always use `raw_ref` to access the key in week 1 + 2. This function will be removed in week 3. - pub fn raw_ref(&self) -> &[u8] { + pub fn key_ref(&self) -> &[u8] { self.0.as_ref() } + pub fn ts(&self) -> u64 { + self.1 + } + pub fn for_testing_key_ref(&self) -> &[u8] { self.0.as_ref() } pub fn for_testing_from_vec_no_ts(key: Vec) -> Self { - Self(key) + Self(key, TS_DEFAULT) } } impl Key { + pub fn new() -> Self { + Self(Bytes::new(), TS_DEFAULT) + } + pub fn as_key_slice(&self) -> KeySlice { - Key(&self.0) + Key(&self.0, self.1) } - /// Create a `KeyBytes` from a `Bytes`. Will be removed in week 3. - pub fn from_bytes(bytes: Bytes) -> KeyBytes { - Key(bytes) + /// Create a `KeyBytes` from a `Bytes` and a ts. + pub fn from_bytes_with_ts(bytes: Bytes, ts: u64) -> KeyBytes { + Key(bytes, ts) } - /// Always use `raw_ref` to access the key in week 1 + 2. This function will be removed in week 3. - pub fn raw_ref(&self) -> &[u8] { + pub fn key_ref(&self) -> &[u8] { self.0.as_ref() } + pub fn ts(&self) -> u64 { + self.1 + } + pub fn for_testing_from_bytes_no_ts(bytes: Bytes) -> KeyBytes { - Key(bytes) + Key(bytes, TS_DEFAULT) } pub fn for_testing_key_ref(&self) -> &[u8] { @@ -102,29 +130,32 @@ impl Key { impl<'a> Key<&'a [u8]> { pub fn to_key_vec(self) -> KeyVec { - Key(self.0.to_vec()) + Key(self.0.to_vec(), self.1) } /// Create a key slice from a slice. Will be removed in week 3. - pub fn from_slice(slice: &'a [u8]) -> Self { - Self(slice) + pub fn from_slice(slice: &'a [u8], ts: u64) -> Self { + Self(slice, ts) } - /// Always use `raw_ref` to access the key in week 1 + 2. This function will be removed in week 3. 
- pub fn raw_ref(self) -> &'a [u8] { + pub fn key_ref(self) -> &'a [u8] { self.0 } + pub fn ts(&self) -> u64 { + self.1 + } + pub fn for_testing_key_ref(self) -> &'a [u8] { self.0 } pub fn for_testing_from_slice_no_ts(slice: &'a [u8]) -> Self { - Self(slice) + Self(slice, TS_DEFAULT) } - pub fn for_testing_from_slice_with_ts(slice: &'a [u8], _ts: u64) -> Self { - Self(slice) + pub fn for_testing_from_slice_with_ts(slice: &'a [u8], ts: u64) -> Self { + Self(slice, ts) } } @@ -136,13 +167,13 @@ impl + Debug> Debug for Key { impl + Default> Default for Key { fn default() -> Self { - Self(T::default()) + Self(T::default(), TS_DEFAULT) } } impl + PartialEq> PartialEq for Key { fn eq(&self, other: &Self) -> bool { - self.0.eq(&other.0) + (self.0.as_ref(), self.1).eq(&(other.0.as_ref(), other.1)) } } @@ -150,7 +181,7 @@ impl + Eq> Eq for Key {} impl + Clone> Clone for Key { fn clone(&self) -> Self { - Self(self.0.clone()) + Self(self.0.clone(), self.1) } } @@ -158,12 +189,12 @@ impl + Copy> Copy for Key {} impl + PartialOrd> PartialOrd for Key { fn partial_cmp(&self, other: &Self) -> Option { - self.0.partial_cmp(&other.0) + (self.0.as_ref(), Reverse(self.1)).partial_cmp(&(other.0.as_ref(), Reverse(other.1))) } } impl + Ord> Ord for Key { fn cmp(&self, other: &Self) -> std::cmp::Ordering { - self.0.cmp(&other.0) + (self.0.as_ref(), Reverse(self.1)).cmp(&(other.0.as_ref(), Reverse(other.1))) } } diff --git a/mini-lsm-starter/src/lsm_iterator.rs b/mini-lsm-starter/src/lsm_iterator.rs index 5c4e68087..e9868664f 100644 --- a/mini-lsm-starter/src/lsm_iterator.rs +++ b/mini-lsm-starter/src/lsm_iterator.rs @@ -51,8 +51,8 @@ impl LsmIterator { } match self.end_bound.as_ref() { Bound::Unbounded => {} - Bound::Included(key) => self.is_valid = self.inner.key().raw_ref() <= key.as_ref(), - Bound::Excluded(key) => self.is_valid = self.inner.key().raw_ref() < key.as_ref(), + Bound::Included(key) => self.is_valid = self.inner.key().key_ref() <= key.as_ref(), + Bound::Excluded(key) => 
self.is_valid = self.inner.key().key_ref() < key.as_ref(), } Ok(()) } @@ -66,7 +66,7 @@ impl StorageIterator for LsmIterator { } fn key(&self) -> &[u8] { - self.inner.key().raw_ref() + self.inner.key().key_ref() } fn value(&self) -> &[u8] { diff --git a/mini-lsm-starter/src/lsm_storage.rs b/mini-lsm-starter/src/lsm_storage.rs index edcb1b1fb..05044f477 100644 --- a/mini-lsm-starter/src/lsm_storage.rs +++ b/mini-lsm-starter/src/lsm_storage.rs @@ -20,7 +20,7 @@ use crate::iterators::concat_iterator::SstConcatIterator; use crate::iterators::merge_iterator::MergeIterator; use crate::iterators::two_merge_iterator::TwoMergeIterator; use crate::iterators::StorageIterator; -use crate::key::KeySlice; +use crate::key::{KeySlice, TS_RANGE_BEGIN}; use crate::lsm_iterator::{FusedIterator, LsmIterator}; use crate::manifest::{Manifest, ManifestRecord}; use crate::mem_table::{map_bound, MemTable}; @@ -548,7 +548,7 @@ impl LsmStorageInner { let check_sst = |key: &[u8], sst: &SsTable| { // check if the key is within the SST's key range - if key_within(key, sst.first_key().raw_ref(), sst.last_key().raw_ref()) { + if key_within(key, sst.first_key().key_ref(), sst.last_key().key_ref()) { // bloom filter check if let Some(bloom) = &sst.bloom { if bloom.may_contain(farmhash::fingerprint32(key)) { @@ -568,7 +568,7 @@ impl LsmStorageInner { if check_sst(key, &sst) { let iter = SsTableIterator::create_and_seek_to_key( sst.clone(), - KeySlice::from_slice(key), + KeySlice::from_slice(key, TS_RANGE_BEGIN), )?; l0_iters.push(Box::new(iter)); } @@ -585,15 +585,17 @@ impl LsmStorageInner { ssts.push(sst.clone()); } } - let level_iter = - SstConcatIterator::create_and_seek_to_key(ssts, KeySlice::from_slice(key))?; + let level_iter = SstConcatIterator::create_and_seek_to_key( + ssts, + KeySlice::from_slice(key, TS_RANGE_BEGIN), + )?; level_iters.push(Box::new(level_iter)); } let merge_iter = MergeIterator::create(level_iters); let two_merge_iterator = TwoMergeIterator::create(merge_l0_sstable_iter, 
merge_iter)?; if two_merge_iterator.is_valid() - && two_merge_iterator.key() == KeySlice::from_slice(key) + && two_merge_iterator.key().key_ref() == key && !two_merge_iterator.value().is_empty() { return Ok(Some(Bytes::copy_from_slice(two_merge_iterator.value()))); @@ -631,21 +633,22 @@ impl LsmStorageInner { if check_intersect_of_range( lower, upper, - sst.first_key().raw_ref(), - sst.last_key().raw_ref(), + sst.first_key().key_ref(), + sst.last_key().key_ref(), ) { // SST iterator does not support passing an end bound to it. // Therefore, need to handle the end_bound manually in LsmIterator let iter = match lower { - Bound::Included(key) => { - SsTableIterator::create_and_seek_to_key(sst, KeySlice::from_slice(key))? - } + Bound::Included(key) => SsTableIterator::create_and_seek_to_key( + sst, + KeySlice::from_slice(key, TS_RANGE_BEGIN), + )?, Bound::Excluded(key) => { let mut iter = SsTableIterator::create_and_seek_to_key( sst, - KeySlice::from_slice(key), + KeySlice::from_slice(key, TS_RANGE_BEGIN), )?; - if iter.is_valid() && iter.key() == KeySlice::from_slice(key) { + if iter.is_valid() && iter.key().key_ref() == key { iter.next()?; } iter @@ -670,13 +673,16 @@ impl LsmStorageInner { ssts.push(snapshot.sstables[sst_id].clone()); } let concat_iter = match lower { - Bound::Included(key) => { - SstConcatIterator::create_and_seek_to_key(ssts, KeySlice::from_slice(key))? 
- } + Bound::Included(key) => SstConcatIterator::create_and_seek_to_key( + ssts, + KeySlice::from_slice(key, TS_RANGE_BEGIN), + )?, Bound::Excluded(key) => { - let mut iter = - SstConcatIterator::create_and_seek_to_key(ssts, KeySlice::from_slice(key))?; - if iter.is_valid() && iter.key() == KeySlice::from_slice(key) { + let mut iter = SstConcatIterator::create_and_seek_to_key( + ssts, + KeySlice::from_slice(key, TS_RANGE_BEGIN), + )?; + if iter.is_valid() && iter.key().key_ref() == key { iter.next()?; } iter diff --git a/mini-lsm-starter/src/mem_table.rs b/mini-lsm-starter/src/mem_table.rs index 0c8a6be30..3629d9fd5 100644 --- a/mini-lsm-starter/src/mem_table.rs +++ b/mini-lsm-starter/src/mem_table.rs @@ -12,7 +12,7 @@ use crossbeam_skiplist::SkipMap; use ouroboros::self_referencing; use crate::iterators::StorageIterator; -use crate::key::KeySlice; +use crate::key::{KeySlice, TS_DEFAULT}; use crate::table::SsTableBuilder; use crate::wal::Wal; @@ -130,9 +130,9 @@ impl MemTable { /// Flush the mem-table to SSTable. Implement in week 1 day 6. pub fn flush(&self, builder: &mut SsTableBuilder) -> Result<()> { - self.map - .iter() - .for_each(|entry| builder.add(KeySlice::from_slice(entry.key()), entry.value())); + self.map.iter().for_each(|entry| { + builder.add(KeySlice::from_slice(entry.key(), TS_DEFAULT), entry.value()) + }); Ok(()) } @@ -187,7 +187,7 @@ impl StorageIterator for MemTableIterator { } fn key(&self) -> KeySlice { - KeySlice::from_slice(&self.borrow_item().0) + KeySlice::from_slice(&self.borrow_item().0, TS_DEFAULT) } // is_valid returns if the iterator has reached the end or errored. diff --git a/mini-lsm-starter/src/table.rs b/mini-lsm-starter/src/table.rs index 937017959..382aa42ed 100644 --- a/mini-lsm-starter/src/table.rs +++ b/mini-lsm-starter/src/table.rs @@ -34,7 +34,7 @@ pub struct BlockMeta { ----------------------------------------------------------------------- | block meta | ... 
| ----------------------------------------------------------------------- -| offset(4B) | first_key_len (2B) | first_key (keylen) | last_key_len (2B) | last_key (keylen) | ... | +| offset(4B) | first_key_len (2B) | first_key (keylen) ts | last_key_len (2B) | last_key (keylen) ts | ... | ----------------------------------------------------------------------- */ impl BlockMeta { @@ -47,10 +47,12 @@ impl BlockMeta { buf.put_u32(block_meta.len() as u32); for meta in block_meta { buf.put_u32(meta.offset as u32); - buf.put_u16(meta.first_key.len() as u16); - buf.put_slice(meta.first_key.raw_ref()); - buf.put_u16(meta.last_key.len() as u16); - buf.put_slice(meta.last_key.raw_ref()); + buf.put_u16(meta.first_key.key_len() as u16); + buf.put_slice(meta.first_key.key_ref()); + buf.put_u64(meta.first_key.ts()); + buf.put_u16(meta.last_key.key_len() as u16); + buf.put_slice(meta.last_key.key_ref()); + buf.put_u64(meta.last_key.ts()); } // add checksum let checksum = crc32fast::hash(&buf[original_len + 4..]); @@ -67,13 +69,15 @@ impl BlockMeta { for _ in 0..meta_len { let offset = buf.get_u32() as usize; let first_key_len = buf.get_u16() as usize; - let first_key = buf.copy_to_bytes(first_key_len); + let first_key = + KeyBytes::from_bytes_with_ts(buf.copy_to_bytes(first_key_len), buf.get_u64()); let last_key_len = buf.get_u16() as usize; - let last_key = buf.copy_to_bytes(last_key_len); + let last_key = + KeyBytes::from_bytes_with_ts(buf.copy_to_bytes(last_key_len), buf.get_u64()); block_meta.push(BlockMeta { offset, - first_key: KeyBytes::from_bytes(first_key), - last_key: KeyBytes::from_bytes(last_key), + first_key, + last_key, }); } // checksum diff --git a/mini-lsm-starter/src/table/builder.rs b/mini-lsm-starter/src/table/builder.rs index 065038325..c2fa11bd5 100644 --- a/mini-lsm-starter/src/table/builder.rs +++ b/mini-lsm-starter/src/table/builder.rs @@ -54,7 +54,7 @@ impl SsTableBuilder { } // add the key hash to the bloom filter - 
self.key_hashes.push(farmhash::fingerprint32(key.raw_ref())); + self.key_hashes.push(farmhash::fingerprint32(key.key_ref())); // block builder returns false when the block is full. if self.block_builder.add(key, value) { self.last_key.set_from_slice(key); diff --git a/mini-lsm-starter/src/tests.rs b/mini-lsm-starter/src/tests.rs index 0a30b3207..a5acad172 100644 --- a/mini-lsm-starter/src/tests.rs +++ b/mini-lsm-starter/src/tests.rs @@ -15,3 +15,4 @@ mod week2_day3; mod week2_day4; mod week2_day5; mod week2_day6; +mod week3_day1; diff --git a/mini-lsm-starter/src/tests/week3_day1.rs b/mini-lsm-starter/src/tests/week3_day1.rs new file mode 100644 index 000000000..df5597968 --- /dev/null +++ b/mini-lsm-starter/src/tests/week3_day1.rs @@ -0,0 +1,54 @@ +use std::sync::Arc; + +use bytes::Bytes; +use tempfile::tempdir; + +use crate::key::KeySlice; +use crate::table::{FileObject, SsTable, SsTableBuilder, SsTableIterator}; + +use super::harness::{check_iter_result_by_key_and_ts, generate_sst_with_ts}; + +#[test] +fn test_sst_build_multi_version_simple() { + let mut builder = SsTableBuilder::new(16); + builder.add( + KeySlice::for_testing_from_slice_with_ts(b"233", 233), + b"233333", + ); + builder.add( + KeySlice::for_testing_from_slice_with_ts(b"233", 0), + b"2333333", + ); + let dir = tempdir().unwrap(); + builder.build_for_test(dir.path().join("1.sst")).unwrap(); +} + +fn generate_test_data() -> Vec<((Bytes, u64), Bytes)> { + (0..100) + .map(|id| { + ( + (Bytes::from(format!("key{:05}", id / 5)), 5 - (id % 5)), + Bytes::from(format!("value{:05}", id)), + ) + }) + .collect() +} + +#[test] +fn test_sst_build_multi_version_hard() { + let dir = tempdir().unwrap(); + let data = generate_test_data(); + generate_sst_with_ts(1, dir.path().join("1.sst"), data.clone(), None); + let sst = Arc::new( + SsTable::open( + 1, + None, + FileObject::open(&dir.path().join("1.sst")).unwrap(), + ) + .unwrap(), + ); + check_iter_result_by_key_and_ts( + &mut 
SsTableIterator::create_and_seek_to_first(sst).unwrap(), + data, + ); +} From bc34cf19512e3f56f09a2e4282dc8f6a9b9d7d69 Mon Sep 17 00:00:00 2001 From: husharp Date: Sat, 24 Feb 2024 12:31:38 +0800 Subject: [PATCH 16/22] refactor to know MVCC Signed-off-by: husharp --- mini-lsm-starter/src/compact.rs | 31 +++++---- mini-lsm-starter/src/lsm_iterator.rs | 25 +++++-- mini-lsm-starter/src/lsm_storage.rs | 79 ++++++++++++--------- mini-lsm-starter/src/mem_table.rs | 89 +++++++++++++++++------- mini-lsm-starter/src/tests.rs | 1 + mini-lsm-starter/src/tests/week3_day2.rs | 61 ++++++++++++++++ mini-lsm-starter/src/wal.rs | 24 ++++--- 7 files changed, 227 insertions(+), 83 deletions(-) create mode 100644 mini-lsm-starter/src/tests/week3_day2.rs diff --git a/mini-lsm-starter/src/compact.rs b/mini-lsm-starter/src/compact.rs index 008622336..f21413c3a 100644 --- a/mini-lsm-starter/src/compact.rs +++ b/mini-lsm-starter/src/compact.rs @@ -118,36 +118,43 @@ impl LsmStorageInner { fn compact_generate_sst_from_iter( &self, mut iter: impl for<'a> StorageIterator = KeySlice<'a>>, - compact_to_bottom_level: bool, + _compact_to_bottom_level: bool, ) -> Result>> { let mut new_ssts = Vec::new(); // compact the iterators let mut builder = None; + let mut last_key = Vec::::new(); while iter.is_valid() { if builder.is_none() { builder = Some(SsTableBuilder::new(self.options.block_size)); } let builder_inner = builder.as_mut().unwrap(); - if compact_to_bottom_level { - if !iter.value().is_empty() { - builder_inner.add(iter.key(), iter.value()); - } - } else { - builder_inner.add(iter.key(), iter.value()); - } - iter.next()?; + let same_as_last_key = iter.key().key_ref() == last_key; - if builder_inner.estimated_size() >= self.options.target_sst_size { + // keep ALL versions of a key during the compaction.(NOT remove empty keys for now[week 3, day 2]) + // the same key with different timestamps are put in the same SST file, even if it exceeds the SST size limit + if 
builder_inner.estimated_size() >= self.options.target_sst_size && !same_as_last_key { let sst_id = self.next_sst_id(); - let builder = builder.take().unwrap(); - let new_sst = Arc::new(builder.build( + let old_builder = builder.take().unwrap(); + let new_sst = Arc::new(old_builder.build( sst_id, Some(self.block_cache.clone()), self.path_of_sst(sst_id), )?); new_ssts.push(new_sst); + builder = Some(SsTableBuilder::new(self.options.block_size)); } + + // add the key-value pair to the builder + builder.as_mut().unwrap().add(iter.key(), iter.value()); + + if !same_as_last_key { + last_key.clear(); + last_key.extend(iter.key().key_ref()); + } + + iter.next()?; } // put last sst if exists builder diff --git a/mini-lsm-starter/src/lsm_iterator.rs b/mini-lsm-starter/src/lsm_iterator.rs index e9868664f..0264a41df 100644 --- a/mini-lsm-starter/src/lsm_iterator.rs +++ b/mini-lsm-starter/src/lsm_iterator.rs @@ -23,6 +23,7 @@ pub struct LsmIterator { inner: LsmIteratorInner, end_bound: Bound, is_valid: bool, + prev_key: Vec, } impl LsmIterator { @@ -31,16 +32,26 @@ impl LsmIterator { is_valid: iter.is_valid(), inner: iter, end_bound, + prev_key: Vec::new(), }; - iter.move_to_non_delete()?; + iter.move_to_key()?; Ok(iter) } - fn move_to_non_delete(&mut self) -> Result<()> { - while self.is_valid() && self.inner.value().is_empty() { - self.next_inner()?; + fn move_to_key(&mut self) -> Result<()> { + loop { + while self.is_valid() && self.key() == self.prev_key { + self.next_inner()?; + } + if !self.is_valid() { + return Ok(()); + } + self.prev_key.clear(); + self.prev_key.extend(self.inner.key().key_ref()); + if !self.value().is_empty() { + return Ok(()); + } } - Ok(()) } fn next_inner(&mut self) -> Result<()> { @@ -75,8 +86,8 @@ impl StorageIterator for LsmIterator { fn next(&mut self) -> Result<()> { self.next_inner()?; - // move to the next non-delete entry - self.move_to_non_delete()?; + // move to the next key + self.move_to_key()?; Ok(()) } diff --git 
a/mini-lsm-starter/src/lsm_storage.rs b/mini-lsm-starter/src/lsm_storage.rs index 05044f477..5f354c2d2 100644 --- a/mini-lsm-starter/src/lsm_storage.rs +++ b/mini-lsm-starter/src/lsm_storage.rs @@ -20,10 +20,10 @@ use crate::iterators::concat_iterator::SstConcatIterator; use crate::iterators::merge_iterator::MergeIterator; use crate::iterators::two_merge_iterator::TwoMergeIterator; use crate::iterators::StorageIterator; -use crate::key::{KeySlice, TS_RANGE_BEGIN}; +use crate::key::{KeySlice, TS_RANGE_BEGIN, TS_RANGE_END}; use crate::lsm_iterator::{FusedIterator, LsmIterator}; use crate::manifest::{Manifest, ManifestRecord}; -use crate::mem_table::{map_bound, MemTable}; +use crate::mem_table::{map_bound, map_key_bound_plus_ts, MemTable}; use crate::mvcc::LsmMvccInner; use crate::table::{FileObject, SsTable, SsTableBuilder, SsTableIterator}; @@ -261,6 +261,10 @@ impl LsmStorageInner { .fetch_add(1, std::sync::atomic::Ordering::SeqCst) } + pub(crate) fn mvcc(&self) -> &LsmMvccInner { + self.mvcc.as_ref().unwrap() + } + /// Start the storage engine by either loading an existing directory or creating a new one if the directory does /// not exist. 
pub(crate) fn open(path: impl AsRef, options: LsmStorageOptions) -> Result { @@ -383,7 +387,7 @@ impl LsmStorageInner { compaction_controller, manifest: Some(manifest), options: options.into(), - mvcc: None, + mvcc: Some(LsmMvccInner::new(0)), }; storage.sync_dir()?; @@ -529,22 +533,21 @@ impl LsmStorageInner { pub fn get(&self, key: &[u8]) -> Result> { let snapshot = self.state.read(); // search memtable firstly - if let Some(value) = snapshot.memtable.get(key) { - if value.is_empty() { - return Ok(None); - } - return Ok(Some(value)); - } + let mut memtable_iters = Vec::with_capacity(snapshot.imm_memtables.len() + 1); + memtable_iters.push(Box::new(snapshot.memtable.scan( + Bound::Included(KeySlice::from_slice(key, TS_RANGE_BEGIN)), + Bound::Included(KeySlice::from_slice(key, TS_RANGE_END)), + ))); // memtable scan range is [key, key] // traverse imm-memtable - for memtable in snapshot.imm_memtables.iter() { - if let Some(value) = memtable.get(key) { - if value.is_empty() { - return Ok(None); - } - return Ok(Some(value)); - } + for imm_memtable in snapshot.imm_memtables.iter() { + memtable_iters.push(Box::new(imm_memtable.scan( + Bound::Included(KeySlice::from_slice(key, TS_RANGE_BEGIN)), + Bound::Included(KeySlice::from_slice(key, TS_RANGE_END)), + ))); } + // using merge iterator to merge all memtables and imm_memtables iters + let memtable_iter = MergeIterator::create(memtable_iters); let check_sst = |key: &[u8], sst: &SsTable| { // check if the key is within the SST's key range @@ -573,7 +576,8 @@ impl LsmStorageInner { l0_iters.push(Box::new(iter)); } } - let merge_l0_sstable_iter = MergeIterator::create(l0_iters); + let memtable_and_l0_iter = + TwoMergeIterator::create(memtable_iter, MergeIterator::create(l0_iters))?; // create merge iterator for multi-level sstables let mut level_iters = Vec::with_capacity(snapshot.levels.len()); @@ -591,9 +595,9 @@ impl LsmStorageInner { )?; level_iters.push(Box::new(level_iter)); } - let merge_iter = 
MergeIterator::create(level_iters); + let merge_level_iter = MergeIterator::create(level_iters); - let two_merge_iterator = TwoMergeIterator::create(merge_l0_sstable_iter, merge_iter)?; + let two_merge_iterator = TwoMergeIterator::create(memtable_and_l0_iter, merge_level_iter)?; if two_merge_iterator.is_valid() && two_merge_iterator.key().key_ref() == key && !two_merge_iterator.value().is_empty() @@ -617,12 +621,19 @@ impl LsmStorageInner { // need to get all memtables and imm_memtables let mut memtable_iters = Vec::with_capacity(snapshot.imm_memtables.len() + 1); - memtable_iters.push(Box::new(snapshot.memtable.scan(lower, upper))); + memtable_iters.push(Box::new(snapshot.memtable.scan( + map_key_bound_plus_ts(lower, TS_RANGE_BEGIN), + map_key_bound_plus_ts(upper, TS_RANGE_END), + ))); + for imm_memtable in snapshot.imm_memtables.iter() { - memtable_iters.push(Box::new(imm_memtable.scan(lower, upper))); + memtable_iters.push(Box::new(imm_memtable.scan( + map_key_bound_plus_ts(lower, TS_RANGE_BEGIN), + map_key_bound_plus_ts(upper, TS_RANGE_END), + ))); } // using merge iterator to merge all memtables and imm_memtables iters - let merge_memtable_iter = MergeIterator::create(memtable_iters); + let memtable_iter = MergeIterator::create(memtable_iters); // using merge iterator to merge all sstables iters let mut l0_iters = Vec::with_capacity(snapshot.l0_sstables.len()); @@ -648,7 +659,7 @@ impl LsmStorageInner { sst, KeySlice::from_slice(key, TS_RANGE_BEGIN), )?; - if iter.is_valid() && iter.key().key_ref() == key { + while iter.is_valid() && iter.key().key_ref() == key { iter.next()?; } iter @@ -659,10 +670,9 @@ impl LsmStorageInner { l0_iters.push(Box::new(iter)); } } - let merge_l0_sstable_iter = MergeIterator::create(l0_iters); // memtables and imm_memtables are merged first, then the result is merged with L0 SSTs - let two_merge_sst_iter = - TwoMergeIterator::create(merge_memtable_iter, merge_l0_sstable_iter)?; + let memtable_and_l0_iter = + 
TwoMergeIterator::create(memtable_iter, MergeIterator::create(l0_iters))?; // concat multi-level sstables let mut sst_concat_iters = Vec::with_capacity(snapshot.levels.len()); @@ -682,7 +692,7 @@ impl LsmStorageInner { ssts, KeySlice::from_slice(key, TS_RANGE_BEGIN), )?; - if iter.is_valid() && iter.key().key_ref() == key { + while iter.is_valid() && iter.key().key_ref() == key { iter.next()?; } iter @@ -694,7 +704,7 @@ impl LsmStorageInner { let merge_sst_iter = MergeIterator::create(sst_concat_iters); // finally, the result is merged with L1 SSTs - let two_merge_iter = TwoMergeIterator::create(two_merge_sst_iter, merge_sst_iter)?; + let two_merge_iter = TwoMergeIterator::create(memtable_and_l0_iter, merge_sst_iter)?; Ok(FusedIterator::new(LsmIterator::new( two_merge_iter, @@ -704,6 +714,8 @@ impl LsmStorageInner { /// Write a batch of data into the storage. Implement in week 2 day 7. pub fn write_batch>(&self, batch: &[WriteBatchRecord]) -> Result<()> { + let _lck = self.mvcc().write_lock.lock(); + let ts = self.mvcc().latest_commit_ts() + 1; for record in batch { match record { WriteBatchRecord::Put(key, value) => { @@ -713,8 +725,10 @@ impl LsmStorageInner { let value = value.as_ref(); assert!(!key.is_empty(), "key cannot be empty"); assert!(!value.is_empty(), "value cannot be empty"); - let snapshot = self.state.write(); - snapshot.memtable.put(key, value)?; + let snapshot = self.state.read(); + snapshot + .memtable + .put(KeySlice::from_slice(key, ts), value)?; size = snapshot.memtable.approximate_size(); } @@ -724,8 +738,8 @@ impl LsmStorageInner { let size; { let key = key.as_ref(); - let snapshot = self.state.write(); - snapshot.memtable.put(key, b"")?; + let snapshot = self.state.read(); + snapshot.memtable.put(KeySlice::from_slice(key, ts), &[])?; size = snapshot.memtable.approximate_size(); } @@ -733,6 +747,7 @@ impl LsmStorageInner { } } } + self.mvcc().update_commit_ts(ts); Ok(()) } diff --git a/mini-lsm-starter/src/mem_table.rs 
b/mini-lsm-starter/src/mem_table.rs index 3629d9fd5..165bb71bb 100644 --- a/mini-lsm-starter/src/mem_table.rs +++ b/mini-lsm-starter/src/mem_table.rs @@ -12,7 +12,7 @@ use crossbeam_skiplist::SkipMap; use ouroboros::self_referencing; use crate::iterators::StorageIterator; -use crate::key::{KeySlice, TS_DEFAULT}; +use crate::key::{KeyBytes, KeySlice, TS_DEFAULT}; use crate::table::SsTableBuilder; use crate::wal::Wal; @@ -22,7 +22,7 @@ use crate::wal::Wal; /// chapters of week 1 and week 2. #[derive(Debug)] pub struct MemTable { - map: Arc>, + map: Arc>, wal: Option, id: usize, approximate_size: Arc, @@ -37,6 +37,30 @@ pub(crate) fn map_bound(bound: Bound<&[u8]>) -> Bound { } } +/// Create a bound of `Bytes` from a bound of `KeySlice`. +pub(crate) fn map_key_bound(bound: Bound) -> Bound { + match bound { + Bound::Included(x) => Bound::Included(KeyBytes::from_bytes_with_ts( + Bytes::copy_from_slice(x.key_ref()), + x.ts(), + )), + Bound::Excluded(x) => Bound::Excluded(KeyBytes::from_bytes_with_ts( + Bytes::copy_from_slice(x.key_ref()), + x.ts(), + )), + Bound::Unbounded => Bound::Unbounded, + } +} + +/// Create a bound of `Bytes` from a bound of `KeySlice`. +pub(crate) fn map_key_bound_plus_ts(bound: Bound<&[u8]>, ts: u64) -> Bound { + match bound { + Bound::Included(x) => Bound::Included(KeySlice::from_slice(x, ts)), + Bound::Excluded(x) => Bound::Excluded(KeySlice::from_slice(x, ts)), + Bound::Unbounded => Bound::Unbounded, + } +} + impl MemTable { /// Create a new mem-table. 
pub fn create(id: usize) -> Self { @@ -70,11 +94,11 @@ impl MemTable { } pub fn for_testing_put_slice(&self, key: &[u8], value: &[u8]) -> Result<()> { - self.put(key, value) + self.put(KeySlice::from_slice(key, TS_DEFAULT), value) } pub fn for_testing_get_slice(&self, key: &[u8]) -> Option { - self.get(key) + self.get(KeySlice::from_slice(key, TS_DEFAULT)) } pub fn for_testing_scan_slice( @@ -82,25 +106,36 @@ impl MemTable { lower: Bound<&[u8]>, upper: Bound<&[u8]>, ) -> MemTableIterator { - self.scan(lower, upper) + self.scan( + map_key_bound_plus_ts(lower, TS_DEFAULT), + map_key_bound_plus_ts(upper, TS_DEFAULT), + ) } + /// should not be used after finishing week 3 /// Get a value by key. - pub fn get(&self, key: &[u8]) -> Option { - self.map.get(key).map(|x| x.value().clone()) + pub fn get(&self, key: KeySlice) -> Option { + let key_bytes = KeyBytes::from_bytes_with_ts( + Bytes::from_static(unsafe { std::mem::transmute(key.key_ref()) }), + key.ts(), + ); + + self.map.get(&key_bytes).map(|x| x.value().clone()) } /// Put a key-value pair into the mem-table. /// /// In week 1, day 1, simply put the key-value pair into the skipmap. /// In week 2, day 6, also flush the data to WAL. - pub fn put(&self, key: &[u8], value: &[u8]) -> Result<()> { + pub fn put(&self, key: KeySlice, value: &[u8]) -> Result<()> { self.approximate_size.fetch_add( - key.len() + value.len(), + key.raw_len() + value.len(), std::sync::atomic::Ordering::Relaxed, ); - self.map - .insert(Bytes::copy_from_slice(key), Bytes::copy_from_slice(value)); + self.map.insert( + key.to_key_vec().into_key_bytes(), + Bytes::copy_from_slice(value), + ); if let Some(ref wal) = self.wal { wal.put(key, value)?; } @@ -115,12 +150,12 @@ impl MemTable { } /// Get an iterator over a range of keys. 
- pub fn scan(&self, lower: Bound<&[u8]>, upper: Bound<&[u8]>) -> MemTableIterator { + pub fn scan(&self, lower: Bound, upper: Bound) -> MemTableIterator { let map = self.map.clone(); let mut iter = MemTableIteratorBuilder { map, - iter_builder: |map| map.range((map_bound(lower), map_bound(upper))), - item: (Bytes::new(), Bytes::new()), + iter_builder: |map| map.range((map_key_bound(lower), map_key_bound(upper))), + item: (KeyBytes::new(), Bytes::new()), } .build(); let next = iter.with_iter_mut(|iter| MemTableIterator::entry_to_item(iter.next())); @@ -130,9 +165,9 @@ impl MemTable { /// Flush the mem-table to SSTable. Implement in week 1 day 6. pub fn flush(&self, builder: &mut SsTableBuilder) -> Result<()> { - self.map.iter().for_each(|entry| { - builder.add(KeySlice::from_slice(entry.key(), TS_DEFAULT), entry.value()) - }); + self.map + .iter() + .for_each(|entry| builder.add(entry.key().as_key_slice(), entry.value())); Ok(()) } @@ -151,31 +186,37 @@ impl MemTable { } } -type SkipMapRangeIter<'a> = - crossbeam_skiplist::map::Range<'a, Bytes, (Bound, Bound), Bytes, Bytes>; +type SkipMapRangeIter<'a> = crossbeam_skiplist::map::Range< + 'a, + KeyBytes, + (Bound, Bound), + KeyBytes, + Bytes, +>; /// An iterator over a range of `SkipMap`. This is a self-referential structure and please refer to week 1, day 2 /// chapter for more information. /// /// This is part of week 1, day 2. +/// changed in week 3, day 2. #[self_referencing] pub struct MemTableIterator { /// Stores a reference to the skipmap. - map: Arc>, + map: Arc>, /// Stores a skipmap iterator that refers to the lifetime of `MemTableIterator` itself. #[borrows(map)] #[not_covariant] iter: SkipMapRangeIter<'this>, /// Stores the current key-value pair. - item: (Bytes, Bytes), + item: (KeyBytes, Bytes), } impl MemTableIterator { // This function is used to convert a `SkipMap` entry to a key-value pair. 
- fn entry_to_item(entry: Option>) -> (Bytes, Bytes) { + fn entry_to_item(entry: Option>) -> (KeyBytes, Bytes) { entry .map(|x| (x.key().clone(), x.value().clone())) - .unwrap_or_else(|| (Bytes::new(), Bytes::new())) + .unwrap_or_else(|| (KeyBytes::new(), Bytes::new())) } } @@ -187,7 +228,7 @@ impl StorageIterator for MemTableIterator { } fn key(&self) -> KeySlice { - KeySlice::from_slice(&self.borrow_item().0, TS_DEFAULT) + self.borrow_item().0.as_key_slice() } // is_valid returns if the iterator has reached the end or errored. diff --git a/mini-lsm-starter/src/tests.rs b/mini-lsm-starter/src/tests.rs index a5acad172..3c95eabc5 100644 --- a/mini-lsm-starter/src/tests.rs +++ b/mini-lsm-starter/src/tests.rs @@ -16,3 +16,4 @@ mod week2_day4; mod week2_day5; mod week2_day6; mod week3_day1; +mod week3_day2; diff --git a/mini-lsm-starter/src/tests/week3_day2.rs b/mini-lsm-starter/src/tests/week3_day2.rs new file mode 100644 index 000000000..df1f3ce89 --- /dev/null +++ b/mini-lsm-starter/src/tests/week3_day2.rs @@ -0,0 +1,61 @@ +use std::time::Duration; + +use tempfile::tempdir; + +use crate::{ + compact::CompactionOptions, + lsm_storage::{LsmStorageOptions, MiniLsm}, + tests::harness::dump_files_in_dir, +}; + +#[test] +fn test_task3_compaction_integration() { + let dir = tempdir().unwrap(); + let mut options = LsmStorageOptions::default_for_week2_test(CompactionOptions::NoCompaction); + options.enable_wal = true; + let storage = MiniLsm::open(&dir, options.clone()).unwrap(); + let _txn = storage.new_txn().unwrap(); + for i in 0..=20000 { + storage + .put(b"0", format!("{:02000}", i).as_bytes()) + .unwrap(); + } + std::thread::sleep(Duration::from_secs(1)); // wait until all memtables flush + while { + let snapshot = storage.inner.state.read(); + !snapshot.imm_memtables.is_empty() + } { + storage.inner.force_flush_next_imm_memtable().unwrap(); + } + assert!(storage.inner.state.read().l0_sstables.len() > 1); + storage.force_full_compaction().unwrap(); + 
storage.dump_structure(); + dump_files_in_dir(&dir); + assert!(storage.inner.state.read().l0_sstables.is_empty()); + assert_eq!(storage.inner.state.read().levels.len(), 1); + // same key in the same SST + assert_eq!(storage.inner.state.read().levels[0].1.len(), 1); + for i in 0..=100 { + storage + .put(b"1", format!("{:02000}", i).as_bytes()) + .unwrap(); + } + storage + .inner + .force_freeze_memtable(&storage.inner.state_lock.lock()) + .unwrap(); + std::thread::sleep(Duration::from_secs(1)); // wait until all memtables flush + while { + let snapshot = storage.inner.state.read(); + !snapshot.imm_memtables.is_empty() + } { + storage.inner.force_flush_next_imm_memtable().unwrap(); + } + storage.force_full_compaction().unwrap(); + storage.dump_structure(); + dump_files_in_dir(&dir); + assert!(storage.inner.state.read().l0_sstables.is_empty()); + assert_eq!(storage.inner.state.read().levels.len(), 1); + // same key in the same SST, now we should split two + assert_eq!(storage.inner.state.read().levels[0].1.len(), 2); +} diff --git a/mini-lsm-starter/src/wal.rs b/mini-lsm-starter/src/wal.rs index bd043328b..1f1e93537 100644 --- a/mini-lsm-starter/src/wal.rs +++ b/mini-lsm-starter/src/wal.rs @@ -11,6 +11,8 @@ use bytes::{Buf, BufMut, Bytes}; use crossbeam_skiplist::SkipMap; use parking_lot::Mutex; +use crate::key::{KeyBytes, KeySlice}; + // | key_len | key | value_len | value | checksum | #[derive(Debug)] pub struct Wal { @@ -31,7 +33,7 @@ impl Wal { }) } - pub fn recover(path: impl AsRef, skiplist: &SkipMap) -> Result { + pub fn recover(path: impl AsRef, skiplist: &SkipMap) -> Result { let path = path.as_ref(); let mut file = OpenOptions::new() .read(true) @@ -49,6 +51,9 @@ impl Wal { let key = Bytes::copy_from_slice(&buf[..key_len]); hasher.write(&key); buf.advance(key_len); + // read ts + let ts = buf.get_u64(); + hasher.write_u64(ts); let value_len = buf.get_u16() as usize; hasher.write_u16(value_len as u16); let value = Bytes::copy_from_slice(&buf[..value_len]); @@ 
-58,24 +63,27 @@ impl Wal { if checksum != hasher.finalize() { return Err(anyhow::anyhow!("WAL checksum mismatch")); } - skiplist.insert(key, value); + skiplist.insert(KeyBytes::from_bytes_with_ts(key, ts), value); } Ok(Self { file: Arc::new(Mutex::new(BufWriter::new(file))), }) } - pub fn put(&self, key: &[u8], value: &[u8]) -> Result<()> { + pub fn put(&self, key: KeySlice, value: &[u8]) -> Result<()> { let mut file = self.file.lock(); let mut buf: Vec = Vec::with_capacity( - key.len() + value.len() + std::mem::size_of::() + std::mem::size_of::(), + key.raw_len() + value.len() + std::mem::size_of::() + std::mem::size_of::(), ); // use a crc32fast::Hasher to compute the checksum incrementally on each field. let mut hasher = crc32fast::Hasher::new(); - buf.put_u16(key.len() as u16); - hasher.write_u16(key.len() as u16); - buf.put_slice(key); - hasher.write(key); + buf.put_u16(key.key_len() as u16); + hasher.write_u16(key.key_len() as u16); + buf.put_slice(key.key_ref()); + hasher.write(key.key_ref()); + // write ts + buf.put_u64(key.ts()); + hasher.write_u64(key.ts()); buf.put_u16(value.len() as u16); hasher.write_u16(value.len() as u16); buf.put_slice(value); From d96885261eae422b98b38766c1b65f100a1b2394 Mon Sep 17 00:00:00 2001 From: husharp Date: Sun, 25 Feb 2024 12:52:03 +0800 Subject: [PATCH 17/22] support transaction api Signed-off-by: husharp --- mini-lsm-starter/src/lsm_iterator.rs | 23 +- mini-lsm-starter/src/lsm_storage.rs | 60 +++-- mini-lsm-starter/src/mem_table.rs | 2 +- mini-lsm-starter/src/mvcc.rs | 8 +- mini-lsm-starter/src/mvcc/txn.rs | 22 +- mini-lsm-starter/src/table.rs | 14 +- mini-lsm-starter/src/table/builder.rs | 9 +- mini-lsm-starter/src/tests.rs | 1 + mini-lsm-starter/src/tests/week3_day3.rs | 276 +++++++++++++++++++++++ 9 files changed, 376 insertions(+), 39 deletions(-) create mode 100644 mini-lsm-starter/src/tests/week3_day3.rs diff --git a/mini-lsm-starter/src/lsm_iterator.rs b/mini-lsm-starter/src/lsm_iterator.rs index 
0264a41df..230b867f6 100644 --- a/mini-lsm-starter/src/lsm_iterator.rs +++ b/mini-lsm-starter/src/lsm_iterator.rs @@ -23,15 +23,21 @@ pub struct LsmIterator { inner: LsmIteratorInner, end_bound: Bound, is_valid: bool, + read_ts: u64, prev_key: Vec, } impl LsmIterator { - pub(crate) fn new(iter: LsmIteratorInner, end_bound: Bound) -> Result { + pub(crate) fn new( + iter: LsmIteratorInner, + end_bound: Bound, + read_ts: u64, + ) -> Result { let mut iter = Self { is_valid: iter.is_valid(), inner: iter, end_bound, + read_ts, prev_key: Vec::new(), }; iter.move_to_key()?; @@ -48,6 +54,21 @@ impl LsmIterator { } self.prev_key.clear(); self.prev_key.extend(self.inner.key().key_ref()); + // traverse all version of the same key + while self.is_valid() + && self.key() == self.prev_key + && self.inner.key().ts() > self.read_ts + { + self.next_inner()?; + } + if !self.is_valid() { + return Ok(()); + } + // if now the key is different(all ts above `read_ts``), + // need to move to the next key. + if self.key() != self.prev_key { + continue; + } if !self.value().is_empty() { return Ok(()); } diff --git a/mini-lsm-starter/src/lsm_storage.rs b/mini-lsm-starter/src/lsm_storage.rs index 5f354c2d2..0dea87119 100644 --- a/mini-lsm-starter/src/lsm_storage.rs +++ b/mini-lsm-starter/src/lsm_storage.rs @@ -24,6 +24,7 @@ use crate::key::{KeySlice, TS_RANGE_BEGIN, TS_RANGE_END}; use crate::lsm_iterator::{FusedIterator, LsmIterator}; use crate::manifest::{Manifest, ManifestRecord}; use crate::mem_table::{map_bound, map_key_bound_plus_ts, MemTable}; +use crate::mvcc::txn::{Transaction, TxnIterator}; use crate::mvcc::LsmMvccInner; use crate::table::{FileObject, SsTable, SsTableBuilder, SsTableIterator}; @@ -206,7 +207,7 @@ impl MiniLsm { })) } - pub fn new_txn(&self) -> Result<()> { + pub fn new_txn(&self) -> Result> { self.inner.new_txn() } @@ -230,11 +231,7 @@ impl MiniLsm { self.inner.sync() } - pub fn scan( - &self, - lower: Bound<&[u8]>, - upper: Bound<&[u8]>, - ) -> Result> { + pub fn 
scan(&self, lower: Bound<&[u8]>, upper: Bound<&[u8]>) -> Result { self.inner.scan(lower, upper) } @@ -294,6 +291,7 @@ impl LsmStorageInner { let mut next_sst_id = 1; // recover from MANIFEST, `/MANIFEST` let manifest_path = path.join("MANIFEST"); + let mut last_commit_ts = 0; if !manifest_path.exists() { if options.enable_wal { let memtable = Arc::new(MemTable::create_with_wal( @@ -346,6 +344,8 @@ impl LsmStorageInner { FileObject::open(&Self::path_of_sst_static(path, sst_id)) .context("failed to open SST")?, )?; + // update last_commit_ts + last_commit_ts = last_commit_ts.max(sst.max_ts()); state.sstables.insert(sst_id, Arc::new(sst)); next_sst_id = next_sst_id.max(sst_id); sst_cnt += 1; @@ -359,6 +359,14 @@ impl LsmStorageInner { for id in memtables { let memtable = MemTable::recover_from_wal(id, Self::path_of_wal_static(path, id))?; + // update last_commit_ts + let max_ts = memtable + .map + .iter() + .map(|entry| entry.key().ts()) + .max() + .unwrap_or(0); + last_commit_ts = last_commit_ts.max(max_ts); if !memtable.is_empty() { state.imm_memtables.insert(0, Arc::new(memtable)); wal_cnt += 1; @@ -387,7 +395,7 @@ impl LsmStorageInner { compaction_controller, manifest: Some(manifest), options: options.into(), - mvcc: Some(LsmMvccInner::new(0)), + mvcc: Some(LsmMvccInner::new(last_commit_ts)), }; storage.sync_dir()?; @@ -524,13 +532,17 @@ impl LsmStorageInner { Ok(()) } - pub fn new_txn(&self) -> Result<()> { - // no-op - Ok(()) + pub fn new_txn(self: &Arc) -> Result> { + Ok(self.mvcc().new_txn(self.clone(), self.options.serializable)) } /// Get a key from the storage. In week1 day7, this can be further optimized by using a bloom filter. 
- pub fn get(&self, key: &[u8]) -> Result> { + pub fn get(self: &Arc, key: &[u8]) -> Result> { + let txn = self.mvcc().new_txn(self.clone(), self.options.serializable); + txn.get(key) + } + + pub fn get_with_txn(&self, key: &[u8], read_ts: u64) -> Result> { let snapshot = self.state.read(); // search memtable firstly let mut memtable_iters = Vec::with_capacity(snapshot.imm_memtables.len() + 1); @@ -597,22 +609,33 @@ impl LsmStorageInner { } let merge_level_iter = MergeIterator::create(level_iters); - let two_merge_iterator = TwoMergeIterator::create(memtable_and_l0_iter, merge_level_iter)?; - if two_merge_iterator.is_valid() - && two_merge_iterator.key().key_ref() == key - && !two_merge_iterator.value().is_empty() - { - return Ok(Some(Bytes::copy_from_slice(two_merge_iterator.value()))); + let iter = LsmIterator::new( + TwoMergeIterator::create(memtable_and_l0_iter, merge_level_iter)?, + Bound::Unbounded, + read_ts, + )?; + if iter.is_valid() && iter.key() == key && !iter.value().is_empty() { + return Ok(Some(Bytes::copy_from_slice(iter.value()))); } Ok(None) } /// Create an iterator over a range of keys. - pub fn scan( + pub fn scan<'a>( + self: &'a Arc, + lower: Bound<&[u8]>, + upper: Bound<&[u8]>, + ) -> Result { + let txn = self.mvcc().new_txn(self.clone(), self.options.serializable); + txn.scan(lower, upper) + } + + pub fn scan_with_txn( &self, lower: Bound<&[u8]>, upper: Bound<&[u8]>, + read_ts: u64, ) -> Result> { let snapshot = { let guard = self.state.read(); @@ -709,6 +732,7 @@ impl LsmStorageInner { Ok(FusedIterator::new(LsmIterator::new( two_merge_iter, map_bound(upper), + read_ts, )?)) } diff --git a/mini-lsm-starter/src/mem_table.rs b/mini-lsm-starter/src/mem_table.rs index 165bb71bb..0d1d8589f 100644 --- a/mini-lsm-starter/src/mem_table.rs +++ b/mini-lsm-starter/src/mem_table.rs @@ -22,7 +22,7 @@ use crate::wal::Wal; /// chapters of week 1 and week 2. 
#[derive(Debug)] pub struct MemTable { - map: Arc>, + pub(crate) map: Arc>, wal: Option, id: usize, approximate_size: Arc, diff --git a/mini-lsm-starter/src/mvcc.rs b/mini-lsm-starter/src/mvcc.rs index 28481f4ec..54a0e8e02 100644 --- a/mini-lsm-starter/src/mvcc.rs +++ b/mini-lsm-starter/src/mvcc.rs @@ -55,6 +55,12 @@ impl LsmMvccInner { } pub fn new_txn(&self, inner: Arc, serializable: bool) -> Arc { - unimplemented!() + Arc::new(Transaction { + read_ts: self.ts.lock().0, + inner, + local_storage: Arc::new(Default::default()), + committed: Arc::new(Default::default()), + key_hashes: None, + }) } } diff --git a/mini-lsm-starter/src/mvcc/txn.rs b/mini-lsm-starter/src/mvcc/txn.rs index 33ec6a0df..5673d6c0d 100644 --- a/mini-lsm-starter/src/mvcc/txn.rs +++ b/mini-lsm-starter/src/mvcc/txn.rs @@ -14,7 +14,7 @@ use ouroboros::self_referencing; use parking_lot::Mutex; use crate::{ - iterators::{two_merge_iterator::TwoMergeIterator, StorageIterator}, + iterators::StorageIterator, lsm_iterator::{FusedIterator, LsmIterator}, lsm_storage::LsmStorageInner, }; @@ -30,11 +30,14 @@ pub struct Transaction { impl Transaction { pub fn get(&self, key: &[u8]) -> Result> { - unimplemented!() + self.inner.get_with_txn(key, self.read_ts) } pub fn scan(self: &Arc, lower: Bound<&[u8]>, upper: Bound<&[u8]>) -> Result { - unimplemented!() + TxnIterator::create( + self.clone(), + self.inner.scan_with_txn(lower, upper, self.read_ts)?, + ) } pub fn put(&self, key: &[u8], value: &[u8]) { @@ -90,16 +93,13 @@ impl StorageIterator for TxnLocalIterator { } pub struct TxnIterator { - _txn: Arc, - iter: TwoMergeIterator>, + txn: Arc, + iter: FusedIterator, } impl TxnIterator { - pub fn create( - txn: Arc, - iter: TwoMergeIterator>, - ) -> Result { - unimplemented!() + pub fn create(txn: Arc, iter: FusedIterator) -> Result { + Ok(Self { txn, iter }) } } @@ -119,7 +119,7 @@ impl StorageIterator for TxnIterator { } fn next(&mut self) -> Result<()> { - unimplemented!() + self.iter.next() } fn 
num_active_iterators(&self) -> usize { diff --git a/mini-lsm-starter/src/table.rs b/mini-lsm-starter/src/table.rs index 382aa42ed..41522de33 100644 --- a/mini-lsm-starter/src/table.rs +++ b/mini-lsm-starter/src/table.rs @@ -41,7 +41,7 @@ impl BlockMeta { /// Encode block meta to a buffer. /// You may add extra fields to the buffer, /// in order to help keep track of `first_key` when decoding from the same buffer in the future. - pub fn encode_block_metas(block_meta: &[BlockMeta], buf: &mut Vec) { + pub fn encode_block_metas(block_meta: &[BlockMeta], buf: &mut Vec, max_ts: u64) { let original_len = buf.len(); let meta_len = block_meta.len(); buf.put_u32(block_meta.len() as u32); @@ -54,13 +54,15 @@ impl BlockMeta { buf.put_slice(meta.last_key.key_ref()); buf.put_u64(meta.last_key.ts()); } + // put max ts + buf.put_u64(max_ts); // add checksum let checksum = crc32fast::hash(&buf[original_len + 4..]); buf.put_u32(checksum); } /// Decode block meta from a buffer. - pub fn decode_block_metas(mut buf: &[u8]) -> Result> { + pub fn decode_block_metas(mut buf: &[u8]) -> Result<(Vec, u64)> { // get meta data len let meta_len = buf.get_u32() as usize; // cal checksum @@ -80,11 +82,13 @@ impl BlockMeta { last_key, }); } + let max_ts = buf.get_u64(); // checksum if checksum != buf.get_u32() { return Err(anyhow::anyhow!("BlockMeta checksum mismatch")); } - Ok(block_meta) + + Ok((block_meta, max_ts)) } } @@ -162,7 +166,7 @@ impl SsTable { let raw_block_meta_offset = file.read(bloom_offset - 4, 4)?; let block_meta_offset = (&raw_block_meta_offset[..]).get_u32() as u64; let raw_block_meta = file.read(block_meta_offset, bloom_offset - 4 - block_meta_offset)?; - let block_metas = BlockMeta::decode_block_metas(raw_block_meta.as_slice())?; + let (block_metas, max_ts) = BlockMeta::decode_block_metas(raw_block_meta.as_slice())?; // decode data blocks let raw_data = file.read(0, block_meta_offset)?; @@ -176,7 +180,7 @@ impl SsTable { block_metas, block_cache, bloom: Some(bloom), - max_ts: 
0, + max_ts, }; Ok(sst_table) } diff --git a/mini-lsm-starter/src/table/builder.rs b/mini-lsm-starter/src/table/builder.rs index c2fa11bd5..28be9774a 100644 --- a/mini-lsm-starter/src/table/builder.rs +++ b/mini-lsm-starter/src/table/builder.rs @@ -20,6 +20,7 @@ pub struct SsTableBuilder { pub(crate) metas: Vec, block_size: usize, key_hashes: Vec, + max_ts: u64, } /* @@ -41,6 +42,7 @@ impl SsTableBuilder { metas: Vec::new(), block_size, key_hashes: Vec::new(), + max_ts: 0, } } @@ -55,6 +57,9 @@ impl SsTableBuilder { // add the key hash to the bloom filter self.key_hashes.push(farmhash::fingerprint32(key.key_ref())); + if key.ts() > self.max_ts { + self.max_ts = key.ts(); + } // block builder returns false when the block is full. if self.block_builder.add(key, value) { self.last_key.set_from_slice(key); @@ -107,7 +112,7 @@ impl SsTableBuilder { let meta_len = self.metas.len(); let block_meta_offset = self.data.len(); let mut buf = self.data; - BlockMeta::encode_block_metas(&self.metas, &mut buf); + BlockMeta::encode_block_metas(&self.metas, &mut buf, self.max_ts); // extra info for the meta block offset buf.put_u32(block_meta_offset as u32); // create bloom filter and encode it @@ -127,7 +132,7 @@ impl SsTableBuilder { block_meta_offset, block_cache, bloom: Some(bloom), - max_ts: 0, + max_ts: self.max_ts, }; Ok(sst_table) } diff --git a/mini-lsm-starter/src/tests.rs b/mini-lsm-starter/src/tests.rs index 3c95eabc5..261af8eca 100644 --- a/mini-lsm-starter/src/tests.rs +++ b/mini-lsm-starter/src/tests.rs @@ -17,3 +17,4 @@ mod week2_day5; mod week2_day6; mod week3_day1; mod week3_day2; +mod week3_day3; diff --git a/mini-lsm-starter/src/tests/week3_day3.rs b/mini-lsm-starter/src/tests/week3_day3.rs new file mode 100644 index 000000000..a0b2532f0 --- /dev/null +++ b/mini-lsm-starter/src/tests/week3_day3.rs @@ -0,0 +1,276 @@ +use std::ops::Bound; + +use bytes::Bytes; +use tempfile::tempdir; + +use crate::{ + compact::CompactionOptions, + key::KeySlice, + 
lsm_storage::{LsmStorageOptions, MiniLsm}, + table::SsTableBuilder, + tests::harness::check_lsm_iter_result_by_key, +}; + +#[test] +fn test_task2_memtable_mvcc() { + let dir = tempdir().unwrap(); + let mut options = LsmStorageOptions::default_for_week2_test(CompactionOptions::NoCompaction); + options.enable_wal = true; + let storage = MiniLsm::open(&dir, options.clone()).unwrap(); + storage.put(b"a", b"1").unwrap(); + storage.put(b"b", b"1").unwrap(); + let snapshot1 = storage.new_txn().unwrap(); + storage.put(b"a", b"2").unwrap(); + let snapshot2 = storage.new_txn().unwrap(); + storage.delete(b"b").unwrap(); + storage.put(b"c", b"1").unwrap(); + let snapshot3 = storage.new_txn().unwrap(); + assert_eq!(snapshot1.get(b"a").unwrap(), Some(Bytes::from_static(b"1"))); + assert_eq!(snapshot1.get(b"b").unwrap(), Some(Bytes::from_static(b"1"))); + assert_eq!(snapshot1.get(b"c").unwrap(), None); + check_lsm_iter_result_by_key( + &mut snapshot1.scan(Bound::Unbounded, Bound::Unbounded).unwrap(), + vec![ + (Bytes::from("a"), Bytes::from("1")), + (Bytes::from("b"), Bytes::from("1")), + ], + ); + assert_eq!(snapshot2.get(b"a").unwrap(), Some(Bytes::from_static(b"2"))); + assert_eq!(snapshot2.get(b"b").unwrap(), Some(Bytes::from_static(b"1"))); + assert_eq!(snapshot2.get(b"c").unwrap(), None); + check_lsm_iter_result_by_key( + &mut snapshot2.scan(Bound::Unbounded, Bound::Unbounded).unwrap(), + vec![ + (Bytes::from("a"), Bytes::from("2")), + (Bytes::from("b"), Bytes::from("1")), + ], + ); + assert_eq!(snapshot3.get(b"a").unwrap(), Some(Bytes::from_static(b"2"))); + assert_eq!(snapshot3.get(b"b").unwrap(), None); + assert_eq!(snapshot3.get(b"c").unwrap(), Some(Bytes::from_static(b"1"))); + check_lsm_iter_result_by_key( + &mut snapshot3.scan(Bound::Unbounded, Bound::Unbounded).unwrap(), + vec![ + (Bytes::from("a"), Bytes::from("2")), + (Bytes::from("c"), Bytes::from("1")), + ], + ); + storage + .inner + .force_freeze_memtable(&storage.inner.state_lock.lock()) + .unwrap(); + 
storage.put(b"a", b"3").unwrap(); + storage.put(b"b", b"3").unwrap(); + let snapshot4 = storage.new_txn().unwrap(); + storage.put(b"a", b"4").unwrap(); + let snapshot5 = storage.new_txn().unwrap(); + storage.delete(b"b").unwrap(); + storage.put(b"c", b"5").unwrap(); + let snapshot6 = storage.new_txn().unwrap(); + assert_eq!(snapshot1.get(b"a").unwrap(), Some(Bytes::from_static(b"1"))); + assert_eq!(snapshot1.get(b"b").unwrap(), Some(Bytes::from_static(b"1"))); + assert_eq!(snapshot1.get(b"c").unwrap(), None); + check_lsm_iter_result_by_key( + &mut snapshot1.scan(Bound::Unbounded, Bound::Unbounded).unwrap(), + vec![ + (Bytes::from("a"), Bytes::from("1")), + (Bytes::from("b"), Bytes::from("1")), + ], + ); + assert_eq!(snapshot2.get(b"a").unwrap(), Some(Bytes::from_static(b"2"))); + assert_eq!(snapshot2.get(b"b").unwrap(), Some(Bytes::from_static(b"1"))); + assert_eq!(snapshot2.get(b"c").unwrap(), None); + check_lsm_iter_result_by_key( + &mut snapshot2.scan(Bound::Unbounded, Bound::Unbounded).unwrap(), + vec![ + (Bytes::from("a"), Bytes::from("2")), + (Bytes::from("b"), Bytes::from("1")), + ], + ); + assert_eq!(snapshot3.get(b"a").unwrap(), Some(Bytes::from_static(b"2"))); + assert_eq!(snapshot3.get(b"b").unwrap(), None); + assert_eq!(snapshot3.get(b"c").unwrap(), Some(Bytes::from_static(b"1"))); + check_lsm_iter_result_by_key( + &mut snapshot3.scan(Bound::Unbounded, Bound::Unbounded).unwrap(), + vec![ + (Bytes::from("a"), Bytes::from("2")), + (Bytes::from("c"), Bytes::from("1")), + ], + ); + assert_eq!(snapshot4.get(b"a").unwrap(), Some(Bytes::from_static(b"3"))); + assert_eq!(snapshot4.get(b"b").unwrap(), Some(Bytes::from_static(b"3"))); + assert_eq!(snapshot4.get(b"c").unwrap(), Some(Bytes::from_static(b"1"))); + check_lsm_iter_result_by_key( + &mut snapshot4.scan(Bound::Unbounded, Bound::Unbounded).unwrap(), + vec![ + (Bytes::from("a"), Bytes::from("3")), + (Bytes::from("b"), Bytes::from("3")), + (Bytes::from("c"), Bytes::from("1")), + ], + ); + 
assert_eq!(snapshot5.get(b"a").unwrap(), Some(Bytes::from_static(b"4"))); + assert_eq!(snapshot5.get(b"b").unwrap(), Some(Bytes::from_static(b"3"))); + assert_eq!(snapshot5.get(b"c").unwrap(), Some(Bytes::from_static(b"1"))); + check_lsm_iter_result_by_key( + &mut snapshot5.scan(Bound::Unbounded, Bound::Unbounded).unwrap(), + vec![ + (Bytes::from("a"), Bytes::from("4")), + (Bytes::from("b"), Bytes::from("3")), + (Bytes::from("c"), Bytes::from("1")), + ], + ); + assert_eq!(snapshot6.get(b"a").unwrap(), Some(Bytes::from_static(b"4"))); + assert_eq!(snapshot6.get(b"b").unwrap(), None); + assert_eq!(snapshot6.get(b"c").unwrap(), Some(Bytes::from_static(b"5"))); + check_lsm_iter_result_by_key( + &mut snapshot6.scan(Bound::Unbounded, Bound::Unbounded).unwrap(), + vec![ + (Bytes::from("a"), Bytes::from("4")), + (Bytes::from("c"), Bytes::from("5")), + ], + ); +} + +#[test] +fn test_task2_lsm_iterator_mvcc() { + let dir = tempdir().unwrap(); + let mut options = LsmStorageOptions::default_for_week2_test(CompactionOptions::NoCompaction); + options.enable_wal = true; + let storage = MiniLsm::open(&dir, options.clone()).unwrap(); + storage.put(b"a", b"1").unwrap(); + storage.put(b"b", b"1").unwrap(); + let snapshot1 = storage.new_txn().unwrap(); + storage.put(b"a", b"2").unwrap(); + let snapshot2 = storage.new_txn().unwrap(); + storage.delete(b"b").unwrap(); + storage.put(b"c", b"1").unwrap(); + let snapshot3 = storage.new_txn().unwrap(); + storage.force_flush().unwrap(); + assert_eq!(snapshot1.get(b"a").unwrap(), Some(Bytes::from_static(b"1"))); + assert_eq!(snapshot1.get(b"b").unwrap(), Some(Bytes::from_static(b"1"))); + assert_eq!(snapshot1.get(b"c").unwrap(), None); + check_lsm_iter_result_by_key( + &mut snapshot1.scan(Bound::Unbounded, Bound::Unbounded).unwrap(), + vec![ + (Bytes::from("a"), Bytes::from("1")), + (Bytes::from("b"), Bytes::from("1")), + ], + ); + assert_eq!(snapshot2.get(b"a").unwrap(), Some(Bytes::from_static(b"2"))); + 
assert_eq!(snapshot2.get(b"b").unwrap(), Some(Bytes::from_static(b"1"))); + assert_eq!(snapshot2.get(b"c").unwrap(), None); + check_lsm_iter_result_by_key( + &mut snapshot2.scan(Bound::Unbounded, Bound::Unbounded).unwrap(), + vec![ + (Bytes::from("a"), Bytes::from("2")), + (Bytes::from("b"), Bytes::from("1")), + ], + ); + assert_eq!(snapshot3.get(b"a").unwrap(), Some(Bytes::from_static(b"2"))); + assert_eq!(snapshot3.get(b"b").unwrap(), None); + assert_eq!(snapshot3.get(b"c").unwrap(), Some(Bytes::from_static(b"1"))); + check_lsm_iter_result_by_key( + &mut snapshot3.scan(Bound::Unbounded, Bound::Unbounded).unwrap(), + vec![ + (Bytes::from("a"), Bytes::from("2")), + (Bytes::from("c"), Bytes::from("1")), + ], + ); + storage.put(b"a", b"3").unwrap(); + storage.put(b"b", b"3").unwrap(); + let snapshot4 = storage.new_txn().unwrap(); + storage.put(b"a", b"4").unwrap(); + let snapshot5 = storage.new_txn().unwrap(); + storage.delete(b"b").unwrap(); + storage.put(b"c", b"5").unwrap(); + let snapshot6 = storage.new_txn().unwrap(); + storage.force_flush().unwrap(); + assert_eq!(snapshot1.get(b"a").unwrap(), Some(Bytes::from_static(b"1"))); + assert_eq!(snapshot1.get(b"b").unwrap(), Some(Bytes::from_static(b"1"))); + assert_eq!(snapshot1.get(b"c").unwrap(), None); + check_lsm_iter_result_by_key( + &mut snapshot1.scan(Bound::Unbounded, Bound::Unbounded).unwrap(), + vec![ + (Bytes::from("a"), Bytes::from("1")), + (Bytes::from("b"), Bytes::from("1")), + ], + ); + assert_eq!(snapshot2.get(b"a").unwrap(), Some(Bytes::from_static(b"2"))); + assert_eq!(snapshot2.get(b"b").unwrap(), Some(Bytes::from_static(b"1"))); + assert_eq!(snapshot2.get(b"c").unwrap(), None); + check_lsm_iter_result_by_key( + &mut snapshot2.scan(Bound::Unbounded, Bound::Unbounded).unwrap(), + vec![ + (Bytes::from("a"), Bytes::from("2")), + (Bytes::from("b"), Bytes::from("1")), + ], + ); + assert_eq!(snapshot3.get(b"a").unwrap(), Some(Bytes::from_static(b"2"))); + assert_eq!(snapshot3.get(b"b").unwrap(), None); + 
assert_eq!(snapshot3.get(b"c").unwrap(), Some(Bytes::from_static(b"1"))); + check_lsm_iter_result_by_key( + &mut snapshot3.scan(Bound::Unbounded, Bound::Unbounded).unwrap(), + vec![ + (Bytes::from("a"), Bytes::from("2")), + (Bytes::from("c"), Bytes::from("1")), + ], + ); + assert_eq!(snapshot4.get(b"a").unwrap(), Some(Bytes::from_static(b"3"))); + assert_eq!(snapshot4.get(b"b").unwrap(), Some(Bytes::from_static(b"3"))); + assert_eq!(snapshot4.get(b"c").unwrap(), Some(Bytes::from_static(b"1"))); + check_lsm_iter_result_by_key( + &mut snapshot4.scan(Bound::Unbounded, Bound::Unbounded).unwrap(), + vec![ + (Bytes::from("a"), Bytes::from("3")), + (Bytes::from("b"), Bytes::from("3")), + (Bytes::from("c"), Bytes::from("1")), + ], + ); + assert_eq!(snapshot5.get(b"a").unwrap(), Some(Bytes::from_static(b"4"))); + assert_eq!(snapshot5.get(b"b").unwrap(), Some(Bytes::from_static(b"3"))); + assert_eq!(snapshot5.get(b"c").unwrap(), Some(Bytes::from_static(b"1"))); + check_lsm_iter_result_by_key( + &mut snapshot5.scan(Bound::Unbounded, Bound::Unbounded).unwrap(), + vec![ + (Bytes::from("a"), Bytes::from("4")), + (Bytes::from("b"), Bytes::from("3")), + (Bytes::from("c"), Bytes::from("1")), + ], + ); + assert_eq!(snapshot6.get(b"a").unwrap(), Some(Bytes::from_static(b"4"))); + assert_eq!(snapshot6.get(b"b").unwrap(), None); + assert_eq!(snapshot6.get(b"c").unwrap(), Some(Bytes::from_static(b"5"))); + check_lsm_iter_result_by_key( + &mut snapshot6.scan(Bound::Unbounded, Bound::Unbounded).unwrap(), + vec![ + (Bytes::from("a"), Bytes::from("4")), + (Bytes::from("c"), Bytes::from("5")), + ], + ); + check_lsm_iter_result_by_key( + &mut snapshot6 + .scan(Bound::Included(b"a"), Bound::Included(b"a")) + .unwrap(), + vec![(Bytes::from("a"), Bytes::from("4"))], + ); + check_lsm_iter_result_by_key( + &mut snapshot6 + .scan(Bound::Excluded(b"a"), Bound::Excluded(b"c")) + .unwrap(), + vec![], + ); +} + +#[test] +fn test_task3_sst_ts() { + let mut builder = SsTableBuilder::new(16); + 
builder.add(KeySlice::for_testing_from_slice_with_ts(b"11", 1), b"11"); + builder.add(KeySlice::for_testing_from_slice_with_ts(b"22", 2), b"22"); + builder.add(KeySlice::for_testing_from_slice_with_ts(b"33", 3), b"11"); + builder.add(KeySlice::for_testing_from_slice_with_ts(b"44", 4), b"22"); + builder.add(KeySlice::for_testing_from_slice_with_ts(b"55", 5), b"11"); + builder.add(KeySlice::for_testing_from_slice_with_ts(b"66", 6), b"22"); + let dir = tempdir().unwrap(); + let sst = builder.build_for_test(dir.path().join("1.sst")).unwrap(); + assert_eq!(sst.max_ts(), 6); +} From 26abb62d49e363a4987f27b26dd301c450bc4da7 Mon Sep 17 00:00:00 2001 From: husharp Date: Sun, 25 Feb 2024 20:41:53 +0800 Subject: [PATCH 18/22] support watermark and gc Signed-off-by: husharp --- mini-lsm-starter/src/compact.rs | 32 +++- mini-lsm-starter/src/mvcc.rs | 7 +- mini-lsm-starter/src/mvcc/txn.rs | 4 +- mini-lsm-starter/src/mvcc/watermark.rs | 25 ++- mini-lsm-starter/src/tests.rs | 1 + mini-lsm-starter/src/tests/week3_day4.rs | 189 +++++++++++++++++++++++ 6 files changed, 249 insertions(+), 9 deletions(-) create mode 100644 mini-lsm-starter/src/tests/week3_day4.rs diff --git a/mini-lsm-starter/src/compact.rs b/mini-lsm-starter/src/compact.rs index f21413c3a..554eee7d1 100644 --- a/mini-lsm-starter/src/compact.rs +++ b/mini-lsm-starter/src/compact.rs @@ -118,22 +118,48 @@ impl LsmStorageInner { fn compact_generate_sst_from_iter( &self, mut iter: impl for<'a> StorageIterator = KeySlice<'a>>, - _compact_to_bottom_level: bool, + compact_to_bottom_level: bool, ) -> Result>> { let mut new_ssts = Vec::new(); // compact the iterators let mut builder = None; let mut last_key = Vec::::new(); + // All ts (strictly) below this ts can be garbage collected. + let gc_ts = self.mvcc().watermark(); + let mut last_gc_key = Vec::::new(); while iter.is_valid() { if builder.is_none() { builder = Some(SsTableBuilder::new(self.options.block_size)); } let builder_inner = builder.as_mut().unwrap(); + // 1. 
for all versions of a key below or equal to the watermark, keep the latest version. + if iter.key().ts() <= gc_ts { + let same_as_last_gc_key = iter.key().key_ref() == last_gc_key; + if !same_as_last_gc_key { + last_gc_key.clear(); + last_gc_key.extend(iter.key().key_ref()); + } + + if same_as_last_gc_key { + iter.next()?; + continue; + } + } + let same_as_last_key = iter.key().key_ref() == last_key; + // 2. gc the version of a key is empty and below the watermark + if compact_to_bottom_level && iter.key().ts() <= gc_ts && iter.value().is_empty() { + // if this key is not same as the last key, need to update last_key + if !same_as_last_key { + last_key.clear(); + last_key.extend(iter.key().key_ref()); + } + iter.next()?; + continue; + } - // keep ALL versions of a key during the compaction.(NOT remove empty keys for now[week 3, day 2]) - // the same key with different timestamps are put in the same SST file, even if it exceeds the SST size limit + // the same key with different timestamps are put in the same SST file, even if it exceeds the SST size limit if builder_inner.estimated_size() >= self.options.target_sst_size && !same_as_last_key { let sst_id = self.next_sst_id(); let old_builder = builder.take().unwrap(); diff --git a/mini-lsm-starter/src/mvcc.rs b/mini-lsm-starter/src/mvcc.rs index 54a0e8e02..7bb4cf519 100644 --- a/mini-lsm-starter/src/mvcc.rs +++ b/mini-lsm-starter/src/mvcc.rs @@ -2,7 +2,7 @@ #![allow(dead_code)] // TODO(you): remove this lint after implementing this mod pub mod txn; -mod watermark; +pub mod watermark; use std::{ collections::{BTreeMap, HashSet}, @@ -55,8 +55,11 @@ impl LsmMvccInner { } pub fn new_txn(&self, inner: Arc, serializable: bool) -> Arc { + let mut ts = self.ts.lock(); + let read_ts = ts.0; + ts.1.add_reader(read_ts); Arc::new(Transaction { - read_ts: self.ts.lock().0, + read_ts, inner, local_storage: Arc::new(Default::default()), committed: Arc::new(Default::default()), diff --git a/mini-lsm-starter/src/mvcc/txn.rs 
b/mini-lsm-starter/src/mvcc/txn.rs index 5673d6c0d..379352e84 100644 --- a/mini-lsm-starter/src/mvcc/txn.rs +++ b/mini-lsm-starter/src/mvcc/txn.rs @@ -54,7 +54,9 @@ impl Transaction { } impl Drop for Transaction { - fn drop(&mut self) {} + fn drop(&mut self) { + self.inner.mvcc().ts.lock().1.remove_reader(self.read_ts); + } } type SkipMapRangeIter<'a> = diff --git a/mini-lsm-starter/src/mvcc/watermark.rs b/mini-lsm-starter/src/mvcc/watermark.rs index 4bbb4fa01..617df2369 100644 --- a/mini-lsm-starter/src/mvcc/watermark.rs +++ b/mini-lsm-starter/src/mvcc/watermark.rs @@ -3,10 +3,17 @@ use std::collections::BTreeMap; +// Watermark is the lowest read timestamp among all in-progress transactions. pub struct Watermark { readers: BTreeMap, } +impl Default for Watermark { + fn default() -> Self { + Self::new() + } +} + impl Watermark { pub fn new() -> Self { Self { @@ -14,11 +21,23 @@ impl Watermark { } } - pub fn add_reader(&mut self, ts: u64) {} + pub fn add_reader(&mut self, ts: u64) { + *self.readers.entry(ts).or_default() += 1; + } + + pub fn remove_reader(&mut self, ts: u64) { + let cnt = self.readers.get_mut(&ts).unwrap(); + *cnt -= 1; + if *cnt == 0 { + self.readers.remove(&ts); + } + } - pub fn remove_reader(&mut self, ts: u64) {} + pub fn num_retained_snapshots(&self) -> usize { + self.readers.len() + } pub fn watermark(&self) -> Option { - Some(0) + self.readers.first_key_value().map(|(ts, _)| *ts) } } diff --git a/mini-lsm-starter/src/tests.rs b/mini-lsm-starter/src/tests.rs index 261af8eca..90a0af52d 100644 --- a/mini-lsm-starter/src/tests.rs +++ b/mini-lsm-starter/src/tests.rs @@ -18,3 +18,4 @@ mod week2_day6; mod week3_day1; mod week3_day2; mod week3_day3; +mod week3_day4; diff --git a/mini-lsm-starter/src/tests/week3_day4.rs b/mini-lsm-starter/src/tests/week3_day4.rs new file mode 100644 index 000000000..428a2e605 --- /dev/null +++ b/mini-lsm-starter/src/tests/week3_day4.rs @@ -0,0 +1,189 @@ +use bytes::Bytes; +use tempfile::tempdir; + +use crate::{ + 
compact::CompactionOptions, + lsm_storage::{LsmStorageOptions, MiniLsm, WriteBatchRecord}, + mvcc::watermark::Watermark, +}; + +use super::harness::{check_iter_result_by_key, construct_merge_iterator_over_storage}; + +#[test] +fn test_task1_watermark() { + let mut watermark = Watermark::new(); + watermark.add_reader(0); + for i in 1..=1000 { + watermark.add_reader(i); + assert_eq!(watermark.watermark(), Some(0)); + assert_eq!(watermark.num_retained_snapshots(), i as usize + 1); + } + let mut cnt = 1001; + for i in 0..500 { + watermark.remove_reader(i); + assert_eq!(watermark.watermark(), Some(i + 1)); + cnt -= 1; + assert_eq!(watermark.num_retained_snapshots(), cnt); + } + for i in (501..=1000).rev() { + watermark.remove_reader(i); + assert_eq!(watermark.watermark(), Some(500)); + cnt -= 1; + assert_eq!(watermark.num_retained_snapshots(), cnt); + } + watermark.remove_reader(500); + assert_eq!(watermark.watermark(), None); + assert_eq!(watermark.num_retained_snapshots(), 0); + watermark.add_reader(2000); + watermark.add_reader(2000); + watermark.add_reader(2001); + assert_eq!(watermark.num_retained_snapshots(), 2); + assert_eq!(watermark.watermark(), Some(2000)); + watermark.remove_reader(2000); + assert_eq!(watermark.num_retained_snapshots(), 2); + assert_eq!(watermark.watermark(), Some(2000)); + watermark.remove_reader(2000); + assert_eq!(watermark.num_retained_snapshots(), 1); + assert_eq!(watermark.watermark(), Some(2001)); +} + +#[test] +fn test_task2_snapshot_watermark() { + let dir = tempdir().unwrap(); + let options = LsmStorageOptions::default_for_week2_test(CompactionOptions::NoCompaction); + let storage = MiniLsm::open(&dir, options.clone()).unwrap(); + let txn1 = storage.new_txn().unwrap(); + let txn2 = storage.new_txn().unwrap(); + storage.put(b"233", b"23333").unwrap(); + let txn3 = storage.new_txn().unwrap(); + assert_eq!(storage.inner.mvcc().watermark(), txn1.read_ts); + drop(txn1); + assert_eq!(storage.inner.mvcc().watermark(), txn2.read_ts); + 
drop(txn2); + assert_eq!(storage.inner.mvcc().watermark(), txn3.read_ts); + drop(txn3); + assert_eq!( + storage.inner.mvcc().watermark(), + storage.inner.mvcc().latest_commit_ts() + ); +} + +#[test] +fn test_task3_mvcc_compaction() { + let dir = tempdir().unwrap(); + let options = LsmStorageOptions::default_for_week2_test(CompactionOptions::NoCompaction); + let storage = MiniLsm::open(&dir, options.clone()).unwrap(); + let snapshot0 = storage.new_txn().unwrap(); + storage + .write_batch(&[ + WriteBatchRecord::Put(b"a", b"1"), + WriteBatchRecord::Put(b"b", b"1"), + ]) + .unwrap(); + let snapshot1 = storage.new_txn().unwrap(); + storage + .write_batch(&[ + WriteBatchRecord::Put(b"a", b"2"), + WriteBatchRecord::Put(b"d", b"2"), + ]) + .unwrap(); + let snapshot2 = storage.new_txn().unwrap(); + storage + .write_batch(&[ + WriteBatchRecord::Put(b"a", b"3"), + WriteBatchRecord::Del(b"d"), + ]) + .unwrap(); + let snapshot3 = storage.new_txn().unwrap(); + storage + .write_batch(&[ + WriteBatchRecord::Put(b"c", b"4"), + WriteBatchRecord::Del(b"a"), + ]) + .unwrap(); + + storage.force_flush().unwrap(); + storage.force_full_compaction().unwrap(); + storage.dump_structure(); + + let mut iter = construct_merge_iterator_over_storage(&storage.inner.state.read()); + check_iter_result_by_key( + &mut iter, + vec![ + (Bytes::from("a"), Bytes::new()), + (Bytes::from("a"), Bytes::from("3")), + (Bytes::from("a"), Bytes::from("2")), + (Bytes::from("a"), Bytes::from("1")), + (Bytes::from("b"), Bytes::from("1")), + (Bytes::from("c"), Bytes::from("4")), + (Bytes::from("d"), Bytes::new()), + (Bytes::from("d"), Bytes::from("2")), + ], + ); + + drop(snapshot0); + storage.force_full_compaction().unwrap(); + storage.dump_structure(); + + let mut iter = construct_merge_iterator_over_storage(&storage.inner.state.read()); + check_iter_result_by_key( + &mut iter, + vec![ + (Bytes::from("a"), Bytes::new()), + (Bytes::from("a"), Bytes::from("3")), + (Bytes::from("a"), Bytes::from("2")), + 
(Bytes::from("a"), Bytes::from("1")), + (Bytes::from("b"), Bytes::from("1")), + (Bytes::from("c"), Bytes::from("4")), + (Bytes::from("d"), Bytes::new()), + (Bytes::from("d"), Bytes::from("2")), + ], + ); + + drop(snapshot1); + storage.force_full_compaction().unwrap(); + storage.dump_structure(); + + println!("snapshot2: {:?}", snapshot2.read_ts); + let mut iter = construct_merge_iterator_over_storage(&storage.inner.state.read()); + check_iter_result_by_key( + &mut iter, + vec![ + (Bytes::from("a"), Bytes::new()), + (Bytes::from("a"), Bytes::from("3")), + (Bytes::from("a"), Bytes::from("2")), + (Bytes::from("b"), Bytes::from("1")), + (Bytes::from("c"), Bytes::from("4")), + (Bytes::from("d"), Bytes::new()), + (Bytes::from("d"), Bytes::from("2")), + ], + ); + + drop(snapshot2); + storage.force_full_compaction().unwrap(); + storage.dump_structure(); + + let mut iter = construct_merge_iterator_over_storage(&storage.inner.state.read()); + check_iter_result_by_key( + &mut iter, + vec![ + (Bytes::from("a"), Bytes::new()), + (Bytes::from("a"), Bytes::from("3")), + (Bytes::from("b"), Bytes::from("1")), + (Bytes::from("c"), Bytes::from("4")), + ], + ); + + drop(snapshot3); + storage.force_full_compaction().unwrap(); + storage.dump_structure(); + + let mut iter = construct_merge_iterator_over_storage(&storage.inner.state.read()); + check_iter_result_by_key( + &mut iter, + vec![ + (Bytes::from("b"), Bytes::from("1")), + (Bytes::from("c"), Bytes::from("4")), + ], + ); +} From 26256f4f86d22434a292cad0051c78f07e53c54d Mon Sep 17 00:00:00 2001 From: husharp Date: Sun, 25 Feb 2024 21:20:11 +0800 Subject: [PATCH 19/22] support local storage in txn Signed-off-by: husharp --- mini-lsm-starter/src/mem_table.rs | 3 +- mini-lsm-starter/src/mvcc/txn.rs | 109 +++++++++++++++++++---- mini-lsm-starter/src/tests.rs | 1 + mini-lsm-starter/src/tests/week3_day5.rs | 75 ++++++++++++++++ 4 files changed, 170 insertions(+), 18 deletions(-) create mode 100644 mini-lsm-starter/src/tests/week3_day5.rs 
diff --git a/mini-lsm-starter/src/mem_table.rs b/mini-lsm-starter/src/mem_table.rs index 0d1d8589f..6ba0f2d8b 100644 --- a/mini-lsm-starter/src/mem_table.rs +++ b/mini-lsm-starter/src/mem_table.rs @@ -7,8 +7,7 @@ use std::sync::Arc; use anyhow::Result; use bytes::Bytes; -use crossbeam_skiplist::map::Entry; -use crossbeam_skiplist::SkipMap; +use crossbeam_skiplist::{map::Entry, SkipMap}; use ouroboros::self_referencing; use crate::iterators::StorageIterator; diff --git a/mini-lsm-starter/src/mvcc/txn.rs b/mini-lsm-starter/src/mvcc/txn.rs index 379352e84..ea9f5347a 100644 --- a/mini-lsm-starter/src/mvcc/txn.rs +++ b/mini-lsm-starter/src/mvcc/txn.rs @@ -4,21 +4,26 @@ use std::{ collections::HashSet, ops::Bound, - sync::{atomic::AtomicBool, Arc}, + sync::{ + atomic::{AtomicBool, Ordering}, + Arc, + }, }; use anyhow::Result; use bytes::Bytes; -use crossbeam_skiplist::SkipMap; +use crossbeam_skiplist::{map::Entry, SkipMap}; use ouroboros::self_referencing; use parking_lot::Mutex; use crate::{ - iterators::StorageIterator, + iterators::{two_merge_iterator::TwoMergeIterator, StorageIterator}, lsm_iterator::{FusedIterator, LsmIterator}, - lsm_storage::LsmStorageInner, + lsm_storage::{LsmStorageInner, WriteBatchRecord}, + mem_table::map_bound, }; + pub struct Transaction { pub(crate) read_ts: u64, pub(crate) inner: Arc, @@ -30,26 +35,72 @@ pub struct Transaction { impl Transaction { pub fn get(&self, key: &[u8]) -> Result> { + if self.committed.load(Ordering::SeqCst) { + panic!("cannot operate on committed txn!"); + } + if let Some(entry) = self.local_storage.get(key) { + if entry.value().is_empty() { + return Ok(None); + } + return Ok(Some(entry.value().clone())); + } self.inner.get_with_txn(key, self.read_ts) } pub fn scan(self: &Arc, lower: Bound<&[u8]>, upper: Bound<&[u8]>) -> Result { + if self.committed.load(Ordering::SeqCst) { + panic!("cannot operate on committed txn!"); + } + let mut local_iter = TxnLocalIteratorBuilder { + map: self.local_storage.clone(), + 
iter_builder: |map| map.range((map_bound(lower), map_bound(upper))), + item: (Bytes::new(), Bytes::new()), + } + .build(); + + // handle deletions + let next = local_iter.with_iter_mut(|iter| TxnLocalIterator::entry_to_item(iter.next())); + local_iter.with_mut(|x| *x.item = next); + TxnIterator::create( self.clone(), - self.inner.scan_with_txn(lower, upper, self.read_ts)?, + TwoMergeIterator::create( + local_iter, + self.inner.scan_with_txn(lower, upper, self.read_ts)?, + )?, ) } pub fn put(&self, key: &[u8], value: &[u8]) { - unimplemented!() + if self.committed.load(Ordering::SeqCst) { + panic!("cannot operate on committed txn!"); + } + self.local_storage + + .insert(Bytes::copy_from_slice(key), Bytes::copy_from_slice(value)); } pub fn delete(&self, key: &[u8]) { - unimplemented!() + if self.committed.load(Ordering::SeqCst) { + panic!("cannot operate on committed txn!"); + } + self.local_storage + .insert(Bytes::copy_from_slice(key), Bytes::new()); } pub fn commit(&self) -> Result<()> { - unimplemented!() + self.committed + .compare_exchange(false, true, Ordering::SeqCst, Ordering::SeqCst) + .expect("cannot operate on committed txn!"); + let batch = self.local_storage.iter().map(|entry| { + if entry.value().is_empty() { + WriteBatchRecord::Del(entry.key().clone()) + } else { + WriteBatchRecord::Put(entry.key().clone(), entry.value().clone()) + } + }).collect::>(); + self.inner.write_batch(&batch)?; + Ok(()) } } @@ -74,34 +125,59 @@ pub struct TxnLocalIterator { item: (Bytes, Bytes), } +impl TxnLocalIterator { + // This function is used to convert a `SkipMap` entry to a key-value pair. 
+ fn entry_to_item(entry: Option>) -> (Bytes, Bytes) { + entry + .map(|x| (x.key().clone(), x.value().clone())) + .unwrap_or_else(|| (Bytes::new(), Bytes::new())) + } +} + impl StorageIterator for TxnLocalIterator { type KeyType<'a> = &'a [u8]; fn value(&self) -> &[u8] { - unimplemented!() + &self.borrow_item().1 } fn key(&self) -> &[u8] { - unimplemented!() + &self.borrow_item().0[..] } fn is_valid(&self) -> bool { - unimplemented!() + !self.borrow_item().0.is_empty() } fn next(&mut self) -> Result<()> { - unimplemented!() + let next = self.with_iter_mut(|iter| TxnLocalIterator::entry_to_item(iter.next())); + self.with_item_mut(|item| *item = next); + Ok(()) } } pub struct TxnIterator { txn: Arc, - iter: FusedIterator, + iter: TwoMergeIterator>, } impl TxnIterator { - pub fn create(txn: Arc, iter: FusedIterator) -> Result { - Ok(Self { txn, iter }) + pub fn create( + txn: Arc, + iter: TwoMergeIterator>, + ) -> Result { + let mut iter = Self { txn, iter }; + iter.skip_deletes()?; + Ok(iter) + } + + // TwoMergeIterator will retain the deletion markers in the child iterators, + // we need to modify your TxnIterator implementation to correctly handle deletions. 
+ fn skip_deletes(&mut self) -> Result<()> { + while self.iter.is_valid() && self.iter.value().is_empty() { + self.iter.next()?; + } + Ok(()) } } @@ -121,7 +197,8 @@ impl StorageIterator for TxnIterator { } fn next(&mut self) -> Result<()> { - self.iter.next() + self.iter.next()?; + self.skip_deletes() } fn num_active_iterators(&self) -> usize { diff --git a/mini-lsm-starter/src/tests.rs b/mini-lsm-starter/src/tests.rs index 90a0af52d..a8b0f8600 100644 --- a/mini-lsm-starter/src/tests.rs +++ b/mini-lsm-starter/src/tests.rs @@ -19,3 +19,4 @@ mod week3_day1; mod week3_day2; mod week3_day3; mod week3_day4; +mod week3_day5; diff --git a/mini-lsm-starter/src/tests/week3_day5.rs b/mini-lsm-starter/src/tests/week3_day5.rs new file mode 100644 index 000000000..6a6b3bdbf --- /dev/null +++ b/mini-lsm-starter/src/tests/week3_day5.rs @@ -0,0 +1,75 @@ +use std::ops::Bound; + +use bytes::Bytes; +use tempfile::tempdir; + +use crate::{ + compact::CompactionOptions, + lsm_storage::{LsmStorageOptions, MiniLsm}, + tests::harness::check_lsm_iter_result_by_key, +}; + +#[test] +fn test_txn_integration() { + let dir = tempdir().unwrap(); + let options = LsmStorageOptions::default_for_week2_test(CompactionOptions::NoCompaction); + let storage = MiniLsm::open(&dir, options.clone()).unwrap(); + let txn1 = storage.new_txn().unwrap(); + let txn2 = storage.new_txn().unwrap(); + txn1.put(b"test1", b"233"); + txn2.put(b"test2", b"233"); + check_lsm_iter_result_by_key( + &mut txn1.scan(Bound::Unbounded, Bound::Unbounded).unwrap(), + vec![(Bytes::from("test1"), Bytes::from("233"))], + ); + check_lsm_iter_result_by_key( + &mut txn2.scan(Bound::Unbounded, Bound::Unbounded).unwrap(), + vec![(Bytes::from("test2"), Bytes::from("233"))], + ); + let txn3 = storage.new_txn().unwrap(); + check_lsm_iter_result_by_key( + &mut txn3.scan(Bound::Unbounded, Bound::Unbounded).unwrap(), + vec![], + ); + txn1.commit().unwrap(); + txn2.commit().unwrap(); + check_lsm_iter_result_by_key( + &mut 
txn3.scan(Bound::Unbounded, Bound::Unbounded).unwrap(), + vec![], + ); + drop(txn3); + check_lsm_iter_result_by_key( + &mut storage.scan(Bound::Unbounded, Bound::Unbounded).unwrap(), + vec![ + (Bytes::from("test1"), Bytes::from("233")), + (Bytes::from("test2"), Bytes::from("233")), + ], + ); + let txn4 = storage.new_txn().unwrap(); + assert_eq!(txn4.get(b"test1").unwrap(), Some(Bytes::from("233"))); + assert_eq!(txn4.get(b"test2").unwrap(), Some(Bytes::from("233"))); + check_lsm_iter_result_by_key( + &mut txn4.scan(Bound::Unbounded, Bound::Unbounded).unwrap(), + vec![ + (Bytes::from("test1"), Bytes::from("233")), + (Bytes::from("test2"), Bytes::from("233")), + ], + ); + txn4.put(b"test2", b"2333"); + assert_eq!(txn4.get(b"test1").unwrap(), Some(Bytes::from("233"))); + assert_eq!(txn4.get(b"test2").unwrap(), Some(Bytes::from("2333"))); + check_lsm_iter_result_by_key( + &mut txn4.scan(Bound::Unbounded, Bound::Unbounded).unwrap(), + vec![ + (Bytes::from("test1"), Bytes::from("233")), + (Bytes::from("test2"), Bytes::from("2333")), + ], + ); + txn4.delete(b"test2"); + assert_eq!(txn4.get(b"test1").unwrap(), Some(Bytes::from("233"))); + assert_eq!(txn4.get(b"test2").unwrap(), None); + check_lsm_iter_result_by_key( + &mut txn4.scan(Bound::Unbounded, Bound::Unbounded).unwrap(), + vec![(Bytes::from("test1"), Bytes::from("233"))], + ); +} From b5d378f034edd0c4e54cd893fd19665f9ab4c5e4 Mon Sep 17 00:00:00 2001 From: husharp Date: Tue, 27 Feb 2024 17:59:38 +0800 Subject: [PATCH 20/22] Serializable Snapshot Isolation Signed-off-by: husharp --- mini-lsm-starter/src/lsm_storage.rs | 75 ++++++++++++---- mini-lsm-starter/src/mvcc.rs | 9 +- mini-lsm-starter/src/mvcc/txn.rs | 106 +++++++++++++++++++--- mini-lsm-starter/src/mvcc/watermark.rs | 3 - mini-lsm-starter/src/tests.rs | 1 + mini-lsm-starter/src/tests/week3_day6.rs | 108 +++++++++++++++++++++++ 6 files changed, 268 insertions(+), 34 deletions(-) create mode 100644 mini-lsm-starter/src/tests/week3_day6.rs diff --git 
a/mini-lsm-starter/src/lsm_storage.rs b/mini-lsm-starter/src/lsm_storage.rs index 0dea87119..a4ea54b43 100644 --- a/mini-lsm-starter/src/lsm_storage.rs +++ b/mini-lsm-starter/src/lsm_storage.rs @@ -667,8 +667,8 @@ impl LsmStorageInner { if check_intersect_of_range( lower, upper, - sst.first_key().key_ref(), - sst.last_key().key_ref(), + sst.first_key().as_key_slice(), + sst.last_key().as_key_slice(), ) { // SST iterator does not support passing an end bound to it. // Therefore, need to handle the end_bound manually in LsmIterator @@ -703,7 +703,15 @@ impl LsmStorageInner { println!("level: {}", level); let mut ssts = Vec::with_capacity(level_sst_ids.len()); for sst_id in level_sst_ids.iter() { - ssts.push(snapshot.sstables[sst_id].clone()); + let sst = snapshot.sstables[sst_id].clone(); + if check_intersect_of_range( + lower, + upper, + sst.first_key().as_key_slice(), + sst.last_key().as_key_slice(), + ) { + ssts.push(sst); + } } let concat_iter = match lower { Bound::Included(key) => SstConcatIterator::create_and_seek_to_key( @@ -737,7 +745,30 @@ impl LsmStorageInner { } /// Write a batch of data into the storage. Implement in week 2 day 7. 
- pub fn write_batch>(&self, batch: &[WriteBatchRecord]) -> Result<()> { + pub fn write_batch>( + self: &Arc, + batch: &[WriteBatchRecord], + ) -> Result<()> { + if self.options.serializable { + let txn = self.mvcc().new_txn(self.clone(), true); + for record in batch { + match record { + WriteBatchRecord::Put(key, value) => { + txn.put(key.as_ref(), value.as_ref()); + } + WriteBatchRecord::Del(key) => { + txn.delete(key.as_ref()); + } + } + } + txn.commit()?; + } else { + self.write_batch_inner(batch)?; + } + Ok(()) + } + + pub fn write_batch_inner>(&self, batch: &[WriteBatchRecord]) -> Result { let _lck = self.mvcc().write_lock.lock(); let ts = self.mvcc().latest_commit_ts() + 1; for record in batch { @@ -772,17 +803,31 @@ impl LsmStorageInner { } } self.mvcc().update_commit_ts(ts); - Ok(()) + Ok(ts) } /// Put a key-value pair into the storage by writing into the current memtable. - pub fn put(&self, key: &[u8], value: &[u8]) -> Result<()> { - self.write_batch(&[WriteBatchRecord::Put(key, value)]) + pub fn put(self: &Arc, key: &[u8], value: &[u8]) -> Result<()> { + if self.options.serializable { + let txn = self.mvcc().new_txn(self.clone(), true); + txn.put(key, value); + txn.commit()?; + } else { + self.write_batch_inner(&[WriteBatchRecord::Put(key, value)])?; + } + Ok(()) } /// Remove a key from the storage by writing an empty value. 
- pub fn delete(&self, key: &[u8]) -> Result<()> { - self.write_batch(&[WriteBatchRecord::Del(key)]) + pub fn delete(self: &Arc, key: &[u8]) -> Result<()> { + if self.options.serializable { + let txn = self.mvcc().new_txn(self.clone(), true); + txn.delete(key); + txn.commit()?; + } else { + self.write_batch_inner(&[WriteBatchRecord::Del(key)])?; + } + Ok(()) } } @@ -797,27 +842,27 @@ fn key_within(key: &[u8], sst_begin: &[u8], sst_end: &[u8]) -> bool { fn check_intersect_of_range( begin: Bound<&[u8]>, end: Bound<&[u8]>, - sst_begin: &[u8], - sst_end: &[u8], + sst_begin: KeySlice, + sst_end: KeySlice, ) -> bool { println!( "intersected: {:?} {:?}, sst: {:?} {:?}", begin, end, sst_begin, sst_end ); match end { - Bound::Excluded(key) if key <= sst_begin => { + Bound::Excluded(key) if key <= sst_begin.key_ref() => { return false; } - Bound::Included(key) if key < sst_begin => { + Bound::Included(key) if key < sst_begin.key_ref() => { return false; } _ => {} } match begin { - Bound::Excluded(key) if sst_end <= key => { + Bound::Excluded(key) if sst_end.key_ref() <= key => { return false; } - Bound::Included(key) if sst_end < key => { + Bound::Included(key) if sst_end.key_ref() < key => { return false; } _ => {} diff --git a/mini-lsm-starter/src/mvcc.rs b/mini-lsm-starter/src/mvcc.rs index 7bb4cf519..1e4f94d51 100644 --- a/mini-lsm-starter/src/mvcc.rs +++ b/mini-lsm-starter/src/mvcc.rs @@ -1,6 +1,3 @@ -#![allow(unused_variables)] // TODO(you): remove this lint after implementing this mod -#![allow(dead_code)] // TODO(you): remove this lint after implementing this mod - pub mod txn; pub mod watermark; @@ -63,7 +60,11 @@ impl LsmMvccInner { inner, local_storage: Arc::new(Default::default()), committed: Arc::new(Default::default()), - key_hashes: None, + key_hashes: if serializable { + Some(Mutex::new((HashSet::new(), HashSet::new()))) + } else { + None + }, }) } } diff --git a/mini-lsm-starter/src/mvcc/txn.rs b/mini-lsm-starter/src/mvcc/txn.rs index ea9f5347a..12c9d0ecc 
100644 --- a/mini-lsm-starter/src/mvcc/txn.rs +++ b/mini-lsm-starter/src/mvcc/txn.rs @@ -1,6 +1,3 @@ -#![allow(unused_variables)] // TODO(you): remove this lint after implementing this mod -#![allow(dead_code)] // TODO(you): remove this lint after implementing this mod - use std::{ collections::HashSet, ops::Bound, @@ -23,6 +20,7 @@ use crate::{ mem_table::map_bound, }; +use super::CommittedTxnData; pub struct Transaction { pub(crate) read_ts: u64, @@ -38,6 +36,13 @@ impl Transaction { if self.committed.load(Ordering::SeqCst) { panic!("cannot operate on committed txn!"); } + + if let Some(guard) = &self.key_hashes { + let mut guard = guard.lock(); + let (_, read_set) = &mut *guard; + read_set.insert(farmhash::hash32(key)); + } + if let Some(entry) = self.local_storage.get(key) { if entry.value().is_empty() { return Ok(None); @@ -75,8 +80,12 @@ impl Transaction { if self.committed.load(Ordering::SeqCst) { panic!("cannot operate on committed txn!"); } + if let Some(key_hashes) = &self.key_hashes { + let mut key_hashes = key_hashes.lock(); + let (write_hashes, _) = &mut *key_hashes; + write_hashes.insert(farmhash::hash32(key)); + } self.local_storage - .insert(Bytes::copy_from_slice(key), Bytes::copy_from_slice(value)); } @@ -84,6 +93,11 @@ impl Transaction { if self.committed.load(Ordering::SeqCst) { panic!("cannot operate on committed txn!"); } + if let Some(key_hashes) = &self.key_hashes { + let mut key_hashes = key_hashes.lock(); + let (write_hashes, _) = &mut *key_hashes; + write_hashes.insert(farmhash::hash32(key)); + } self.local_storage .insert(Bytes::copy_from_slice(key), Bytes::new()); } @@ -92,14 +106,71 @@ impl Transaction { self.committed .compare_exchange(false, true, Ordering::SeqCst, Ordering::SeqCst) .expect("cannot operate on committed txn!"); - let batch = self.local_storage.iter().map(|entry| { - if entry.value().is_empty() { - WriteBatchRecord::Del(entry.key().clone()) - } else { - WriteBatchRecord::Put(entry.key().clone(), entry.value().clone()) 
+ // ensures only one transaction goes into the transaction verification and commit phase. + let _commit_lock = self.inner.mvcc().commit_lock.lock(); + if let Some(guard) = &self.key_hashes { + let guard = guard.lock(); + let (write_set, read_set) = &*guard; + println!( + "commit txn: write_set: {:?}, read_set: {:?}", + write_set, read_set + ); + if !write_set.is_empty() { + let committed_txns = self.inner.mvcc().committed_txns.lock(); + // go through all transactions with commit timestamp within range (read_ts, expected_commit_ts) (both excluded bounds) + for (_, txn_data) in committed_txns.range(self.read_ts + 1..) { + for key_hash in read_set { + // if the read set of the current transaction overlaps with the write set of any transaction. + if txn_data.key_hashes.contains(key_hash) { + println!( + "txn conflict detected: {:?} {:?}", + write_set, txn_data.key_hashes + ); + return Err(anyhow::anyhow!("txn conflict detected")); + } + } + } + } + } + + let batch = self + .local_storage + .iter() + .map(|entry| { + if entry.value().is_empty() { + WriteBatchRecord::Del(entry.key().clone()) + } else { + WriteBatchRecord::Put(entry.key().clone(), entry.value().clone()) + } + }) + .collect::>(); + let commit_ts = self.inner.write_batch_inner(&batch)?; + + // insert the write set into the committed_txns + if let Some(_) = &self.key_hashes { + let mut committed_txns = self.inner.mvcc().committed_txns.lock(); + let mut key_hashes = self.key_hashes.as_ref().unwrap().lock(); + let (write_set, _) = &mut *key_hashes; + committed_txns.insert( + commit_ts, + CommittedTxnData { + key_hashes: std::mem::take(write_set), + read_ts: self.read_ts, + commit_ts, + }, + ); + + // remove all transactions below the watermark + let watermark = self.inner.mvcc().watermark(); + while let Some(entry) = committed_txns.first_entry() { + if *entry.key() < watermark { + entry.remove(); + } else { + break; + } } - }).collect::>(); - self.inner.write_batch(&batch)?; + } + Ok(()) } } @@ -168,10 
+239,13 @@ impl TxnIterator { ) -> Result { let mut iter = Self { txn, iter }; iter.skip_deletes()?; + if iter.is_valid() { + iter.add_to_read_set(iter.key()); + } Ok(iter) } - // TwoMergeIterator will retain the deletion markers in the child iterators, + // TwoMergeIterator will retain the deletion markers in the child iterators, // we need to modify your TxnIterator implementation to correctly handle deletions. fn skip_deletes(&mut self) -> Result<()> { while self.iter.is_valid() && self.iter.value().is_empty() { @@ -179,6 +253,14 @@ impl TxnIterator { } Ok(()) } + + fn add_to_read_set(&self, key: &[u8]) { + if let Some(guard) = &self.txn.key_hashes { + let mut guard = guard.lock(); + let (_, read_set) = &mut *guard; + read_set.insert(farmhash::hash32(key)); + } + } } impl StorageIterator for TxnIterator { diff --git a/mini-lsm-starter/src/mvcc/watermark.rs b/mini-lsm-starter/src/mvcc/watermark.rs index 617df2369..05fc56f44 100644 --- a/mini-lsm-starter/src/mvcc/watermark.rs +++ b/mini-lsm-starter/src/mvcc/watermark.rs @@ -1,6 +1,3 @@ -#![allow(unused_variables)] // TODO(you): remove this lint after implementing this mod -#![allow(dead_code)] // TODO(you): remove this lint after implementing this mod - use std::collections::BTreeMap; // Watermark is the lowest read timestamp among all in-progress transactions. 
diff --git a/mini-lsm-starter/src/tests.rs b/mini-lsm-starter/src/tests.rs index a8b0f8600..761a0c026 100644 --- a/mini-lsm-starter/src/tests.rs +++ b/mini-lsm-starter/src/tests.rs @@ -20,3 +20,4 @@ mod week3_day2; mod week3_day3; mod week3_day4; mod week3_day5; +mod week3_day6; diff --git a/mini-lsm-starter/src/tests/week3_day6.rs b/mini-lsm-starter/src/tests/week3_day6.rs new file mode 100644 index 000000000..aa194a300 --- /dev/null +++ b/mini-lsm-starter/src/tests/week3_day6.rs @@ -0,0 +1,108 @@ +use std::ops::Bound; + +use bytes::Bytes; +use tempfile::tempdir; + +use crate::{ + compact::CompactionOptions, + iterators::StorageIterator, + lsm_storage::{LsmStorageOptions, MiniLsm}, +}; + +#[test] +fn test_serializable_1() { + let dir = tempdir().unwrap(); + let mut options = LsmStorageOptions::default_for_week2_test(CompactionOptions::NoCompaction); + options.serializable = true; + let storage = MiniLsm::open(&dir, options.clone()).unwrap(); + storage.put(b"key1", b"1").unwrap(); + storage.put(b"key2", b"2").unwrap(); + let txn1 = storage.new_txn().unwrap(); + let txn2 = storage.new_txn().unwrap(); + txn1.put(b"key1", &txn1.get(b"key2").unwrap().unwrap()); + txn2.put(b"key2", &txn2.get(b"key1").unwrap().unwrap()); + txn1.commit().unwrap(); + assert!(txn2.commit().is_err()); + drop(txn2); + assert_eq!(storage.get(b"key1").unwrap(), Some(Bytes::from("2"))); + assert_eq!(storage.get(b"key2").unwrap(), Some(Bytes::from("2"))); +} + +#[test] +fn test_serializable_2() { + let dir = tempdir().unwrap(); + let mut options = LsmStorageOptions::default_for_week2_test(CompactionOptions::NoCompaction); + options.serializable = true; + let storage = MiniLsm::open(&dir, options.clone()).unwrap(); + let txn1 = storage.new_txn().unwrap(); + let txn2 = storage.new_txn().unwrap(); + txn1.put(b"key1", b"1"); + txn2.put(b"key1", b"2"); + txn1.commit().unwrap(); + txn2.commit().unwrap(); + assert_eq!(storage.get(b"key1").unwrap(), Some(Bytes::from("2"))); +} + +#[test] +fn 
test_serializable_3_ts_range() { + let dir = tempdir().unwrap(); + let mut options = LsmStorageOptions::default_for_week2_test(CompactionOptions::NoCompaction); + options.serializable = true; + let storage = MiniLsm::open(&dir, options.clone()).unwrap(); + storage.put(b"key1", b"1").unwrap(); + storage.put(b"key2", b"2").unwrap(); + let txn1 = storage.new_txn().unwrap(); + txn1.put(b"key1", &txn1.get(b"key2").unwrap().unwrap()); + txn1.commit().unwrap(); + let txn2 = storage.new_txn().unwrap(); + txn2.put(b"key2", &txn2.get(b"key1").unwrap().unwrap()); + txn2.commit().unwrap(); + drop(txn2); + assert_eq!(storage.get(b"key1").unwrap(), Some(Bytes::from("2"))); + assert_eq!(storage.get(b"key2").unwrap(), Some(Bytes::from("2"))); +} + +#[test] +fn test_serializable_4_scan() { + let dir = tempdir().unwrap(); + let mut options = LsmStorageOptions::default_for_week2_test(CompactionOptions::NoCompaction); + options.serializable = true; + let storage = MiniLsm::open(&dir, options.clone()).unwrap(); + storage.put(b"key1", b"1").unwrap(); + storage.put(b"key2", b"2").unwrap(); + let txn1 = storage.new_txn().unwrap(); + let txn2 = storage.new_txn().unwrap(); + txn1.put(b"key1", &txn1.get(b"key2").unwrap().unwrap()); + txn1.commit().unwrap(); + let mut iter = txn2.scan(Bound::Unbounded, Bound::Unbounded).unwrap(); + while iter.is_valid() { + iter.next().unwrap(); + } + txn2.put(b"key2", b"1"); + assert!(txn2.commit().is_err()); + drop(txn2); + assert_eq!(storage.get(b"key1").unwrap(), Some(Bytes::from("2"))); + assert_eq!(storage.get(b"key2").unwrap(), Some(Bytes::from("2"))); +} + +#[test] +fn test_serializable_5_read_only() { + let dir = tempdir().unwrap(); + let mut options = LsmStorageOptions::default_for_week2_test(CompactionOptions::NoCompaction); + options.serializable = true; + let storage = MiniLsm::open(&dir, options.clone()).unwrap(); + storage.put(b"key1", b"1").unwrap(); + storage.put(b"key2", b"2").unwrap(); + let txn1 = storage.new_txn().unwrap(); + 
txn1.put(b"key1", &txn1.get(b"key2").unwrap().unwrap()); + txn1.commit().unwrap(); + let txn2 = storage.new_txn().unwrap(); + txn2.get(b"key1").unwrap().unwrap(); + let mut iter = txn2.scan(Bound::Unbounded, Bound::Unbounded).unwrap(); + while iter.is_valid() { + iter.next().unwrap(); + } + txn2.commit().unwrap(); + assert_eq!(storage.get(b"key1").unwrap(), Some(Bytes::from("2"))); + assert_eq!(storage.get(b"key2").unwrap(), Some(Bytes::from("2"))); +} From e3798eb17df0658848e89fd5565d17230f6e3a09 Mon Sep 17 00:00:00 2001 From: husharp Date: Tue, 27 Feb 2024 18:10:41 +0800 Subject: [PATCH 21/22] merge master Signed-off-by: husharp --- Cargo.lock | 4 + README.md | 53 ++++++---- mini-lsm-book/src/00-overview.md | 48 +++++---- mini-lsm-book/src/00-preface.md | 68 ++++++------- mini-lsm-book/src/SUMMARY.md | 6 +- mini-lsm-book/src/mini-lsm-logo.png | Bin 0 -> 66387 bytes mini-lsm-book/src/sitemap.txt | 7 ++ mini-lsm-book/src/sitemap.xml | 92 ++++++++++++------ mini-lsm-book/src/week1-01-memtable.md | 5 +- mini-lsm-book/src/week1-02-merge-iterator.md | 4 +- mini-lsm-book/src/week1-03-block.md | 2 +- mini-lsm-book/src/week1-05-read-path.md | 1 + .../src/week1-07-sst-optimizations.md | 6 +- mini-lsm-book/src/week2-01-compaction.md | 6 ++ mini-lsm-book/src/week2-02-simple.md | 6 ++ mini-lsm-book/src/week2-03-tiered.md | 9 +- mini-lsm-book/src/week2-04-leveled.md | 18 +++- mini-lsm-book/src/week2-05-manifest.md | 8 ++ mini-lsm-book/src/week2-06-wal.md | 7 ++ mini-lsm-book/src/week3-04-watermark.md | 2 + mini-lsm-book/src/week3-06-serializable.md | 12 ++- .../src/week3-07-compaction-filter.md | 44 ++++++++- mini-lsm-book/src/week3-overview.md | 4 +- mini-lsm-mvcc/Cargo.toml | 2 + mini-lsm-mvcc/src/compact.rs | 23 ++++- mini-lsm-mvcc/src/compact/leveled.rs | 1 + mini-lsm-mvcc/src/iterators/merge_iterator.rs | 2 +- .../src/iterators/two_merge_iterator.rs | 2 + mini-lsm-mvcc/src/lsm_iterator.rs | 4 +- mini-lsm-mvcc/src/lsm_storage.rs | 16 +++ mini-lsm-mvcc/src/mem_table.rs | 3 
+- mini-lsm-mvcc/src/tests.rs | 1 + mini-lsm-mvcc/src/tests/week3_day7.rs | 70 +++++++++++++ .../src/iterators/merge_iterator.rs | 2 +- mini-lsm-starter/src/lsm_iterator.rs | 6 +- mini-lsm-starter/src/lsm_storage.rs | 16 +++ mini-lsm/Cargo.toml | 2 + mini-lsm/src/iterators/merge_iterator.rs | 2 +- mini-lsm/src/lsm_iterator.rs | 4 +- mini-lsm/src/lsm_storage.rs | 17 ++++ mini-lsm/src/mem_table.rs | 3 +- mini-lsm/src/tests/harness.rs | 34 ++++--- mini-lsm/src/tests/week1_day2.rs | 6 +- mini-lsm/src/tests/week1_day6.rs | 2 +- 44 files changed, 462 insertions(+), 168 deletions(-) create mode 100644 mini-lsm-book/src/mini-lsm-logo.png create mode 100644 mini-lsm-mvcc/src/tests/week3_day7.rs diff --git a/Cargo.lock b/Cargo.lock index 1a761d952..3619b2711 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -489,9 +489,11 @@ dependencies = [ "crossbeam-skiplist", "farmhash", "moka", + "nom", "ouroboros", "parking_lot", "rand", + "rustyline", "serde", "serde_json", "tempfile", @@ -511,9 +513,11 @@ dependencies = [ "crossbeam-skiplist", "farmhash", "moka", + "nom", "ouroboros", "parking_lot", "rand", + "rustyline", "serde", "serde_json", "tempfile", diff --git a/README.md b/README.md index 754ee0fb6..d59582ca3 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,5 @@ +![banner](./mini-lsm-book/src/mini-lsm-logo.png) + # LSM in a Week [![CI (main)](https://github.com/skyzh/mini-lsm/actions/workflows/main.yml/badge.svg)](https://github.com/skyzh/mini-lsm/actions/workflows/main.yml) @@ -26,7 +28,7 @@ You should modify code in `mini-lsm-starter` directory. ``` cargo x install-tools -cargo copy-test --week 1 --day 1 +cargo x copy-test --week 1 --day 1 cargo x scheck cargo run --bin mini-lsm-cli cargo run --bin compaction-simulator @@ -45,7 +47,7 @@ cargo x book If you changed public API in the reference solution, you might also need to synchronize it to the starter crate. To do this, use `cargo x sync`. 
-## Structure +## Code Structure * mini-lsm: the final solution code for <= week 2 * mini-lsm-mvcc: the final solution code for week 3 MVCC @@ -70,28 +72,41 @@ cargo run --bin compaction-simulator-ref cargo run --bin compaction-simulator-mvcc-ref ``` -## Progress +## Tutorial Structure -We are working on chapter 3 and more test cases for all existing contents. +We have 3 weeks + 1 extra week (in progress) for this tutorial. * Week 1: Storage Format + Engine Skeleton * Week 2: Compaction and Persistence * Week 3: Multi-Version Concurrency Control -* The Extra Week / Rest of Your Life: Optimizations (unlikely to be available in 2024...) - -✅: Finished \ -🚧: WIP and will likely be available soon - -| Week + Chapter | Topic | Solution | Starter Code | Writeup | -| -------------- | ----------------------------------------------- | -------- | ------------ | ------- | -| 3.1 | Timestamp Key Encoding | ✅ | ✅ | ✅ | -| 3.2 | Snapshot Read - Blocks, Memtables, and SSTs | ✅ | ✅ | ✅ | -| 3.3 | Snapshot Read - Engine Read Path | ✅ | ✅ | ✅ | -| 3.4 | Watermark and Garbage Collection | ✅ | ✅ | ✅ | -| 3.5 | Transactions and Optimistic Concurrency Control | ✅ | ✅ | ✅ | -| 3.6 | Serializable Snapshot Isolation | ✅ | ✅ | ✅ | -| 3.7 | Compaction Filter | 🚧 | | | +* The Extra Week / Rest of Your Life: Optimizations (unlikely to be available in 2024...) 
+ +![Tutorial Roadmap](./mini-lsm-book/src/lsm-tutorial/00-full-overview.svg) + +| Week + Chapter | Topic | +| -------------- | ----------------------------------------------------------- | +| 1.1 | Memtable | +| 1.2 | Merge Iterator | +| 1.3 | Block | +| 1.4 | Sorted String Table (SST) | +| 1.5 | Read Path | +| 1.6 | Write Path | +| 1.7 | SST Optimizations: Prefix Key Encoding + Bloom Filters | +| 2.1 | Compaction Implementation | +| 2.2 | Simple Compaction Strategy (Traditional Leveled Compaction) | +| 2.3 | Tiered Compaction Strategy (RocksDB Universal Compaction) | +| 2.4 | Leveled Compaction Strategy (RocksDB Leveled Compaction) | +| 2.5 | Manifest | +| 2.6 | Write-Ahead Log (WAL) | +| 2.7 | Batch Write and Checksums | +| 3.1 | Timestamp Key Encoding | +| 3.2 | Snapshot Read - Memtables and Timestamps | +| 3.3 | Snapshot Read - Transaction API | +| 3.4 | Watermark and Garbage Collection | +| 3.5 | Transactions and Optimistic Concurrency Control | +| 3.6 | Serializable Snapshot Isolation | +| 3.7 | Compaction Filters | ## License -The Mini-LSM starter code and solution are under Apache 2.0 license. The author reserves the full copyright of the tutorial materials (markdown files and figures). +The Mini-LSM starter code and solution are under [Apache 2.0 license](LICENSE). The author reserves the full copyright of the tutorial materials (markdown files and figures). diff --git a/mini-lsm-book/src/00-overview.md b/mini-lsm-book/src/00-overview.md index d80ed7af8..8314938aa 100644 --- a/mini-lsm-book/src/00-overview.md +++ b/mini-lsm-book/src/00-overview.md @@ -1,11 +1,23 @@ # Mini-LSM Course Overview +## Tutorial Structure + +![Tutorial Overview](lsm-tutorial/00-full-overview.svg) + +We have three parts (weeks) for this tutorial. In the first week, we will focus on the storage structure and the storage format of an LSM storage engine. In the second week, we will deeply dive into compactions and implement persistence support for the storage engine. 
In the third week, we will implement multi-version concurrency control. + +* [The First Week: Mini-LSM](./week1-overview.md) +* [The Second Week: Compaction and Persistence](./week2-overview.md) +* [The Third Week: Multi-Version Concurrency Control](./week3-overview.md) + +Please look at [Environment Setup](./00-get-started.md) to set up the environment. + ## Overview of LSM -An LSM storage engine generally contains 3 parts: +An LSM storage engine generally contains three parts: 1. Write-ahead log to persist temporary data for recovery. -2. SSTs on the disk for maintaining a tree structure. +2. SSTs on the disk to maintain an LSM-tree structure. 3. Mem-tables in memory for batching small writes. The storage engine generally provides the following interfaces: @@ -19,24 +31,20 @@ To ensure persistence, * `Sync()`: ensure all the operations before `sync` are persisted to the disk. -Some engines choose to combine `Put` and `Delete` into a single operation called `WriteBatch`, which accepts a batch -of key value pairs. +Some engines choose to combine `Put` and `Delete` into a single operation called `WriteBatch`, which accepts a batch of key-value pairs. -In this tutorial, we assume the LSM tree is using leveled compaction algorithm, which is commonly used in real-world -systems. +In this tutorial, we assume the LSM tree is using a leveled compaction algorithm, which is commonly used in real-world systems. ### Write Path ![Write Path](lsm-tutorial/00-lsm-write-flow.svg) -The write path of LSM contains 4 steps: +The write path of LSM contains four steps: -1. Write the key-value pair to write-ahead log, so that it can be recovered after the storage engine crashes. -2. Write the key-value pair to memtable. After (1) and (2) completes, we can notify the user that the write operation - is completed. -3. When a memtable is full, we will freeze them into immutable memtables, and will flush them to the disk as SST files in the background. -4. 
We will compact some files in some level into lower levels to maintain a good shape for the LSM tree, so that read - amplification is low. +1. Write the key-value pair to the write-ahead log so that it can be recovered after the storage engine crashes. +2. Write the key-value pair to memtable. After (1) and (2) are completed, we can notify the user that the write operation is completed. +3. (In the background) When a mem-table is full, we will freeze them into immutable mem-tables and flush them to the disk as SST files in the background. +4. (In the background) The engine will compact some files in some levels into lower levels to maintain a good shape for the LSM tree so that the read amplification is low. ### Read Path @@ -44,21 +52,9 @@ The write path of LSM contains 4 steps: When we want to read a key, -1. We will first probe all the memtables from latest to oldest. +1. We will first probe all the mem-tables from the latest to the oldest. 2. If the key is not found, we will then search the entire LSM tree containing SSTs to find the data. There are two types of read: lookup and scan. Lookup finds one key in the LSM tree, while scan iterates all keys within a range in the storage engine. We will cover both of them throughout the tutorial. -## Tutorial Structure - -![Tutorial Overview](lsm-tutorial/00-full-overview.svg) - -We have 3 parts (weeks) for this tutorial. In the first week, we will focus on the storage structure and the storage format of an LSM storage engine. In the second week, we will dive into compactions in depth and implement persistence support for the storage engine. In the third week, we will implement multi-version concurrency control. - -* [The First Week: Mini-LSM](./week1-overview.md) -* [The Second Week: Compaction and Persistence](./week2-overview.md) -* [The Third Week: Multi-Version Concurrency Control](./week3-overview.md) - -To set up the environment, please take a look at [Environment Setup](./00-get-started.md). 
- {{#include copyright.md}} diff --git a/mini-lsm-book/src/00-preface.md b/mini-lsm-book/src/00-preface.md index 041caee41..6c8fb1c1d 100644 --- a/mini-lsm-book/src/00-preface.md +++ b/mini-lsm-book/src/00-preface.md @@ -1,82 +1,74 @@ # Preface -![Tutorial Overview](lsm-tutorial/00-full-overview.svg) +![Banner](./mini-lsm-logo.png) -In this tutorial, you will learn how to build a simple LSM-Tree storage engine in the Rust programming language. +This course teaches you how to build a simple LSM-Tree storage engine in Rust. ## What is LSM, and Why LSM? -Log-structured merge tree is a data structure to maintain key-value pairs. This data structure is widely used in +Log-structured merge trees are data structures that maintain key-value pairs. This data structure is widely used in distributed database systems like [TiDB](https://www.pingcap.com) and [CockroachDB](https://www.cockroachlabs.com) as their underlying storage engine. [RocksDB](http://rocksdb.org), based on [LevelDB](https://github.com/google/leveldb), -is an implementation of LSM-Tree storage engine. It provides a wide range of key-value access functionalities and is -used in a lot of production systems. +is an implementation of LSM-Tree storage engines. It provides many key-value access functionalities and is +used in many production systems. Generally speaking, LSM Tree is an append-friendly data structure. It is more intuitive to compare LSM to other -key-value data structure like RB-Tree and B-Tree. For RB-Tree and B-Tree, all data operations are in-place. That is to -say, when you update the value corresponding to the key, the value will be overwritten at its original memory or disk -space. But in an LSM Tree, all write operations, i.e., insertions, updates, deletions, are performed in somewhere else. -These operations will be batched into SST (sorted string table) files and be written to the disk. Once written to the -disk, the file will not be changed. 
These operations are applied lazily on disk with a special task called compaction. -The compaction job will merge multiple SST files and remove unused data. +key-value data structures like RB-Tree and B-Tree. For RB-Tree and B-Tree, all data operations are in place. That is to +say, when you want to update the value corresponding to the key, the engine will overwrite its original memory or disk +space with the new value. But in an LSM Tree, all write operations, i.e., insertions, updates, deletions, are lazily applied to the storage. +The engine batches these operations into SST (sorted string table) files and writes them to the disk. Once written to the +disk, the engine will not directly modify them. In a particular background task called compaction, the engine will merge these files to apply the updates and deletions. -This architectural design makes LSM tree easy to work with. +This architectural design makes LSM trees easy to work with. -1. Data are immutable on persistent storage, which means that it is easier to offload the background tasks (compaction) - to remote servers. It is also feasible to directly store and serve data from cloud-native storage systems like S3. -2. An LSM tree can balance between read, write and space amplification by changing the compaction algorithm. The data - structure itself is super versatile and can be optimized for different workloads. +1. Data are immutable on persistent storage. Concurrency control is more straightforward. Offloading the background tasks (compaction) to remote servers is possible. Storing and serving data directly from cloud-native storage systems like S3 is also feasible. +2. Changing the compaction algorithm allows the storage engine to balance between read, write, and space amplification. The data structure is versatile, and by adjusting the compaction parameters, we can optimize the LSM structure for different workloads. 
-In this tutorial, we will learn how to build an LSM-Tree-based storage engine in the Rust programming language. +This course will teach you how to build an LSM-tree-based storage engine in the Rust programming language. ## Prerequisites -* You should know the basics of the Rust programming language. Reading [the Rust book](https://doc.rust-lang.org/book/) - is enough. -* You should know the basic concepts of key-value storage engines, i.e., why we need somehow complex design to achieve - persistence. If you have no experience with database systems and storage systems before, you can implement Bitcask - in [PingCAP Talent Plan](https://github.com/pingcap/talent-plan/tree/master/courses/rust/projects/project-2). -* Knowing the basics of an LSM tree is not a requirement but we recommend you to read something about it, e.g., the - overall idea of LevelDB. This would familiarize you with concepts like mutable and immutable mem-tables, SST, - compaction, WAL, etc. +* You should know the basics of the Rust programming language. Reading [the Rust book](https://doc.rust-lang.org/book/) is enough. +* You should know the basic concepts of key-value storage engines, i.e., why we need a complex design to achieve persistence. If you have no experience with database systems and storage systems before, you can implement Bitcask in [PingCAP Talent Plan](https://github.com/pingcap/talent-plan/tree/master/courses/rust/projects/project-2). +* Knowing the basics of an LSM tree is not a requirement, but we recommend you read something about it, e.g., the overall idea of LevelDB. Knowing them beforehand would familiarize you with concepts like mutable and immutable mem-tables, SST, compaction, WAL, etc. -## What should you expect from this tutorial... 
+## What should you expect from this tutorial -After learning this course, you should have a deep understanding of how a LSM-based storage system works, gain hands-on experience of designing such systems, and apply what you have learned in your study and career. You will understand the design tradeoffs in such storage systems and find optimal ways to design a LSM-based storage system to meet your workload requirements/goals. This is a very in-depth tutorial that covers all the important implementation details and design choices of modern storage systems (i.e., RocksDB) based on the author's experience in several LSM-like storage systems, and you will be able to directly apply what you have learned in both industry and academia. +After taking this course, you should deeply understand how an LSM-based storage system works, gain hands-on experience in designing such systems, and apply what you have learned in your study and career. You will understand the design tradeoffs in such storage systems and find optimal ways to design an LSM-based storage system to meet your workload requirements/goals. This very in-depth tutorial covers all the essential implementation details and design choices of modern storage systems (i.e., RocksDB) based on the author's experience in several LSM-like storage systems, and you will be able to directly apply what you have learned in both industry and academia. ### Structure -The tutorial is a large course that is split into several parts (weeks). Each week usually has seven chapters, and each of the chapter can be finished within 2-3 hours. The first six chapters of each part will instruct you to build a working system, and the last chapter of each week will be a *snack time* chapter that implements some easy things over what you have built in the previous six days. In each chapter, there will be required tasks, *check you understanding* questions, and bonus tasks. +The tutorial is an extensive course with several parts (weeks). 
Each week has seven chapters; you can finish each within 2 to 3 hours. The first six chapters of each part will instruct you to build a working system, and the last chapter of each week will be a *snack time* chapter that implements some easy things over what you have built in the previous six days. Each chapter will have required tasks, *check your understanding* questions, and bonus tasks. ### Testing -We provide full test suite and some cli tools for you to validate if your solution is correct. Note that the test suite is not exhaustive, and your solution might not be 100% correct after passing all test cases. You might need to fix earlier bugs when implementing later parts of the system. We recommend you to think thoroughly about your implementation, especially when there are multi-thread operations and race conditions. +We provide a full test suite and some CLI tools for you to validate if your solution is correct. Note that the test suite is not exhaustive, and your solution might not be 100% correct after passing all test cases. You might need to fix earlier bugs when implementing later parts of the system. We recommend you think thoroughly about your implementation, especially when there are multi-thread operations and race conditions. ### Solution We have a solution that implements all the functionalities as required in the tutorial in the mini-lsm main repo. At the same time, we also have a mini-lsm solution checkpoint repo where each commit corresponds to a chapter in the tutorial. -Keeping such checkpoint repo up-to-date to the mini-lsm tutorial is hard because each bug fix or new feature will need to go through all commits (or checkpoints). Therefore, this repo might not be using the latest starter code or incorporating the latest features from the mini-lsm tutorial. +Keeping such a checkpoint repo up-to-date with the mini-lsm tutorial is challenging because each bug fix or new feature must go through all commits (or checkpoints). 
Therefore, this repo might not use the latest starter code or incorporate the latest features from the mini-lsm tutorial. -**TL;DR: We do not guarantee the solution checkpoint repo contains a correct solution, passes all tests, or has the correct doc comments.** For a correct implementation and the solution after implementing all things, please take a look at the solution in the main repo instead. [https://github.com/skyzh/mini-lsm/tree/main/mini-lsm](https://github.com/skyzh/mini-lsm/tree/main/mini-lsm). +**TL;DR: We do not guarantee the solution checkpoint repo contains a correct solution, passes all tests, or has the correct doc comments.** For a correct implementation and the solution after implementing everything, please look at the solution in the main repo instead. [https://github.com/skyzh/mini-lsm/tree/main/mini-lsm](https://github.com/skyzh/mini-lsm/tree/main/mini-lsm). -If you are stuck at some part of the tutorial or do not know where to implement a functionality, you can refer to this repo for help. You may compare the diff between commits to know what has been changed. Some functions in the mini-lsm tutorial might be changed multiple times throughout the chapters, and you can know what exactly are expected to be implemented for each chapter in this repo. +If you are stuck at some part of the tutorial or need help determining where to implement functionality, you can refer to this repo for help. You may compare the diff between commits to know what has been changed. You might need to modify some functions in the mini-lsm tutorial multiple times throughout the chapters, and you can understand what exactly is expected to be implemented for each chapter in this repo. You may access the solution checkpoint repo at [https://github.com/skyzh/mini-lsm-solution-checkpoint](https://github.com/skyzh/mini-lsm-solution-checkpoint). ### Feedbacks -Your feedback is greatly appreciated. 
We have rewritten the whole course from scratch in 2024 based on the feedbacks from the students. We hope you can share your learning experience and help us continuously improve the tutorial. Welcome to the [Discord community](https://skyzh.dev/join/discord) and share your experience. +Your feedback is greatly appreciated. We have rewritten the whole course from scratch in 2024 based on the feedback from the students. Please share your learning experience and help us continuously improve the tutorial. Welcome to the [Discord community](https://skyzh.dev/join/discord) and share your experience. -The long story of why we rewrote it: The tutorial was originally planned as a general guidance that students start from an empty directory and implement whatever they want based on the specification we had. We had minimal tests that checks if the behavior is correct. However, the original tutorial is too open-ended that caused huge obstacles with the learning experience. As students do not have an overview of the whole system beforehand and the instructions are kind of vague, sometimes it is hard for the students to know why a design decision is made and what they need to achieve a goal. And some part of the course is too compact that it is impossible to deliver expected contents within just one chapter. Therefore, we completely redesigned the course to have a easier learning curve and clearer learning goals. The original one-week tutorial is now split into two weeks (first week on storage format, and second week on deep-dive compaction), with an extra part on MVCC. We hope you find this course interesting and helpful in your study and career. We would like to thank everyone who commented in [Feedback after coding day 1](https://github.com/skyzh/mini-lsm/issues/11) and [Hello, when is the next update plan for the tutorial?](https://github.com/skyzh/mini-lsm/issues/7) -- your feedback greatly helped us improve the course. 
+The long story of why we rewrote it: The tutorial was originally planned as a general guidance that students start from an empty directory and implement whatever they want based on the specifications we had. We had minimal tests that checked if the behavior was correct. However, the original tutorial was too open-ended, which caused huge obstacles to the learning experience. As students do not have an overview of the whole system beforehand and the instructions are vague, sometimes it is hard for them to know why a design decision is made and what they need to achieve a goal. Some parts of the course were so compact that delivering the expected contents within just one chapter was impossible. Therefore, we completely redesigned the course for an easier learning curve and clearer learning goals. The original one-week tutorial is now split into two weeks (the first week on storage format and the second week on deep-dive compaction), with an extra part on MVCC. We hope you find this course interesting and helpful in your study and career. We want to thank everyone who commented in [Feedback after coding day 1](https://github.com/skyzh/mini-lsm/issues/11) and [Hello, when is the next update plan for the tutorial?](https://github.com/skyzh/mini-lsm/issues/7) -- Your feedback greatly helped us improve the course. ### License -The source code of this course is licensed under Apache 2.0, while the author owns the full copyright of the tutorial itself (markdown files + figures). +The source code of this course is licensed under Apache 2.0, while the author owns the complete copyright of the tutorial itself (markdown files + figures). ### Will this tutorial be free forever? -Yes! Everything publicly available now will be free forever and will receive lifetime updates and bug fixes. Meanwhile, we might provide paid code review and office hour services in the future. 
For the DLC part (*rest of your life* chapters), we do not have plans to finish them as of 2024, and have not decided whether they will be public available or not. +Yes! Everything publicly available now will be free forever and receive lifetime updates and bug fixes. Meanwhile, we might provide paid code review and office hour services. For the DLC part (*rest of your life* chapters), we do not have plans to finish them as of 2024 and have yet to decide whether they will be publicly available. ## Community @@ -86,11 +78,11 @@ You may join skyzh's Discord server and study with the mini-lsm community. ## Get Started -Now, you may go ahead and get an overview of the LSM structure in [Mini-LSM Course Overview](./00-overview.md). +Now, you can get an overview of the LSM structure in [Mini-LSM Course Overview](./00-overview.md). ## About the Author -As of writing (at the beginning of 2024), Chi obtained his master's degree in Computer Science from Carnegie Mellon University and his bachelor's degree from Shanghai Jiao Tong University. He has been working on a variety of database systems including [TiKV][db1], [AgateDB][db2], [TerarkDB][db3], [RisingWave][db4], and [Neon][db5]. Since 2022, he worked as a teaching assistant for [CMU's Database Systems course](https://15445.courses.cs.cmu) for three semesters on the BusTub educational system, where he added a lot of new features and more challenges to the course (check out the re-designed [query execution](https://15445.courses.cs.cmu.edu/fall2022/project3/) project and the super challenging [multi-version concurrency control](https://15445.courses.cs.cmu.edu/fall2023/project4/) project). Besides working on the BusTub educational system, he is also a maintainer of the [RisingLight](https://github.com/risinglightdb/risinglight) educational database system. Chi is interested in exploring how the Rust programming language can fit in the database world. 
Check out his previous tutorial on building a vectorized expression framework [type-exercise-in-rust](https://github.com/skyzh/type-exercise-in-rust) and on building a vector database [write-you-a-vector-db](https://github.com/skyzh/write-you-a-vector-db) if you are also interested in that topic.
+As of writing (at the beginning of 2024), Chi obtained his master's degree in Computer Science from Carnegie Mellon University and his bachelor's degree from Shanghai Jiao Tong University. He has been working on a variety of database systems, including [TiKV][db1], [AgateDB][db2], [TerarkDB][db3], [RisingWave][db4], and [Neon][db5]. Since 2022, he has worked as a teaching assistant for [CMU's Database Systems course](https://15445.courses.cs.cmu.edu) for three semesters on the BusTub educational system, where he added a lot of new features and more challenges to the course (check out the redesigned [query execution](https://15445.courses.cs.cmu.edu/fall2022/project3/) project and the super challenging [multi-version concurrency control](https://15445.courses.cs.cmu.edu/fall2023/project4/) project). Besides working on the BusTub educational system, he also maintains the [RisingLight](https://github.com/risinglightdb/risinglight) educational database system. Chi is interested in exploring how the Rust programming language can fit into the database world. Check out his previous tutorial on building a vectorized expression framework [type-exercise-in-rust](https://github.com/skyzh/type-exercise-in-rust) and on building a vector database [write-you-a-vector-db](https://github.com/skyzh/write-you-a-vector-db) if you are also interested in that topic.
[db1]: https://github.com/tikv/tikv [db2]: https://github.com/tikv/agatedb diff --git a/mini-lsm-book/src/SUMMARY.md b/mini-lsm-book/src/SUMMARY.md index 38d8eeab5..ed66d7d8e 100644 --- a/mini-lsm-book/src/SUMMARY.md +++ b/mini-lsm-book/src/SUMMARY.md @@ -27,9 +27,9 @@ - [Snapshots - Memtables and Timestamps](./week3-02-snapshot-read-part-1.md) - [Snapshots - Transaction API](./week3-03-snapshot-read-part-2.md) - [Watermark and GC](./week3-04-watermark.md) - - [Transaction and OCC (WIP)](./week3-05-txn-occ.md) - - [Serializable Snapshot Isolation (WIP)](./week3-06-serializable.md) - - [Snack Time: Compaction Filter (WIP)](./week3-07-compaction-filter.md) + - [Transaction and OCC](./week3-05-txn-occ.md) + - [Serializable Snapshot Isolation](./week3-06-serializable.md) + - [Snack Time: Compaction Filters](./week3-07-compaction-filter.md) - [The Rest of Your Life (TBD)](./week4-overview.md) --- diff --git a/mini-lsm-book/src/mini-lsm-logo.png b/mini-lsm-book/src/mini-lsm-logo.png new file mode 100644 index 0000000000000000000000000000000000000000..ab6459804806f2a843e6d27f23e7cfa7a055b2ed GIT binary patch literal 66387 zcmeEug6Dh1Zjf_GN$EyG>FydpQbIzyr36Xo z8sNLP=lGuId!Ot33qHLrVI24DS+n-q_x-Ci{!eAZaItS-e{FyTsgwC8f z_vjJ^JY&>q`v?3wYa=fvd?v5!)_3?vsDX-vp|tcF26%kw%!RWT&YVLIIRnR^#sBB= zS$KT&%=w?spE;v+mh^ufYn{FG@A1CIXE6Uc#t?oZf5hQ0y!xNt=iZ+E_l$2bejR;5 z=Tsz^)m=vkUG>*!nR z8ZbMWzeHYjhR=})9-152>QFkGn_1ZKIP%~Ac?1tUMxJK5P5JW>TT}kqD$-9WMJ%li zC^?z$GvB{0fK5qB$!D!^$Rq#g@jowzfAQabVQc%6hlRz#!GYOEeH(eB|M~Ox zIBks$|9vM5n}0qQd_WfDe^^+V@3Z{Rx#6XJ$g@211~!&vcF5@!EsSji*!X@P{J)?6 z_uc+}m8`X~0gM-UnE>lQFZs{s{`c`BmgX<54Qyu?@hw`@!i`gji2s|E6_0haiRIFl;F7wB-bNjJI?;` zCq~%$w&=5O-uvOaPFY0FCH%pLPeT?q#hb+IQpUTz_B8uwHERKV7L}c?l)IBtUZsrE zEHz21L(j9ECVOCBx<)DV>daY;e;*WKcP5nTE{Xo%&mzZt6+BN<&~)}c5B~XbA(OLM zIsFr<*8iU5*J<%pbpQK2l%=mQm6q1zxc}o_|9Jrx6YKvP4*W-`nh*w~-CoA6|9a!o57xE557*BBi>jPhUPZU}&Il<2$_FqG#^rXG;>!bYp z;p8uiKmV{s9!dRQLk+FRKlkr(zh0RM(`;T1lTXX;n^;>lK)hvFC4!=c* z{}Azhiw?g=-{b}_lsX6 
zhRxeN43C2j?IhRDwuK~@wPs>%*Y6*qXX@CsI@vmywvKnE&|BStv&}VIUA$YJ?qBmw zDPItu#Smek^yJvq%s4xNg+A!^s6}sfRFB!~XU|8eTjZ7w-aP0FFvC#Rv>SLP%de=D z%Q^ZYm1I;&<#~93^Wpb&S^k}Vne!*p6el_Zips0DjLQTFw-LeAa#<}^w?yrno=ly( zw30dS?Q5TsJ1@SA)^@Ly_Vgy!a{Lgfq;`;hYFOj9F_}|5xcj~>wBmC;$93vj#z|cNqgQ6c zZUh;tAy{q%C8<<|esVJF9Cb%))!qDiq_}ObL0gwCB8Ku=#)ww9`$C0l>-R%?^4H9wy=uSeCkSG0q}H9NN~i-t1$ z9?cr+&d}Ud_aA?04(*1DT)C@Um_WN;(0E1DCUWe)PqTLg4EB1MnM@(K9^Gcw=kI@> zlx)Lwe|Rt#KDCv!*ObpRFEBpJXwFhXEHWcpaN{*u3zgLcF1t@m z&%+ux>)z!kad}pJstKbzKG~}~g=MNGA%~s$=O)_tOGfSX|p~=6vwRA|Ux&cFsc;Ja+ zAai=d(1#E|?CjCb6#Nz`+eiF)JrekGLz)f~lP>G1xv(cKgoDc3ZgvUUxHY<5Q?6UY zb^Fcxd4b9LU4p0Ril?Su6_-*l6y54b@Ett0ZNsU^@QFF(9C0ye3QT(Z<9Z5(q*mP4 zQG50^BQ&H^@to#i4oqw1LQ;>aMhxTmf7G9&wMPq{E}SySnMK?<>vVd&N6b*OtlKe9 zmm<0GAfuutd%3Fl7RqYb;`$dmtj1CuKJMi%-t|tUg_saC&+|lgU|gT=zIV>J~UMTQ^Wr7j~s>ra5DU`xQ!q&33O?Yw|#p zcF6;gnUtDyOs?X-P3Nu`FYDli*58aulJ90XpI3Ksh#heBUDiX*+>>xHS5CA!)N0JU zJV^V+<!Z}*xGilyzY3_nm;Lhu3fN*!8ndO1^=JC0v zPwwPZEHUJvUNaW>WCuBnTeS|VTasO63_QBEHB_^?4$BiK5g7xH^HQh=jKS$*rWT%O zyI7S`vYUN_dj0ERgL~Vr&Ru!<{e5`*0JBA;nvORicBJ;fkHRo#BL1DMC6YjQ%LW=- zMJq*o5^C>=YPv#Cu_nJP;t8y|z*y^6aR*6}LPd?HlI6VWmRqYEJ$3Ige&|OA-qPU9 zxxdC(bG$o7uTI~%*zKKaI znK`6o%wxziiC1K9jx9jJZDCTK=J|m(Fm}-!MtHCKp`lO+{b6p!%xgiY73@+yT3FVd zD^TRJ*DDv2pmM3uo>$TmpKlGOvYL16m-B2Cpjpc(RI*5e;1c^0sy8=S0(W+4tsP6| z;k1|F@psus;_Ujn=Vt_pD&(Ep2b?wD!J5L@0yp`+^6{`-xsCo@XaFVuR!Jy4RrQ1_tF1b&k*!xnwGY2Ur*Mr; zncNu+%um+~=B86~fF59&{qhAVzR_}o))M=Q$c)yg#n`t?;?FtK^Gml+vBn)|gJ{!4 zg1K+bOh1e`V!B4K*V44=Cax9no{EP${Xx;B`;>hg!`F`yDIB7L%}wfS@y$4abG(ui%J^bz|(uZGQMLh2RaGxI(n zD1{0Q=i4TVDQ#8~%AH3pQ*L`ZUzQTaqK-Z-eL&62C(t&eMn`oGmo4J;y6oK467Gt* z>k!zVTAl2E_`PCjNqy>YaQl1Ta&HQ!x>ft#^qO_Ib(h6=zM~p-izf56a1R97kH<8a zveS_FF?-<0JcX$eCc3=L^wjn}>9fk{3c_GRy1i1zkML zIb@cDlNVr+tE(P=mVN`!q^P87@Fy38n!+MUVBb6F#>~2@DJF(@)2=Cpd}eg8o~)-% zX{{}AYKZZdd9!VX7t6Z-1*sWyd?NP3UZ(k|U9xq@Ru;fvSW~MD4JlJBK268p+j#&p zP#dx|G6U3F;m{Y-X|AWd4BOBwRkvO`RoY=uTPVkUj{O| zdIRE}YoYa7VuB{3)fjJ3uL&CLht^C`>#hsDn&w8S_UCI42lA8Pnylt&HapL|i`RWm 
zi^uQk*33iolD-LcI^G$|UX1eZ*q_|1br@7Nl15KnV-8Xcn!QD>-aG^gLRit{E1sNq z5qI;qz}WM!+$!gzOl4R~roQa^P3nhB&jhFMa%C>ZvEG`1c6*iYNBtihJ)w?IHwo9u zWkhDMRj(^rLQ2wkFMlwIaIrDy0&-FhbgM_;L}48x{=>t?)T=y;Z(n1Zmd98NzTcyw z95bKG*l922vX|fX@xBqX-$XAU7oV=W<7ukUU4B%g;q~B$-wWai=kQBVcmWEygdK7^ zSfSO5;i22ovt))t(w3fswel%<)~N%btO;Me*3gO<-GgT08vT;fcK9$|&n!(k`Z12V zNfS8DNBb2EA1`2R#8k?99QGwz4)qZgf96ylmmhkD7j8M8{3^lql%VbzGe6$ZvPk&s z$A~eDYV+~zhgMrYCdL#MRX^?!Q%Rdz67O@~pblj>e67{W^Z5CllLS@+mjuhJqUJ{$ zZQZWl*wd4oO_CjMRxfuK>WOP)7%g+rFk^lgKEx7#K9Db4nh`1FQJc$BITu!u`A(6) zW~j1$-F>f`%zpUYY2A2U&1Snz7q`*Ej=5>T38ucf;z53-VY1t9Ovy>zN!H2fvGHaQ z*Ca`~p@Q~NZS*ijo=p5e2By+?>Xc`$%OQ9#Mp@a?4n#H<5EN@Sq*-40tW0)9j6IFD zYWvHRbdBOs!b2M<8Omd(?2_5@p%s`BY^HZ9Jrz4F1~UB5&us3TNcrBb!fTsCG-{bkNfg_?&vQoh$!H2E4Y-~#;ju=~utq-Hd_@Sf9@NG2h zE=Rrm?D+mWynS@8uBCHdJht>&Up@70^@S^%Zq_YQ_p2~WX`)WH(*=)b1yA%k7x;^s z0yZ!palv-cj4D!>lxUH5W~}Y4S8AJUJl3nq{?MGOFSkLzHgmK$r-i z@+$v!&+yItYDQ0(b%t-QZs?8H>d~s9pmy4V868)wg$3rJ$why4i%ME+6_mZi*jTGR z<8hm6xn^pYQVkG-S^H02Wq0fBJHHA5+>acVJs0AK9~i{IZq(TcpWN!ZMJkVOQH#JS zdMnO5H6%`9N!Rms*Ugczm zak`f~Ih2akZE@b^en;_fnXO!r=RV-#iN6RHqDU%dQ`>@`y!Upomr}DPSUtmj)Hq%H z^l(ViOudbcy_?Ub(pO@~W`DT$aiy1L|GQJ^;QobH!IO>M_+*pM0v`IMBfDv|1+U5R z%;VI}**s4?-b*Qk_jYuUQK#;Oo!YtXYPlcGA1uda`LK2$BOK3{VdvneIkaL=Aa^gc z!hwSJftV0RHn&^uhhqUJ_qED}VD4`uYp6@ECtJyaP+^_#zH2oH=6P`NEk3_hm~O15 zyRo$`i`Chw*GqHd1*K6upts#OV&vRav5lL~lp4=wBAzTld5Do8!_GYqbU>kEob`(u z^*}Z2$&F9pJMvVv%CQv(sHsF&q;kiGZa!Nh% z&^$@N&DrhR(O6i;#jPKWSJ&4CTGw&`;dC8TDr;CZ!%{QK5f=4mr70cK_HYyRwo~dv z*oC9MtkD>aO7$s)>N8H~NvDN-4=Rx*bhk9OM~%{`^HHLv=g!g8X{}{>L+O<;9oN0u zqqU>K@><1`Z$3?;PEW3Dty5WHu^J4j!2Gl%W*d7@zXS@>vM%URoPnmeDu^!7rmWTz zSV*v`{o&1GTW;>H9?5&v|Df`SBJ^2%#FOFXJiYc|Pd+g=doqHTX5|z1tfFfX7*0x# zKR#Ye2C0E>Csr;T$R?_1P5?Sjk=x}-?gvk>q>Y{?B;d<(Ug*vWPZnWy&g zk}{zBCl|`Cv2whKhaL?@m|9*J7^Z8TZF88g8^ZZxpo{9-GR_ipw%ty<=%?q6dPPdn zrtA7HB{goI39Q8c&bnAzZ>^T6xSO{0D{nZd*V*qBi{=Ah9je_ewRXAHzZXDdMeH`} z1wWZMjf()LhP%|Mnp5Ov7bsWg1)tO7-gN z{F)Sw;U1ge$+v8K{>VMQS;PA>59{tNJc1&Xhu1vQ$f(c#v1XfodNSmZXkl=?fx&89 
z0|HA6OTK>F+h8)08JX&QD)_0IGxZXhoXBzzB)k?g#xEDqr?hl~4f287>gyk-}1 z477CYrPf8SF?|<@MH9jMn5W^!r*JyYuW3SiJzb#P=xlxF3dy|Td?&SCIv~jx=jHyx*xsTJ=A$$& zlyQ-Riw3#g299*F|D40HjxWje-QffQGlBY$oHjLLuE&w_3+7`5#1&@3^hi8x>FLDaPuRFs|r7_I6% zc#g14k{oCJ&z2dDmKdKKKYF^t!*&^F!Huu#j;whHZGrXQ5G8$Y+@`y@TOj_O4F!lU zA@19$8#rk=k+C^fLIe(9G;Gz!j2rEa7$xU)p_nYc;`LsRHWz`(>g3z%1XyR!&U>e} zw(j(JovV1&TkUw%H2?S;S6yb!OOcs&2}YIzIrzPEUBv}1-l#+2g_HF9GajaKy=Peu zWO^^Q_rHzb0}&yBx$2IhNc(Z9z>il6J(U_RdJdU+k^3NLob2R#xCwjFT+RocO#`en z$QtiUJ+cN!-hHMF)&|us*Ql7PlIzLLhEpt6@9ZuxnN(mnFhOAyQp~I zX#CwlpHgOSy|i(~*KJ4Nfn|$pHBx0wrUWqGjK*qxkL|sbFb!QjDaVHNAx%Vfgu0MAJg}MmUN$V_sJ9*tXM2Fasr9HbtnUq63{UX=OqyitC5pqIX9b6~M|A z-On``%U}PxN)#1&sRkBwh}l=`j(f@y#soyV;(Sz3e7ry3!Z!+4_I*bH>Q5E(v+3aT zn^L~$PWEn^gzuSpJ+C?aXu8tkE#l_dY-h(5m_6;F1P4t`)JH>AeqjEU-J;|x|4KpQ z$??=Fsu}y(gRxs|aIq{pAQa@OR2|fsFx*#6#hq-xOkWazzOf`dgV)VUzxR}jN@Cst zD9{rYlr0ooCbeWfgg>S7RuuyGyrf0&;u&-?={woDg+e~iB=&95)@82Sv+h%yBC~MZ;H2k86^&o;%# zga3LX&>KfttQMLXP^=^I+^`zQv9U>|7lu)=P$Uj*9kbuC8#WYsDnB=f7kXKd;vA`{ zUiDg)TXq?h)l|mVxLl)elR1NGP(2A;6;{fVoOmI=e(LFYnZ7jXY*Jg7Rc?@r z>UMH1OUu_NhZr2l@wd|Ha6VaMRWCN1)$NsC%8_;73@;zIeiA@<_(RE6A9ZrL>^3YX zR{ipkZIi)#3I?~J=&XF)hbXnI7mWl&Nod_AFI{TI(U*<5v#b>xWA&fH1ZwWntevwy zs=iNSG1VT2`51P$1r7eF^1&hNl79$z>DvWBcn~>$^reT_HKvcwy{(?WN^{atfe62gardo|{|giE%oF-t$?t ze6DoVZEr&Fk-}opAdz|Z0W6R4ln%C<`OB;#EE$b2JXOU`r#U**Y5XA)8!(#?5n1A#*{1$6sq*vVEy2^R%=9Fl5$q4q z3v&z()=9-E(Gb5cR0>C%HZ4+>dNk&~h@^tWLYx?C3Ux-y2udQj3{ZBczPSc=%JG%X zB0LyeqaCO=Cu{kqGsffu7V4`FH?H17w=37mn-`i#MQsiHx0B+RkNkD4ox2!Xar(Ki z)fv>cfmDi8U3t$0!GXtiEEQk>o&~zeSy`Axy917$I;QTU5s8 zwKNI4r}i3|N)mbOt4~zEb2;jzrLQGKxF&4-5Z_}|w9%bN~<_!Pb#2$|(Ss)bUYC&krSD43IiaChHjs!aT}aFAf#9l9Se5 zcSp}sSSF%guLWFARRRN##qIkO)+Y-3;s_Su{uYy(_{z*95w6Cq=ezBp6~|o+_%BYEdpbg000^y2MdEkXbxXaZ*D?sTPd1}Hw&JHi!C_&95)xf6IXwAA>=Dbf z&e%0^5_WK_wBBE{c|wcxDc^0?l+ZBW@WD&?;0Uh15u%|1ZX>UWwSM0f5cQA`Atx+G zN~FfUP{>|Y2%vz2t%jM0rmIKDjzk`=JJssLfh0A`v@nT>WSjfW6z==e2)o;#7eUQ| 
z?HyuV;9H0_-z5SQsbQhStg|;0z(SW$Qkz;3sAqGuTJEvgM$bE%RUN6C`(dw-Kv6HMoNp~gWJ3hS|?`P7z zDUVZV_OWR^gbH(Urg;aS#02mO^c9{eO$FR>Ah!m()q1ID9XC$bS5%DtO>i6fy7%1| zC(El9&kVsKTWeI! z>M&MTsVI{NysKol?8XHpQv@x82sA6JC_s2T~oR!DzA*$?a-hrfg2zh1g zU_`Soh$*Gwt00DT>W5g-d9*mVCi-jhW)4lnZa3Rvg7=RgNPv|rR}7UKxtAdNaT>G3 zj*i_rGGMzQGSjAiSXl#OOvWwD|D_!-3Z9=a+p1d5BfT9E!#oW@gDeF*B-5nRkuZJF zD*H5MzH!H>aw{}p%ceDuoYm#YJAv1I@i7HyPp8k_<+O+$v1%o|{CSDc#*CKlqsq&V z=f`4yd$vzu>)m~y;j#Tzf7CGO4 zJ#*s~MVAqQNyNbvptvGN?lc!z**(*+3U1wAfHqn;nX%_&4aa}1JWF3%WJZ#NQ203; z+{|tIOcWNLUPEkBScpV70r25jUertPil~!})L==KLYZs@=6B&+UD1F#L%T=jWJGk) zhZo6gpN`Y8xjThCv3A$zzU{oC6t8?Ibc>o6gl~}Jl!iNX?lO$sQUMu|yfU|g`-}Mmg;mhr zeF2yGoZ_xgkT})2%+_LHHO%|ir6;Iq6jxCb&X?lhqss0Yu zyhdr{_%*n4KDM^Saj`Z$bM}W{!JZ9NB^*^{Z{z{4zU(Wo{ceARUBtjSU!*dC1=@yG zm*$#|gmJq18o!KT#EY~6fYO{;Q>{z1&?;N&lUbD8iKf&_+7#L>*6 zaAtBmjnwnz7f2wY-QH49kk_EW?E}9(UC!#FH`2!r(N%@XW^VRn)oUQPRoN6Pns`=J zFM#+ZquJcX=m6o2tIgY6A$*&k=;F=TYPx1!-j0{=z$zPBsN`-bw?KzkuLijRcQ8k+ z@cqO=V~M%AZfM7nYTIyAwNPJSyqnb^tfcL>bK}`{WtDG%DF|Y)7%ss8Bp;qXs&10` z0sn#ypXqwG_OVIBYuo}!@CEC+fYcOeUL!$5An$&p2w;df7j6e$afD7O3#R>C;s!KU zsEFSpWzRhrR*HVII0wXPHORelWUl?38Gz^ztf$T_yNAp9>!{H7Ctz=JH>PU1Fs{d4 zAc|o(i~<$)UBzEHkLqb^Fian_>oNDTxK3ZT8j~T z7JSlVkgdli^Eo>HK}*}@X1FYWbn0JzRf3iCif6k=WLA!JjWz*~9o}?zb|kdIEgO4? 
ziR(;8LD*}P9Jr(Os%yJ&yPY_PYxhxmlj;Mx607Sa#~g^L2&4{*w6E@GgD6$Q!e$~y zg@J^#s@n^tIuxws6j)s2FB{VZb~1N|l3#xQ2VXXM9b4!eVd$p0WuN3!ZniTLS2b6J z9VUZs0#=fN#yu?wV-Zd9F1>U&c~D|=v($=V)n-XDq_;sgmR>s^E#qGpc`i2Yd&^o} zYW#wSN|Jk5!j!gqq+9{kx-p_?Toa+=Ce9%oTo6j8PEh?JI z9ZoB@@6E->@s!sYiOKNywD@VkHO4<@M}R1?O7gR@FWuLyLB4kwht&&3RfG1JjmdijW|V8vQO}{&4aP?fafY?Bd*zCJ9xp+U zWs!6L*D`xVeEpd6^HejbS-~Hhl9K?q0jjwNsc97EBONT0tjsydytd|{E zZY$80%#q;=+_CBUs`V3J+u{*&-na*VKo>v5dZ;ki!hLkeW6!}Oy3B%_&;`8j+4hZ{ zLG|_BF-r>7;RIrzChx=1(28q0x>zM3#0~_9+4)Gk<)rTh9oH!-_VUavmw5BC%#DbQ zd3+To9;Vb1R6hocI5>`XcklE5-3#XZ_^k#g0MzRr4!DIp=*dE9ws+(6qBMD|nI#C&Z z!jEaPWllEnX6|;*MGDuKpl@YwNb9h=xpcmIcIO00QE#tA7aj4^RBtnqsGB;P+bH##(-T^;J}_qduE2{G!trUa*`5keUv#_O9 zWO~0d5L}^v1nUMOyl9Bgm|0i01m8W_No+$xx07ARYE0e6>*YC9 z!}|fzUwN*|tr<#A=P3^xrzciT+EbFYT(t&tIRWCFBUq$3?=jpajT_dZ^UmY=SEnt% zXtbU>AyaH@$5aE|p|pNu@COzjWE9VJ?<-^w^#voLRYIa+W4V0r&LKxS4Fk>W!}T>i z22Beu6g*%=OzE(Dnn3<2yCX8WX6~W>%*rCvZ;RrjHfisexft=#9><>%2MIs05{kYh zhVrMiZ5A&>xnTPVbw3d#O3+pmSZHlgs|eYG^bvoHlVtU?m$ekp5*5yFNCc{#HCNJAIc=7b9$|I! z(t&G=j_O$ygN$11vE)^FgZcXM=qr}WzT72cH|gHlNgdk}*N zO)RfhLrx(g=cp%w%xE16L`QU4{K7Dcmxi6UKpGgV-evaOg?o9l zyvWerv%-yf+V)dqMza(ySXa)pqSC6Lken%!g0`%VFB15me8-{Cw(C`3tZ4`Bimtxu z`b8!C>p|c=|3K0hM8(-PkQ_lwVKIYUoq2<3O-^ebT6=|c>XfL3*C?+)XzwOLl8I@) z_MtN?MSmH^(QqpwX8o~>YB+8Qst20+^)+MOek#RrMuYA}d4*BEbO&6KnMquF#*g%4 z`fO3@5Y&+zAY5+YTH3Gg4#0ie6a?xG`!Ijs6G?Y5xZ+oqO z(~j91&TRi2TEWo9I7{s`scg7j6 zX!Mw9Lm0wQAtm?WX)|DA+GtTJI>Vc9H`uIHU!Y@7dql1Vc`(u|S6<58zEZ;f-2tlqRoAcQAf)(w%3f_C7*?8MX%{l6-udff z_#u;ThA^G4Lyj4*=bJyKi3hfvg)Pa#FhT)A6-&(EJ(7=v2mEVpcS^Fo&T=C-oTk)! zl(c!!rD?o$j6jwTkDw}~PW?WNh~l}a>!Z1q_B`scJ=|q}4);pE_YMbVqkgP8)Ky1? zcvP+wb?9cigupLuPg#H_J-TK!h43CHD~*L`--J47x)X|RN|{9It~J;_*kO z^Ihi|Q=ho*3@GabhxI;Tt8wp8pDY5!DLO*}6>tc-jr`^Hqd-E zH#G-{UJC^K&7tDHn=LdBuuNJXlbYKC&T^<)6&jRvSy9N)5!avsl-~~ZIX?@76jFx+ zKh9*xZXsa(Aytm73!Sk<;1znO^rL35WgYpxd^lS5b?Si8O2;Sjro7#)+uJ4i&T7

rkGpG`A_*K$M0LW$Jd{GWEk#6qZSd7}x1|BeG< zd5HjIk?5O$!$mDaz6fe&Ic`YjZOJ+siI3ZrKs;XBNCd+T?8PP0QnEd#$O5WjJi3yoe7)YkEAR8*ifQYGNzULd>y31H9A;x>ZauD3*MkbszX^j|tSa)a7DYost$RDm+NV5e z(ZN^+4EYOBuRnyI)q2m)J1fp+U&Yoothmklu)JD#mbn95K8wTc?}bjkWG@C|%dRv# z;uJ-5MZICkIHgFNE^8!9Sqnkd4%Ko=6Fcxao2ZHDT_1?i&hKf71B9C#<46M?iIozC zR<#u$f6{FBoY=#v$kaG{?7)n#M&qZb$J&SVMa6Tk1B z3o&}Zf9ptRZ={A-*uU-bDXDXBpexrtsLeeeHnjge)8Q4_>&|W~j8Z^d2AXaIQa+lm5RbZG(bs|nZ zN|5O`q@SL2xLF12+*%TZM9OpkOReJ@QKB15N+o${zywtvFOVj zwJYX0Yh33G-FiVz8CWB&dW`Lw-x5AjJx<~Ue;nCwfD?Hm6Ew|BqP#3P{zi8cH!CNu zRM=hGMsC^aclZ}rdJ+IpskE-UBRTtO8xNowdNp<+kl2?yC6+Nb(2IAyV&dUx8=2$r zHl-MqXBM{h5u;uGeDI)6;L(!+O@+ae&wUiA^cIiKOH>RiIS>3FdfSl;Nw zd6r~|SF+UnXrNl`Z%_>6)ffa2EX>=+?fUm%gpbvSbQvWUrToI#`53rwTXUK>Xf2K* z4T|23mbI@u_)|eDV1$#4g8Lsk*@w`!vq+d^42uE^8V+RFR`>p-+Q7*Mz_DB17>7A} zJ>!5&G9Fur)ZR0JdLkvu%HL5VVd^_G;tn7eyD!(I(tUj5KJ*466SanEy+D~kHPc+- z)gWEaPc`(G)vRfqe6g(XeZ&;S9-XUmqg$DIuk{w_K3pRodA-^EQ!-L_DyCU5bL zhcQ&ywuA%xbD<(U4BxiMx6SmmR~%mg*|`m6)zk`#?*|n-TVXS22bD z#s%P2F`fej!UGbwh{<{PdB?O4F~-*6_V(JcAtp;42OEE5ICZpD3PoDca?|vBTvl3( z*dx~z&aH)TqE=yAid-1kmCzK5_xBV`e<+PVA-TGey_c}oD-C9PM+>PXU;Q7MR93tf z1E@AgK1d3bw?=%I#h3?)t9^u_QWgWZM#cG&J%qNX^VMtovh~VFO|q4>oIW)3NV{Z@ zm)mtQvO#qFc6wXubMq?+)_C%8e;Fki6tJpFyoP)s6TAx|X+L-JbzWz^f~0wnQt{5B zIU@2DA~V;nI+ORO7RICP%*6Q&UdogSSDvkhJYa$l3oK{U;s~d=P$bA@dQ^cgpB~IpSU@y=i@npF~{Y%VdIdd``eoaZO7>yv&YVqG!GZjQ^CIgF%nAPmAoeyAZ~Oh))+?kcUwOt7%QcKbZ&eS0$;o>?#pSz!uY z^_X#Jb26@ilG;_lH3P)bM$`{aJK1(JB(k9S;+;3PB6w?2wOh0c^Z3wW@|vSa>jwAz zt{v!I@pT;5_-FBcfyGPG;(U5zst(zq7AUne(TRi&V4LK!2Pv3)$5s$g=&A%2shVNr zwU8-L$w7fhG7L|jWZ|a%6%SbS^egExQ7=H1U={6fOh^%YPml8nLE~wQt=fY)o)!Oe{Qn0JL z?Nv`D7MxE;&(Qs_a=o z`661=9?2jc4ho6rip((onY(zcFyL_be`+9A=_`u*RF_~#BOu$oqy*h&NvDvVUM)=9 z3pk!X;ojfI5@Jy|6XX-A>f^Ca1q)SPYP&%mSu#{Pv)_}2$qDPXv|qk7+0E6}H%hAr zwxT>D+JBaOsPIngZB*WA#Ju3~cApg$8aBQ_rap`ikTEQ_V`0R6h$}R#1WAPhBo$7C zhItjK$)6Q`CD!_AcM1CbjR_IN@X9rDXs&=fDKncj9$L{Cr;;obkMVfdTv+v9rcl=#i_=0~G%>ME_pl`n z=IpnRCqp_ayCkV|NZ0c=iT@im6|}K_Y!4ih=bUt$1xc{{srFFjFa%%j^QFxG#7(%I 
zOr|*M5`7oOcw_f!dmocdz;-Mo3-V3UUbCF!-HuV3xX8>6H+}JO_Z+|=<=VKW_>1B| zWT=@L(+Pf<^QS)1CEtS(3WruJ_W)os(k+wp<*WyUY9MKL4AB1c8(?o1OaENVEcLXp ztL?@oZ)GbMWUZ@Y9PHe&M?}ayg`=Akcv$hA!>l~ zdKb^gxqWNJ5|&bK5Tf<+dK@3@B%h)eYFA#WILO0>u1lW)q(6-B0N|>bCkc6HxXR$0J67NikejCGqq* zt%Td9`GZQlS&_u&TkbpA+RgV?MM=T=j~fbgEO{I-Bru}()sZn5H1E+oxx&n!8$UCZ zO*(HPWqoXNw+wu-_pJBUir&rF=H^2bi+NeWx)@}CwFNZFK`LBtX&w^)#XZk<;R+?U z=h3X+*U<#(#Roa8pEqr=nV}r-Prn;4ZA?_k z4BM&O{=fpeBy1wDbLdb(?Nr6_7c(t!)NEU@57DDe*(Beza z;MD|J2_89jJYQ`9B4HvWb;3vLb2;NbY~fpHF%GAED}{q*KQt1BFD1+m;%X#1eeIwZ zHhU$NItl4$Q2k8G&L2KN;1EhO?@_(4I&C2K_hNX*SIFc( zRk!(Rr{QxtK@LI#q_Y;-NOXt-!6;DMLV^}65$gtlk63?k7&Ngyx)tx*dpR_r)(uMd z>~70`qbo0f0tC`CfkQJ;8k@E-tQX=2Q-f_6N@Ybge3Mw+R$N1mOPaI-C_sI+(D^Fj zPuyQtYx&EVSPB5;M2<-`OwWR6jyTh$a?)CM1Byma$Ty^elQ(3!=Obc_oYCsiT1av{ zg7*&SRQtO(Z2FnNN{r;2;Lrge&Di!i2ye?M9s_A)r$2lJk^RX~Vz=;Q*K=#S0MWOrXVqh$cP&!%%gk^Ghj{1NSZ&tz zifJEGNgRx*Z3pIjv^|WF{J`8?YJs^E-U0pMcEuwtz273xPS2WbpPsxgi z6e?LHhsY=scUXg@Z^KvTzo%A6)$>fC?vbUFjdeKT$ixEW2+624c0DVbaJ8rv`G;`UWIj3OkV}KtZ@z zu>RWKX@A;RS>cNZaroBqe%sJTh z>B3^vfDfV#sQe9dt%-E2QhP2ScjSVsO|YxiM-&wh>NSm*s4n4R!)addP90&ZoWqlu zBa^e)XxL=puF+ZoyWp?r^)bWN(~8;r(YT|hS%bw`9|+0txz0W%y?6&A=R&U4YZCS4 zcCxNP*bsF!B4)TY?6MJ^LKa9y z_c!TR{;a(hI5N-RRAt%_gVhMJ*eRZjDBe*h{D}Arq``erZ4WB!DQxfpCJ%7JWGth^ zXAeFzKE57Pn#^0wf}x1@6HOB+DSo({PK5z= zbJpQ2I9}AGdJscgbOfR-px(NZD*#cnNwG(B-j``l4hMGz))qmK zb67tQ_XkvZJ>=UqP&4wvFD^yP)w=@D`{|u}5 z@rYiC3nW#l|jxV|Fqhy^`FFFh=6;BDxIEB>ZErg5Pf_4mBmc*S?*-o>u0Pf0JFa*RXTErsWR(GQ`Aj3JBrbIv%vc z7NGZe%Xza@_mDjsNLjX)HO&moYak^Dy1f#O9bmE&S|KbViuY>|^RvrR$pZWq``t>- zF;Wk@lJ{VHiW%XuksGpS9=-5 zav({DZnrg?)s+;PWhO`mAe)5!iT=}o`lFI1`0r`}ia(>jiTAZ%mg>wm41@?NMJNzH^-ZHA{F4`WZrKLka=?+Qhltz$l zq!H=v5)hE?ZjkQo2I-Ink#3~B{`)-7t@nO^zg-8z0Sd?M*?X@w=Uj8GpZd%N!-n=G zFr4#$r6PgMJ%H%tKq`rjjOW9e2Zh~@J>tOv$3l8P6RBlI%L}Wme)N!gYD>uvpiT3O z{qakSrPv+GcJK(6mnD`jFIS8r1thjJq*g=(YVFrQ{99!b&GyzMZb-}k8W89-IO+A$ z8OR6y=Ah!WPeAHrr976T353-3v0v??^P1u?VD^f(oUE3jf%x~^4N!;#`RQ8E@vOo~ 
zY<-ht+5?W>Pxu+&?K>Om$TnokXSR9I^U=^TfoX0-lB+Hzaj$7Mzag zS8Dge*HPk=K6Zr+hx zKz5qDh(Kf+RUl~e-$H*MPn#$T$d^XGbbW@_bz^Ba(Y*RAgBl2YLgGqT-0x5!>4PC@iC)-heAv>UsZJs zbkp!dyiux)N%j!;H+Tt$k#4_d%^4NWKR5mF)F$sb3;K z8Fap&3{kmR%yzJvyXyo$0xe#=W!;Ye1Uji01ZL1LooquHc01=00OTFa%6~^NsJ!wU z1YDguwu--~lf-5Dv>S!RxI||;gX=E(-RbT!HUISC2hf2G%tQR!BBngp8j!|#U^fGW z>pn~10%>Jn`w6m^gl;PFFD}SL0~_!2)Edsp#k;tc+HGGutH8AAO?@-E@}~o{V~~A2W~! zz!4g5)3vOpG>^Yc&$pS+ZXZ2t+WsUVlw2Y!w_8Z@Qz`nW0m&h;U{A#k;Sx<;RkWt|;=o)9*oL#S$QcSgPVHu&ewv0+wJ4cpg0z z^tT>(Uq6G+h%_3QQ1p;<4&@K~z#5`@{8VOe@NH{}cNgL;OH2*=T&a8C@ih0m2rAwE zb7T^=@%*Z0o`Y?SZos7uY0kb@5(OR^1moq==Bq`EhC+u(8!AhCxnJ71Ve+69pujI! zBw13M8Y$FOSX}&%AwdyW*JxN82#fez5z((4FCg?5L}Or#i;u0N`XRFn#uGvOGJ|Tn zJ7Y^{JfwNUNO{6ZmH2Rw8U zLBWIB5F_2`aFCa@jwasD6?Q$jx82brejk$TUlW6Wo$g`s>vLmzy;_#{Afo*NB|#=N zf}h%|qpO0=XQvD%{PGOj#w2Z(uPZkC3%PR{CLzr?|20I2_PUSt@DcbM<{(`+kSguL zH~yPde_&JE0&QB;?sbB#MEONu39&zo+q^!@rnb-mvK|Yu;6|9S)oO+tXj%cF8GNA=vdHy6tQ8{J z3h@TV%MecqFbs8p6fr9=Mm~cpv~ijA!`w_#xf~GHbGat@XZ79FMbOCk*IZ?JVG#y$(_v2IxJznzdsPXDO(~&P@8dKx}$H z4bUFUA}QaoN9?7)t~eQWJFLBF<)nO#4F!8!BmM#{PtaK=3k4{4%oCPuZ~e0G>(Pq_ z|JD-`7&h=lNV4_i2Cn-FG9gR3`kPJH>o^+`z*dW2*PW}urnZ7ckAigA#&l+)4*8~2 z3)mhb{tA$9gOei~5$et$-Gz|eQ})bTxz^oUbTH#!AGB?=18G|RUFny{#-&rx0#saF zTRapptnZQ5Y)1eA1;y(H2lk-0*is&YhU7>ODiuXZmoMm|*oCI%|{^L9rDORmI z2CV5OMwVIAvZ_xD2d`sj1$Y$yIep;hTmVOd*5^5%ph!U0ky44Vy>U|}%_%J*+8P)P zXTCXZ;G@Va3iNtMwWcd{4@%?0j-+(>Gf(>s#r6|gUSc7PFpkVM4K=k3Up#CwqWFOXgvi@<*b zTXm@?>#!=r3y6|jU%mszznuV%(V$oGd|WTja;1?8JlRRbT;J3L>Mg2AH@Ko8y*$+} zFY5TwJEz!3#UlGgLY(Arv_TE=zPSM05_85rz#T~4VxC$d4A5gZZ#YFy~Y2M&gHc4 zd3;6FS6i{x?m@YkIlL2|5*{8tUsgs~3)39aJkC0=MYdU42`*1$=skRN)L#`XQTVXq zmkxx90(LIfhiNs}e=~OkdHMKG$81cmM%wSk`0$x^8a;3A^EAGu3GaE}a~ZDUbdxsF zPVL8WY0hRasFzE}QV$Ln5_M){e{3m<`V+`53NN7^zc<3_&;0Sa2ea<(dUrCH+tzkx zEWJJ$E%UDiN|{Dgcfu4Z9-H&w{Kh~eksR@eF*I-IdJK0`I@Q5?j7E;WFPx~9FLEhH zx7o}#VrS=o&bO>QuA4Jrr-sK!Vjip&hOkP<)dAsPt4jYcj*p4`$w#VpD_h;v7JMY- z!9?$@OT$CH8;k`P&Ra~XnwkkJ#ZbxOW!aOtU(5kZo@S%%S}cvi;(QwT&Q;^o1SV~S 
z9c@Ix=8N6Q9*J8kM*2?HeyXZ)FN0QE=8*6jLm8}(lD^1{PL`V0qi|_WQ=B(k8a1Y) zQ1}d7zc|!%mg=pR8f+)gLnk*&(V=*ZsgL zQ!7|^Tnnq@k4jR+u-#4?jHP+W_xW_a&ty2E&HJ$#8AthtoS(-8s8ooGuT{&<)2@y5 zu6`ae&hLk%Q_6drG0$!i_r$XuGhZX-kDconLXbvXiH8LRO<&24nuF>``)Cm^z(>O8 z9L84q2I>2FUan*$7w!4u{q;*?ZvD4*6ER)V^l6O@#mq7Rwxe8ZxWQ7O~H zl~AH5&6+0{|4z(f|6Y8pJ}#N%Q@ij`$dp5n?7DFCioIUGtl_W{D+Q7U>B%%`ps^q$ zs)-UlLF)XvHn$84qUF@1o_FG)m%e#a&vZ%}XFaEqK^BGhwkoVe~qIx!G{1eEq+#7-&v`;|y z$#7>h)n%?+xBV~u_aHHsw88vB+x))%@3;rEW!llrM>W*`OQr#wG9&p^tDUT{r*A)m zHF`~R_3{`LA|$YUveq1oh~)(%wfxP0{eUkMXEp5YoIPA=^9TwGT2i9(E~Zw<2ut?| z;3Jg@zL*xy2(@HD7#(3PMnk;ghCWu-@RKot0!%g;6g=K@wC_0*_k=c=DI@wedtkZE zpZ4tJ9cqiw1STT%*K9Mj$?Rq+rIZANElvlf#kl?0d=A18=-i*G&y~+EWb)2@s@GD? zQabW?T^B)@^PC?xk(GJ785wgLW(Ed<(RW|=fw12W%=25Ikgoi3IcXm;nPAcCAs9}3 zUGh>ST(wAvs&GYrlxd5VNO+~rVs5=eLMlbSSm`i}V~N+w-E546ImF~mla(|h(HImG zTo6GA^>>duS;2E?vQx6t>)nM}5W2QS73e1_?H4zU-Mf{aY@kLtx{z**ltD zPuEK-+M5k}zD3m@faa=ifA@gy^_4^|2%SV9%6fSbpIV*I`S9~(j(7@(c^3Rqn}-Wz z*;9sn-;-r3WJjo~(1V+0nL}o(Oa>|SlYL`qYMEqv!YEfmec3N5OfTUlkh@e~duMrZ zvTvSFRFvtp@yaGJno`-NI`@EpQIOH;YHJ!hIwn#ByQJ=J38h(WjP#|$`6#7=13gSV zXgIk?Y%k1K|5<`5I*rFP2Tx4FylnIo+$Hr=jnjwQ^9*s+Ilp?1wq!r;5j1ptJye<~(W5%L4t21I;QEXy zF)U%aST78jC^1O=7V9itcvuq@+MI8Xnl3r*Pt!4WpvIAWrL^6vv)_`bVG?zB9hHhA z6+xuD(l(jOlRh<=6?F(RE>bN?_%kHta1KttL_kBr54mKvZ1ebIE!Ox|tuZYJ%fUM9 z-@KOdr%YKJe?5V5t)64K1_OT#Q2{ZGRlGCVsNXJjHcwdW8!GDzic~A`Pg_W6Q;XNo z@Ziy3;ovkYojk%0@TDA4Te&lb_)mD~U|9qa_fW3V!YDqOks?OERa=9D8GBH%SUVSd zuwH-Ary;$JGgxl4n<-X34~yj3p06@;IbI3p(2T-f-6-ST`U?Ak632O-KL@=UR_NAr zJmdEb4=nvpmBNo}Juih+6sy0pRS}QJ(nNlbQT%~o7C;{Jmu}V%c%wSswJ;`gTHsk$ ztUl5g*OL6n&-q^1zS%oR`k;5;Y(cq_>@aXRFDBbW$u;%6K2NckvR+&iPZXvf>7$=X z8m{uUFev^QQE{y5eJ_Z+~?2e`L-M5hBc%<_>eVHkauN%r8)f&=2XC;k% zBV-MUn_FY)+Pb>jP&l8$X5wwij8TS7EO2iJXDU8*DCNuCO0TAAYu1>4NP3%%?54v_ zp`!1onmk6#KAD9QW<%DybD`Jf-c<%|S(q!*llu|-%XA@TaIFs33(;|XJ?0R!uZC<` zh)5uci+)XyPoaQ1M%6G+9?li4+e$Pe5DD1NdhZXBLrMxWXdQ{{Skmc5o{ZBz!sUM!AI#Z#_{y+Jo>&VKlDZ(?oeC z-e2wt5?V`uF%PAF8WgTZzb326Pekqs4-O5uGmG#;`I~jP;b;&WH+1%eoAF8vJjc-T 
z;Beg>wubV@$ou_^q=zicT>L#TOqK3dP?w!G~75oo}yM zz7R4Obw&9KUs!xa)e7z>3r^TODM%d2I+2&(elBXX{@n~v7sz52^46jk31tA`R2dDR z=Y{`s>J`ALf9zai!m@n@PJP-eMVzFrOhFyIF*TSF_R}W)CK^ zd|JijJvY+Ndy~WTj`&HLgq!rSqd~*E^ylsfv)ZDuMQ;|FRmsD$AYj9R{Z6~_qfXBkI(#o^m$FXisO-W%d<X@ zj6=ZrC2s3*pRHI~-_Eu0pl(;Inteb{r63$O;_+R!Zi}wl+*D_8+hlP z<99mIzapV}>MzU)>+(#x-WFOEwL0qKi*adhXIbWk`>B;_&6H~X>W8tSlS%*nE0DcA z;kBIu8RYu6Ns@a}{*X;%wnfP*l&4cU>**iB8!wo=*-4T$6!%x&i_wT4VN8H~z|Mjh z7J9RSCgAt^!2+$(HaLn_F-KpiT)ko?OT_=M__R9^1>La_(i2v4KX%v&9n!x&lKiV& z@RD)B??bPq#MxIITtl4MfKO;7!!g&h??gmbndKdUB$;i@6+T5(rD>3q0cS}Dc7};SGngLc z zxh$W5X>|np7HE5g=PizpcbDvDW3D-fR&uNIyKzG=%Pv6i6N^QUKfjKEg&+Lt6r=H+ z+3F@-LxJBw1N)N1O4}M6OITGMFb9(q%-e)Jsl@d_esM6za`J=6;jgVUqsOe3tl@$! zf)wdb5VermY}L#npdvi(frDvuOEHsINdf!EdtdGE8dZkLLxxPOVUB{27s?)-B$M|> zSu|#8g5Jj~0^>Im{e{Z&*}7xPhen&KOttA}K)97p;h?K!`?)x?cMp;br7A)xCRkC` zEZ)v0G78FY3TFwXTV*s_NO+#{s}@AZqs1A#paLDNr5Os^AX!5qgKAnVL^g2YzM`hh zBvY))JqEY2jN8ikS$S;D@Vs>!ZsS^OXj< z65%78ALD0NBoowR&|fm4eG7-s7*_J8*k8iZRulEFuGrG~T%7H-bCNAJHk@`Rx)LP7 zyE1=&-%VA;BqpvM+(weelx%>7hzrSf;u)8($RW!(x`QdDMq?I0Q#_Y~JHG@V)%0gn z*M@u+Cc2J%muhhvwY`=Ex+P1max<@;X{=SI|Ds+!E5AW^04U53_QRCn=A$RF2L(+L z8kC_V$wKxidR=_dw}=PbpL8rb%jqJHPWf zFE5HW+zfyC^Ss(qC=*XFp`;~+%?`>Q!wE$bdpa%7qkb#V)!-{q5g3%LFk`#bGfZSY zC!*fYUO$Jo;@@%TYpWy>d>1%eZT#3vqip7@9;2yvK=%Dv&#oWt3p)^2_l=uA5iFISLtmTX4SwEOp3nYt(bjOHk4|Lzvy1+5mRo&AJ|&wO`p?6K zni4+M{i~y;fe4aVIAW}i+2N{l9Q@3-wAsyP6ikG7f-UIj6OhB5P0=Y^1BsL)_1%7Z zqgpCkL-8d(&rj%jXGMZYCG1j0#DF>uvBYq#NXCNH*;hFAVBlxc8F&}$6e&s}KHg4cX!&~5xp%<4a90R%X60w~idZ7)sCEtVh*BTuUT?9qs= zB~F<00@}=)Q^s3U7>XV_rH*emr+Z70|Kd{ zwY@1#2V>xw`*YpTNoa*it?#dnEGqQDp6zEefTSzv9mgC3iyui9i;^7orpmM%nNqTmJr%!N{B1J%Q$RAZ`Ai4x5eeynqJm7B!2&XnTrDE5XJPxziEgifvNKS66jOv z*&a=mghiMC41=d%qE^mxtc`P{IuYyYkGzRuK zj^Gem+eml??0)!O!CY|Xh!l>KvG!3vSHR4DbxSA>wObnK1gS=+!&~^qdL&>C;mdN%|$v@F+R_A?>X-g=Q|FZ*0MX;qYp2 z%ycw`N;;-jN@kox<0tc{cK(yPQnSGfR`dq~M6wyDDE1e5N>&jlsfRpcR*@l?6#Y5k zp-KhvRvv?TC@Swnx%pdR%<|sA02tv77lGAa`=k%p9V`y+cz$0fK0fV7(oXy3Gy#x^ 
zwd*Z`tQCoj2swFbIx%F4y;y;P!Wg_izp9Lg!uSDf!C?E1Q3MIvY+rJ^4NWFE1dHm+ zLUn#C2W}Fpkwh|^iJVDQ+I;dbofFCC$Q8YAdSm5aLVQF1<<;zgu-zjFt3uXC2O>$uX}U3NkF#b1kbpVhMY> z$br5APyz&KIFuRuo0(^4vC`g-A!Z8s2+B99sgtY};gImIfs9n1s-yG|gbs>y68*c} zbYvx-#%+hktTW%_VCE4To{+yr(F^G+14=xd70EVwVy_NXDv8mgd7E+lk=&aj!1r;F zTDWDnFm@(+KmOH!A}}6KD09Eummn1`&X-N3cmHrn<-3HV0qnQ2EI4Fg>b#`sO$(Am zW6U?Y$&8w8a(Zb3#f4v-D0+dYt@x&?Km)=P$Am0qOHWeGJs7g|-Uxc$+?;NNTcBAc zcHP1|A1&gVp|7X$&E(ii19>Wt^@laIz6gOS8MFsPePB^YFntL*^`nwd}5uEHcRT`}W?& zmfXpKINW-s7#ELVpI??Y$$~OR#5~ZS`x`pRD`NOefSsqRb{3#@usqmK-(YW|pRu4G z$N08ENOU5NY)ueEAd|U-&saS%)h+1B2F5Ka7du{%cLZs6TVg9aJBX5P*nf zM@p+qUX;2aipovv>yw$nX^|2`?@j}u5Nka_{(ID6vW8(mU;prd0Z?k?n76D*g`@!6 zBNsEWe}qBT*<0t+&fs^0u}T_zc>D1>9(E~0tz3sN273XWgf~DL(F+N$*mddA_cPIY z=N3go=ui}yznTddV%OY?F&IGhgKa!YN6ojv8eUYS7gb@(PT9p4M2 zr2X!=m=ZPp?3~TPSQgD{I0VW0GsXiaGbo1gqooEGjY`s=YUVmvp{D>Zp1Kt{N?Yon z{-A^w2k)X1Ttc4~jV)jgppc+7UO`Rcy!yhYK=1Z!tL+J~OQd*F`f|#d4w)-M|)3nQ`Vt0dImpf{!gSf{=CBCl@#JImuGp5BLrwE$AMH8~v}E7AIQ(_zb7X8Zw~?891Rp3B{k`A;E=%fGZ+4`aKM5 zb+{%0$5UZfhp*0ZVJecOj;kW4k{wrLpCb%hf&Il$gN=G-H4fzwy!(}0A5({0nTmgO ztedYRuKo~|ii&(ZZ@j3I71OykK7Jh|g{*2{?J^8hFW7DFIw z_EaQ5)V9RmHrMn&a#2(GbwWY|z=Znj(Cz#de%*FXKyo&gi>Ft$t*&cD^@duBCca@3 zU}Qu^`5?}r!%a#8>LjVE%@#_NtE}3LVU>sWI{E*UNTA2|7cJ_slUHIP+f4iYO@}=9zBWgh|atbH<>4foiH4HPx8X;QUj- zyhTGQqz32zOE=>m4(KPXgDB1~(^8h$KQo1%f!Jo$8=NhRF*1xzBfs44#h1kSs^|b1 zQB+(~>VT4}Mgv$cr)a;{s~6s9qiHR%NwR&z)yEn!*VZ z@=KXeFSYghIvfN_{*?P(%p4*OgC25e#VuE|CcBMqIo(Y7MD*IQ!!X)o!O4O~{iwqW z;{A-8)wJ&U0QVzSvLyXZ2CDBxD0as8oAQYcpJx<&=G7wAQjMxYS=>wn0MS4cQkzdQ zN;L-gA#DPWIYhw2Pu5U@x;Y7>7~d)8BnA!#GYdu*Qv=T;y7TGrzDj!3(9~o>^vTGV zC2it>r)w(!WC4Jd9hQ6@vu<%soqz6RRTqRNeqGznbvl^UvnGg83K|AN%3CMu3wemh z)l#ZRgXgiFwjoEdf7TyP++9pwM1uey=psj_^=|5*&uq6P z&YM4@fyi`{*nKSEL=n)_!{WMfAqrVZXqFb{D;jrztgbTHDiF;Z1fi1L7cP)}9m^Qu)AMuY-`{WE73 zIo4azRZxYHh=Jz`Z*J%!eMfn+PdqPTOB4stYiH_q9o&n0e0zFb!3;i^F^2CG67*K~ zpTQqH-{@&JJJMzF&sBW7PU-Z?YnDs^!NX^|m52_orDUl-XIB^+A*?n3<2uP_{f@AA 
z4@Z$tU4oDdg2FTd5PLhgv2SZ20N9r4m6jWM+3#cr$=}DadBSN4oqi;Fj?#x5PjLfv za2a=iFzPP3a;3pZ`NByFOG3%NvtDT}3Prs&K~!LUX|Mz!d8of6(}UcnHg`^$){MX? zrV?sESgHL$4U*_@egK;Vs7Jd>z-M0pSk;-$`Ls2fitk=BU8MXXTS-j>hEf>jzab(P zDKzAxQeJ-|O9Yg>IpT!?-#?hEco!cSA`MnRngn-}yb0paNpj%xcOS%0P4;~$sH{f) z0rvDq8$JM0;kz^Nqn6?R)PcOP@PF%z(NIWH|MlU^fY|wjzl3`F2hf!Djzy@4-k#44 ze{+52DFdHFpQ)u|;Jz>r+xdHimh1OQ#DJgkJpiF>>yAtq>fhJ>-@mU~!aV4$9X@3{ z{_nr}XBm{M?vPC3jfnAIYxw_u!kr3&_SoMyGLrwiBme%;|MLitdW7cWC zS$%?4NgxxuE6C)!dtCkdyG6ZRN4&f@q$h^@!z)TnUA6E4Vm!PA#$@}SJ}WKP!}*^& zxkb9+=#=wp2`xiM>$toa)%qeZCEO@*Duw%ltR1@V! zJ2o!f$0cr{z)7!_Clxq1+;(ZoNyMqiZZ@vnY{^4E8pHD|;lqLF%-yOw+Uq%0}&C}H1#cuh;j4}y>GQM&(Y^u8Uxi~c8M*03;rWQ}Y`Y9_2iQ7(xBY4Gz(IPMm0CC1NIfqJ zbe~3l#uJ}Ga2(?4sfIA0(ONrcbz#ik@BPg^0Xtcb5sb^1@E{J1DeX)q#k2tmmbfaK z)c-&m-*AZPfJq%n4vl3?3w=1ZJ5gRI-zXq-WHnoT8PA}O(cv|a*xZu9(&DriEHu23 zt0^%RY)Ue;4bF@PX%xBJ*>T5hw`f19*%~(A&rVH}e_vJ;Mk2C{O`G{wEZ7C#9;DVZ z8kdGKl7J}n3SB;zzpY|$PoUtoMkv|n(V&Z_KSG@>OTI*WpdbAJ3ZeljvO6*!Oq9dyvSYjX?C8V)TF$ z8b_dJhZ)#Op251yb=tnQ#gYR3(kxgX5J)+D$Mz*78q%d_{4~Cgq}*&Aa{Y?yX98#F+*GTCRz`7ktyoZ`Tr$_jYIYBeLPqzrn9UDa{HLkV zre^2;C^9kkk?#5=MZe2kG|t(h9u1mCzh$!e7j=qfn=7|*AS{w zP_du{f}a4i##V5ly51OUUBuN@SC3@yIC-kU}Ppnf|lj<-sf-{9SgH(-L~NvUa8IWSSdF zfSS~1cl+3k#hlermC;@fyn#VTv-5FaqH$Tk+j-6Ei&)?TsnBSrWK<+SV$x)+^}_&8 zlsEfxy+XT-&feFsPtA^Bp1ng&BXcR9h4_Ee9nU)*w+9W8-X4#ow+J2m4(f0s;))>Q zgc-tjV~B#fNJ1lexAeI28s}>dtDSzbnum!D7A;-x#|MyDh^8x2`Q6wQx+Z9|w>!&T zwm4nMtuj#a^B=W8z9`r2h=3bs8HUc&Q_e99MBN%nh(^L=()@O9>_!Lsc1^RZA70nCy@K0@P1$6WqP$>-5sK&Jv=1yO8X+ zhEzo);z9Fk9%D1Qk$j8K`FB-lCJa{%*^Mi ze=7X6r3=UF^5rFb!dAO)bCXzsb!G5$f*Sw_b_gZGa==x0RgReYswZQitzTnT@-_>~ zujw4`^FmH-Y=OL3)qPL-8_95jbh=UZY?(~jG7KZZV4Ya2<3A^>z#&VU@%V%pi0XUKePisdcHW_6?pXr#Lp3^(@v%^HrZJw@(E&VMK2Q=23guQnaL*lD{LdiKPS5yn9b*SOB`G96DMFR3B!+^*{%)si`y39!p$ zXw{QXFjb4$w`pg!-26P@0RU5sNv7+}g0zgQ*>D0=Nh!k=RR72lXK{}$5?vzGC+Ig0 z1XayC6S3W$lJ93ru(N+n{|Le6UPz?-XEmDg;;7l%R z_tlmD?XrS>j*~AEl13rvygNOXCJKzmdsd}p`0TUA}SIT-W1$wQt0 zHD;4%-%x`^t7f4-bTSn5u+<#iI;W)?miXJYp&#KK3j&y`Fa%Humf(L?o5#f|66 
z-!Oc^>AIQ3Ut|cb^4|EdBG6(HZq|2B-0&Tbt~CDF;%;?L|dSwz$w5vIZ|xKA&a!Jot^TzJ1;0Vsi?E;1DT+ zMihYYvy-eR;0oEM;Kp#dQpBrB@#}|Qg;g63R(}jc_$_znZoix=QGH_4ZC1H+T&sp_ z9ReA#`lGAD!zSl_BY7MTt?f4tS=ii5R;n?413skqKsgbdi4t;nFVp`1CRh{G*^a==n4NRyx`su|UHMLi zfq?6{RIf(9SAW{vh7f74=(3(byfSI2{xP(=B7SyDt7@OUQ^dI0ajgQ6h5p#ohndJw zk3ko!7}~KgEJ)&qdFV^vc!Xzl*!C*b@`FF%$!^0sM}G+o7x4BYanA~L3Wd(q8*by< zslY{iUAjuM&Ko$oD4{=HUM*GGNW5%u%0C>c=iA&#rG!7+ z9c(3(Eg2$j{0)#+{?d9$z49y|i=ieIC8R!vuipaAxs|@;BA+3czB+lOjpF4uxRUGg z>GUK()J?cUkF&Ug?jR8PrH8uHtlnAq0xKi?9TMD1q^n@9)wxKBdgulZ(&_|$?ZLeG z$X(GdGjm)GzuMml*qclnF{Gi$NlIAH(8~6AEJIg+=uj`o15wI!8=xeJd zVq0jw<4CfCoJun2LzLacwt}03Up^Rd6t3c!Ec$!hzf8x{kTO1t1#S(4G~hYRrFbuC zB*A6%kTI=C#-?7Lw%AuoM^+_nI96(Y%vD%1qn8cG=#x$Dk029=;19FW_jjR2ozi-? zHX3}tMC)sXbbjyAA%*XPO$BUI-D_g|=iwGB#ZTYn51<=#CZxMyqkkfXCh@QrgcGr2 zeaZ-|rtrFR6nk-r?_#h)fmZE^+ja1(-zjqqN~YCf4exuO#Z|C@zc|}xgf)8c#J+R; z;aMwD55$?y(mmzcMFz@#!3dW>MKtzat)HvRoHtkr{3^d;zry}pLG8(*RnS&VE$#2_ z6o6V;f3D(h=wCdMW{BrnoJo%R`pN|upD+jY*S3=N{!*7{xBO6pp00BUdh&8_*J@2t z^9Vegs6-{=7mjn-oiG@Ou?q8}&T;t@+`L%t{pMW$OT|Z&W{-W`4joAarZ)@g(^*t;aVgNemw zmm;v!*bP-sFU8VgpN8)dE4xBw$$h|1Y# zSoWr@$5cW`^f=N6mwe@)a3+(3rhn{to>10n<_nm+vPXVk4>&cx77#UfvD~Z&Xy&i6 zI#cSlURaf=R_r~W@Hth^+63QFhp8+3bBRDB!(zkkdg-TA4)HiYG@iovJX3V%Prv)) zy~BymVg2^>#C=%ZIQDww@*z*BQr|mT@R-3(CV{qmI20fC&=o6sXWT=m?j*cewYFF| zx=|@`E`R02&Mz7Dz($>>FX4!_Pg#NLZBB|zK+HFzRV6UY)@Zu<5Iz8D(&JWyoW?9 z^iVe@l-|vi1Qb*8ZBh)EA9GlP;NhZwyHalt!cgKDrNqfaDrc;&D-5^nlCymq2GwGf zNW zvc0?MI|A&V$8pQBhysnvOA#bP+=b9}9g1bhT>ccGL+M*{l-YIe<0_(<0;SW!tVCj+ zjwtGp&)SzVyPE61ci%+ei`u5D4FCY@oPlc$oS6AN-XH5Li>QCE)Kpy&(r?&>j88K! 
z0qWYfEbVu%mTWOzqJ-|-%q=vV3$(W*BqK#+;&a~doYG@NPv~jbb}sI=9D8%S9}+Mo zWKlt8*QNieiEyaIbwaL-v+ksx z!dZ%t5pO7NT|CtDY)>XH2tQlOp-Zw_%HilPJU&yLwx+fzuO+Ry zzB)e zphzJzAq%9!)fos!uKBAm3NcjZACu2fw@r-eRtvSs_)0i5mK!<-`nSyYUArG*8P zv32|WR@IJMy^;8z@QqHe5;)1X`exla!nS6&Hxs^!SExJ+cxrhtim3BM{1lp6g=hxNwl%lj*lvE9s!inF|KM~C=q_^ zo*(UF(M3UC5AVf8MBVcP8PzXL(MP z{9^rV-mB|DgZOO#K$-W0prn6A4~6qWKLYb>3y?zkj4x6~)bkeFg~gS2B1T)=zh8#E zG)DDEpf+mmsvLZN-=Se5mrAy8zBcu2eZ(-Z$Q{;VW-57nfi9QCEuN+a6Z-Z`MXfCX zC~A@duOBxADlW@H(fdO`x^LhrHWGgmkcz_9==j|n_}eoFR;l3X>Fbv!kF1Zo3oRA3 z`l3ecc?y|SvOlKB`TfOG+V8#biI-EDi1jj=AJ0(aPqFoQqDV4>Ps<2=9JZOH5*YWg zpuL>NQb*LtrYu})-I&NEP(!s>H~K;ukRKkeg<2CBDGa8_iFt+zp8W<=6(0)R*O?C^ zPiS%`j4y_<8{T|^BS$K=shqUApm-rofJL&{5JE(Qe(0_rMfLsK@z0Y?#&YE^51m`= z=WCUk+49?~Ji1?}){$&$etL%*z1hcxJMurm2%Y4Y^$&kNJaWxv(hm1kUE4B$d$gld zqEv|CW{n_?YnX=5Gl`9+pWvk!jOt(chmG?1DF5)>9U47<;}{I=2-wHXKw&+XTczn2 z{Uj{QKm&g_tgzOs?H6XCtGFmD{z$-XifY9~ZrOB{Ax;>1N*_X=)SrJQx2GyUZk)kN z?_KZ_MTZp`KH(<0XT|Jv#I4O!li&OkgEFw&vo_6^{h7}7P%pGw!^Lc}MWu@1j1v7v z)f<$-Bw(}5Yehwog`?1P8~hKcA6J7I7HAvRG*K)zdfA=b&Dhj0d1pe_)YTaXHu^(8 z{vfTHXVlNfv2iWU#jKZoNv0lLD z5~H;U;fo+c_8;~%P{6&TS5$jOCuPK)Hjm@gXTzOE(13U_T2)%w7)XP83`>Xe#@Hcj_ zOu65-W3wZcF9*V)u;i1={^sR$0uQX$psZ8RmuX#WQohH}7|;H|ZhyhOflOf~p{&el zx86(6G8REBXdG2fJYHFp-p3`6aTpul4QyohK5J+D9Z(w5RO(;hwnwm-F5nwQsOZPu zx48XkNjMi;4w61TesV}|3q@lrTd{faKDe?ifFD;r=CG{x92}TZj;Cc3mbPuB7+D9|BWSA9 z0M87b@zFStcH!5*_~P)-bNjQ0ya7w(APb|LE&QI`!>7Cy-ix=+oSJ_ zBM5oqT;4^=Dl4nLrght!s;Iv%4MaK{|MToxPx5M~)_h8*)e)&-^@C2q9`@eak=~di z@$R-}a3%rZA3?^TX>3*QLonq__Dki^M%-0`E7kTX?`hR9O-If4FdZsz?6zGa30Z$# zkhS4Ysi3^;PIP|zteRE%U?8Z$qikY}t_#1c3dJB=+rrNJye7IxtpU-u%@6vdhy)!; zBTO)8*gNA;e-!<0ai01Cffhhpk?mZ^L!CP_gdoJ6H-6YO6w5q9iA)T55jlQJC4su$ zGO4JzA+(<|=Y}&P)Y$B7G!r>iu66gck9o&z(x>uNWd7x#^to_|{ zgy$(xU-81+aqD!K0gPkfh^yJ?S>{ORiYzIi!CIe103HRi>t^rg@C%?B#JjZXqNjLKgMv@mADh5a{L0?@MUarNMT`ywl58dAAm!2P^ z6LBK+X|AM^#F#CJm?1TYr)UyC8Eb-N|2f)b??QLIFe9m*La#x{bd*#vv9foQN z7dcS>sg?XM7;drlJw6i&&+T=(1ZDT$FYz0i{#&mMqLh;-mUM>Di-hOYqtnofDTRcd 
ztihVJnw>EZ35iz@K$?rDPJY?2-d#yMWnYVwm7vvO5UWlWEmtgBNwQ&MvrV~}BZ~Vi zw;T*v$Z<}3yE%MspJ%u^cpf8F^LeoP*k^u=K|oG_lOd{tLvLfvPn($N#s6#XEraT4 zy0&dRxO;F9n&9ph+#$HTySr-$1lQnBg3HDwxCM6zZX1VS?~v=c@ArADzMtREt16Y+ zAPp^# z(?R!XOI>-lc%sR(e757>eeYYpEJ%Y8conpZqy4Y66C+;(Ff;A8BB)}Hk5EHJTOKmQ zs`Zve7&(F_jh@F{cWP@mZSYP}xuSQ+Vi0fe4&@B0UPG-yLwM8iE5|loAaD`^UcPe|;(x5SAUdwrXMokFAt zl*Zs)-IC`vl~u#?$34;q<)v1?Hb+Hv48P?7yXVI>p%2jB0$FGbf&?M~xBxBf7vmt=={$eEDm8qtoAm*)niWL5dv~=^YAk|0r z6Fylqdvu5^I5cANL9bMW6Ohaf!?d$OyJ}72`|Dtpyi>ljZi~ga6Fx(kko(yU5KUy_ zo8JfJsJ_TYhlOoVj7GGDk1z(>lf*>)u3Dw({9idFhv$o|=6-_!oGzw{0v`b!%i*$QO2ign z8YHVyWo<@ePt8H5POo`o4PAsav_T!1?AvhHNNn))>RMp5rmt#VCRMxfF+U2|*BR0< z3bWt1Kh|wyFZcH3Wic6x;Zx#H86tGOdTtd)iEt0H%jKSqOe$kKkIMj%nSb&Hll{5L zqTZHL#e&hVP$}_;lPb*TA+5L4+`PU}vNA&M0@ukOp{Y_#_tWQSO8pizD#V{UckEmx0IMxF2lICwQI0; zJ{tSFV7`^HY1PImNSr>HAW|ClqWmE{_`MR#!z~)3^3JEwtsl~f@G(qw zVt-d=_1DAi{X20-(?5aC^&TNM`D8M0vdw8TR4Oeo(gXkE2##-oWx&eZwk-=nb0UPh z$!2a%idfyucfYso6E;{~-kq|0H+IV&xQU)4I4rGOm}glF)Df^noH09&xjCzfMsKDl zL72z28|Zqtu(`V275ZuQKcy*ylbh1W>e;7tOwsZ{1*73;J`Q__10|Lb z+~hoKy{c#VF@w{-GzNK=`U)z?-aBU`#coTg;Ct9)o?3ql4sn>tVL0j_LCjC65x%zA zvv$PEDp{$0mQcSHJOlsLYLnD+6DfpwTS%E;F8wSaZXf$TT5p||CYwy9CIzOlE~quG z^~z!pL$833>%+y7L+CJ^$@5h^?XR-hWnTB=W;t6bL$C54Q`;>m6YpE%TBpmsQuyeD ziMdJ+m4ZZeI9loBh3{>yw(I#WT*(M}Y}nR69Eq}NdfsP;G`j8{w>0zn++?k=(W#~| ztN(DQLE9-h58UbKYDwWH|E`*P3cMB771h=|GTRDsBZGbeiEKoc#^)MmA%gd7g- ze?rmHK!QAJ^SX-X?BH)w^tJTrbTwFtITCbRG1$zdvVs51%@a!!$O_)sTt41ayk{Gi z&}swnI`!Jz0i1%a_c9Zne+PgyAi?t9`<6vV3aXk0*v!F$SU^sD;{>XaN1a}s=z}dd zMc&43i9w`F`|EHLZF~}6j`%D24BmaY4{wl&5hv0tTrCV_|6tOG{?5ev7YU_#9}@xk zJEO4g&iBUcagnn3>v%3Zx-GsRM|R~UVwqFrb{QaCa*V{_!o$L>u!M0`%KIv8G?*N& z^r10WVCQHq2^ENN8-0gFiFhK!Rs^tqkKXtE`l!^1Jh-H{_BG6BI`SYS{O5zgz~GQ+ zIt3XYum_l{hf~z8+|>_GlH4DgBad}Bvia0g1C);jZX}@aw#f(E<#rO%S+@`o>@T#p zvLJ8Q%=h;7)pA0qA9DQpF-a0|?Y786{-pX<)kg*$}vx7T2dco2%AlW*m3CJ)qtEdCmg;1I9)bXHog! 
z*U+}bL>8hHTWeHoTN)n+fF0}K6fu~6Dwq!pLkM;*@BCzUypO?6Dz8K1P4D0CNdXsZ z6euFCg=#%RmPKdO(*g1x%@E}I_=T!fXL>`T2&Iu+Thm;O`yaRaQX<_zEdRoHiub{d+x}5L^)B=XPm#MY%!kc z?fK@PX_n57SRPw~u|!I~n@~9Y?XgTQCtK>J_?lZ!At{^%b!^Y?xH9h<5Ex?-@fhB< z(};~g$PyrZT}s=zi%J4S86CbmGQ-L5SPK;sh#gfr&8t!%O6YZKw@t$V-OYN74;Q~3 zVI&ONkxGn_3>6XJ33(uA<7Zfh#dr0abL&7OYuUHMv7AR1%M^pQVs5X)y+NV+( zRiD=io)RiF{h{JoCJLuc!!wsjPI!}0fUpjj_yINwq*pK2U#RsO^!@##fzZ`uag*gm z;Q``pCi_>0JQ5tO48qsyYW&z}}LWGcMiGS;SuSv~Pu@vP|6FXOGXQl(JL^dXy(R}=^Mgb|1a-TS$Y1F@ zNSd+OUEJ8*uBZb6`%E6PwP8SHZvC>#}Lxeiac(HkroFx=c_*CA}BX&1IQ* z4hK8ppUZv~fjKK%)Rq0WW&fWK=r{;q>>j{=E;;@0@Bh?z$zc z7SZ9CTR*1?fTQu)5pjQ>MM%Os2zY`G2k{J^(pj;S|KkM!V5gtS6?0Nmils3`jZ9}% zi^uSJeX*I>OEt`nOE+0_AO|sD?{}m?;aV;M&NeP#dD;vp3b+v-;Em@3n5)P(LHd{1 zrUo>+xLP8Cqx7$p5CH$0S21IV^k54t)jK-L5BbxU5t%TM`w!W{;?e(KNc=nySxIf58vh((L*MrHP$NKHzrBZ`tvFk_rTBtn&H(Bn0 z&7S3kBZJnfm|U7HK^M4U#JlCoUak(n0M6Csdg)%w z*6=Tp-#MSAV(twegwxc;mV=mPM-a#!F_Vig(W9_F2EKYf+sLPNdT zQDZdDuni>vknu)`@o#ghFxuW#@d-l-nJrffXHo|uAb^DNx>`6d7>v&r3g{*SoGDAx zD^!LmW}2+nEM5Q@6mBG{6);ec&!C^}^-JFB9JiIOT=-t-iGbPXQT#c1I1FYZ5wR*9 zk7_T#B^r}$HB2m0{%?YP8CZdv@u}cSzal?Dxo9N3&R6)=uMW?9E~bOjJtLZxg-}OA zlE{gH;a$xMG$ca)qT=;-Pa*)O=ksS$!0pe~&#CRX>X;RO#HH%7Tn8r2#@OQUh;AUi z8WZQ%dbjVf2t>q>Jnee#zRM)=JANdVNqw-NsTEAT@u{drW%cz;51Atu1a908HG-u;%watNl zzCR|l_F2Qo#QFZhEZJ5Uv*=wn%pr2K+z%xr!r*{n*75>t8RSHKZTz1m~kg5RIX!grF$mbT_pkCT& zqAkVbeO9G>oD}8Mic(*`Ay~`d0j7!UBbm#4^k3$*H9M`Y1jNDcLKIth?Ae|j%uF6L zKwyA^TG#pf2dPuFxNus=2C{Pc%@5lKWXmEH=LM)N2x_%qPT@sycN;N6sw#mZyTtUc zzroE0jOAY~C(v^?vbz967i4{{?Q}L~t!@POYU9Ir`hC~{WPZ|ya_BQQ_d|VI=*6=X z$c*~JyU)qxuo72~p%DxEzC1&xIdyk?0fgfQ!kUe~xc~j^{3{RJ=CKYecANkkqtWBV z+=}ZPoh^7#gBFUoh*384v7o~fa3UojHx>UAihyN0_ND00TlMua1I;!Gq#?k;QAgSE z(mW0chjHW69YEa;#1gY?9nF<}B^PVW@0H9J@KESGUIOib2F4m%GB~XY`4ILeHz~mZ zuE2T=2ylYfV5C!-AB*r@+)jQ$E|1lF{g?Js`XAa;gHgNPbz%+V`66YirKOc0eE)EJ z@Rj$Ptu^*ynz`Jw?FgV*=^8z1PL=w8($_;a%cS$#(HL*_#j%-fNiFHX*XY-wKBxo; zx=Lr|;ib?`A+h|XQ~x2JB#9zGoKjGf$>|_c4PeHe*Koj|@W&3M_v1C1O*b}XGDEGd 
zljtwoIl15_hxMSg&h>uxuYw?NL*$=VjWBtU{I2`8bo2qw?aw2=A2i1S2LF$YYK1up z(MVY8Bk18K;QE>iz$6mi1-v|Y(A7E;0Ihht=C^m`KoMRcXGm}8Yn`V>td1n+^bpTN z{;2448*K#-yY4b!R$)+rJAjO8EsoP0j`}MH)aA76z`KPFKV3mBKipA5#N_&(ojh`U>^ zOLDm;{DaM|$YfD{--FD<4X-aNsaH^#7y%Z$dfzFwmZ`=uyB*>z2nSx@jzC^0#Yeb` zig0Qm@!R~lOVc2LVO%L>4RNUg5G$toRMjh4`L$lK86Z{y@i6@IX9?MqvFs+{W&+$R zFfqOqpU2VELxi4CJmClN2yCgT1j0*_WGS}!S|szvF+c7nP;X+; zs0W{t-r6poufu&?_kWQ0i894uXYPeC8W`oF9E|PH{KAN9f1ynFEL^h}p?5S}M(scj zfyJb}5z_g*)aD_ik-OC9DG(BEvF9Pgn8n4AI`}pZq}JOz6C8z)0~o6w0`+?hOoAL< zzZ7(>rr9e%-QxELc$uI&DM6Hl}-7j_Pw>QxESt2rW`k%qTUD0C-|}*u|#)|wvC0=?lU3C zx(n11v|;CfP+a0=Acj=2ScDAyBjc!u-K>7^vWG7@31M1-HmnAjvv(DIB2Y0#njrGR zL;FA=NoJQpYmRR0C{1x?Q=Idb9-ZmMbtgjLVWsUQl$iM{?T*iJWAA`%5BYe##D8_^ z2h*HsSDov4JM84S)tL5qt$_a))iY4JjFfsBuW207&LbYE3Ayn=qy~HyAMI8ic}SeJ z6UD2A|NL45M&z)=E2Cp{h*(39n^4i^N@SK4GR?{77PwI1p_>$Rg+A23~qnbhCp<7e~(a#QdXz&RP|_$Gw$WJ6vpA-!P&C z(8~3D8?}G}v&&*F@o(&fK|ycTm|D41((%r;FPN~4830Ap=O*wHzvaFoFqqt^NEEr* z%wvbK=yZraMAO;yEqZyT6)ur9#JnXpV41b`C&o60sU4CfL>QShLfbdL;EvK!>7vbH!Pv@rINY2CU zt-{|z5zajm6Y&w2O60$ZBkV9?)G9TcMBY5sH~CDgsrbCE80g#ena>zlY`*>s5V(_N zXyG6bJf6kV_;xWB8Xb+zT&Vs|6pzvdGW`|5?Npr<-1FV^%1RGhCtl#BKvKnLwu05vPQ}WA5wzF2ot8(=KfTQgT|>L>J#?!`YqQ1 zEXPH&7uv1OWFfDSHd88^eeyWWjN2sYI!#Vvj$f&mzl_hhB0T! 
z?U;f;wo6*7jnCmFD7Q%UFpI|^?Y({SN=|qJ=!naIJ^ts~FyJ@|P{M*0^3kZG; zo86doF{9yV+2b&JqH($LEHJpBucE)N3P4T6Rje+49W?j(hejop7_me(b6A`P1F$}2 zEEgEabw;1~!(Qz1H9K%|Vhi7f>HvkvEFQZb(lrk(?5aq3ED}`uDPV~v90kAfVPaG*g0QsRp6)vkThHfWc)e5r^DeTD|mw=B6}- z;bBRZWmhiaSACh8RtLQ;`W<9EA>eFtv{z1uIFiZ1+$&4d?lkFe^b5w^C* zqDhRhAVVY)v$&Z^q9pAir){LE3V7evadu0j1AZ%JsE6G6o$SmdxvYDm8R2dBNH};I zsb6y$Z`eUsqxS9AIFVp(9Bm(8o-}@=tY37SpMvT{1{BkfU1+OD=S&052sw#koRy}U zKTw{}P%^U^v;>#j#(t^;HV4l?L3rMzwc6M)q9DrQJW2jY`jLuXvL>gb!qU$RKB7 zo*y=9T4_=MUET&glxGn(>31W3ShUk`CzIT2WxC3tn$>^gZfBJT-|(?C+UqC0L~Fw1 z(|u*R&8R2f@u%1-tkdpJqxd8#L(% z+!fC{R2AXD=imA%$exi_bvH#Z)@mdZo}?}HYsFJE6!dr-0vbF+dt@`G6!B4xS3Lry zCmXLH9HUtpG$TVp3#5>P1n_$9{E@f_awLpgCI>IgyI(xIq3-iRqQL-MB}XVEsogYm zc_yxpqjtWFQ1?!`gS|BI+cO}Qn^qLWNnMgJR#HGA{vgxWyI%=pM}dW#v~BwFwUMTi z$6Qv+!Z~e^`x74Nv?j9wRLOy#w;w#LZe4L)IO<`|{)kJa2Gb4+v!z0QCLraTNxj7D zb~2RD=zN(9{pcAJ=)Pb$lfmwdXC5LQYlNI&1uvKVJlTT3R;kVIU|fq{GsLKfqMT8d z4=*or=8r*1>SMh-kxmY@3knvIf>x!xqbV2l#|@WB206;a;@|8ldFSlKvq5m|fWu9EuOW!IRIvz1meM2b+{qPDq{d zfLt_XiJ?!pN8)lX+HYsUmRW!#x@RXqK1hNY2hH?d>*z+l zZ}HY!ofh-$81ggDRezl802u;|D4l>wKx}E6Lr?@cRYvRKlRrcnM;vAs`!jg{Z0>|N zhsOoifTjNrnt`#=x1CQJFZjpYfnnIw!kX72Bk)Pgb}4FQDjx;)J#umAJ}Wj+LoViD z(myWDJhoh-3AjWB9G1$=D}Z%FYK9XSeD4Al-gMI;Lt4(2t5-2Pbf>a5;sQmfinAtL znr*W@z?;}uifWYoc=a?wabHPq2oK>?!8eEwtAJM9#9#iy+TgDw3LTOO#Ot+Ab-rQI z9*5IUKqqo-W-a0jBtO@<=8{0Ro!QWoXm=KiNaInZv6HDY8c`KfGqtA65ek5ukup7A zY6)?#tNQd^U?%em6_{uLU0^r9sxu+uj3KM4I)mud18FCcRp)I?q>ib{6-n~sI97)pau&==F!;RNswwL0{Yu!d$$_SK{q1ZPHIk8~E6u>1In67-rJhul>FcMT-+iAdr>4)cK7F-U` zG+3nt(QZ6sL;~)}ZkwUa{4EAwzK;92@{%yy%va{Ci}#4sNGIWFIIgj?NXIf|^@ydE zHI9_~!RCcs{h>c%@Y^_lzjEifW?Nb@SG^vnxPQ?gl>+1;>ak4>!GRqP>H%T{J}B%C z)0RWiig^?jk*fA{Lew%-{u)?a>>Ud&#uBx!n)GqVQ9#tE6 zp&~0qmhJ15&1^I}Kpm*lwF&7r-vKr_XTK_-@%S&U^=TV92o$m+J-dw4ysSUFsc)#z ztB<7mja@Pu8?y2nPAb9Fs$*#+4cKvlg4J(@!#x)i4;v6D08zn-*9FdvT%ZFoH&-v4 zzE!Lyx1{yeC6(qNF5!ghrqy{1s^SLe?J`97FNxAId|)w*W<~}T+5S`C`TWX!{6tn$q% zW_aI2$mLHVTxrnx-5@tdTR@`FHLvYljQVuD)GBh+B^-1Km__rDC()u*sgFsCa4ZS1f<<_#S>^ANsL-6OhZ6 
zpb_&p?@wZML6Y5}VqJ)NEfx7F-Z3VH)9NpWQC1tCzX`Czp8yotA{m-7cpO`oKW(fi zSixnR6^=Dx%1aqViWYk?x6~n#xAal1Jo^UtLLQ*u16ojAQ@chLyfTo#&}le<*gO}2 zPzVFRkvee9h>!EZpK#o(x!!S*hs%0Pw_z%sE2ttfDU~H~J4u86vLUMgPesCI2d^+JL#D$8(OSSLa{~(s z>c14PmdZ9XiJGR8;pQozA*8_8O3*D47`^j+AnyUz>r9FV|4_WJs08HD9sJ5Zu&9rK zy$-rFpl=OpN1_9q62zb!Cz-Wj)khd$DToY*+!1L{n?qG+nRFVvWh>fotjNqq@pG)e z;Vcw>AZbtY511im)4*aWu}}I^+N?w|=9U>yxWF?xK?}C?b7k&d-ZiI?KKiW$P-wSv z2w?0y^i+_CP?U>Cb5*Dl@cL5O#^l}U+zcV!HHbX6!y@&s{xFm2efL9S4G2Y}Wxp#b z`N)R%_E02eatG*1N?t?6cIM^w7|afToy2Vi)%>v?yNgnNU)M+7UMFP@GxhBg7F{hB zV$Fl%jvghXPI35rbUSK4aH^<2fA95I^uhb9_$pC7W4R^|Q>r8eK=Q)I3?1VnfnR4H1kyHC>-b)45!H21OQOTyWhRyd) zmI2=3itUe?-gI8D6{kVSGqp|Y(F>N9cJK+@6tQAV{FLg0p1^Y} z0-oFZW^*h=Mk9!ru?=H^K);ncrE~4{=>>_%@AdXi#U!h3ud02%qeP@}u{xnt`Nol+ z&y73p1WXsC-$DBu6}8C`{Dbkrg8Df13XwZi($J0? zOw)!nD`4`F$CCLR<};z9nzrcD=tJ;uD9FKO&o6caHU^i=?qUs)&Kt#1pIWar#iW@g zOl)A$evFb{4FK}JEXM7G&0gv(hcb@O)cR{cal1;TAJGd57efbWlbBKEyOTe~K&&Im z%`k%(m|S3*@fmA@mE%z}AC$=|;p_xdz#^WfMDJLBy1Wl%YS3-q6?c3Gn=2BN>Kk0A zVH92Z)Ea1zoqMrIY7Kye#m&Ap*_st|Zbge7YbMkCMm>kk*cEjrVZ$;Ia9F5OAfM&3o^ zkodQT6e#|scuB1cgONuBH?7MZH-ohkV1PI2MA3=}dKF}mYZO8`KI7<`pu!C#4lLyX z6IIJqkS!5HXmD5b;OX+Y-c;&5ny&<2&AMmVKwE_7Y=YBbZ!cw~8(4kCvszl!I0jKB z9NMa|k0B1WY6h6Vv>UA%qo=VZ#m>Uf|MnypYg7`4L$iti}WHa7WD9=owI-pU&J!+MAs_+K}CHU(s=qCsZz;Q(hGq5hbKACCc*Ki2-YghLio0zEo${Q5bMnyisKD$QPx_vC(4rzT*z$+oWterRLM z;yR;Y36n8&$Gr}y*!Xz2H!9_geqoZu#Yj1mzR)Oy%(I*=7FcPbq@JrO2uH*C1fJ&b zEX5maQWSEBU=i%{PCr9*z*?SBX4kGjsCL>4!zdK)w$SR z-BEIgaB+BmvcMT&*pT1F>%a9ozgLPM3x z{mC-F4vPMf4%lW)wvxr2wcTMMW?b4pKw|Cf?p#57=Z-;vX@OzvMW#*l%J+)#f=gWg zfqSu}=w;hh0kNk-I;WU)|I1&~5b4+28Rl_iGR!xoAV749n}avWzhAP^(5K1wJ%dGq zeM_xCkyOX_UWUl3{n0sIo|-{O=Oq34VrL}a*=vGLa2oLLr9l0uQ*XL>D|CGEl_Ym@ zr&YC>fY8iv(UTEMJuDrT0TQJQJ)M1S)gidDs%C_S@+op`!lu&RVX%XI(3EZr8g~ zz{$g^Kgo#5gU$FC{-b;@kB%hkUm)NH6WEeNKZh|6W<>!)unH!)Ck(Mzh9E!j9I*%h z!6-;*XfN()Yu-`D_3_MJ5SuT6mDhdSRLz5|5s=D&8ugv`ed0S1+=iQkR)FyBxkx5z z&z(W$D=ztICyr2S+G-mks;E4YV4K4O}QfnDi6?UZ?9IWCbbj0M)~4EmG? 
zc`n|7*fATllzUpg9Y5w=I$%0+ z^2c&z%jYzXRuV12)7pU>9?9v)Kt5J{`&D!W#qcp`Q;H17ecu>q&*vBDwR)u;LODCpA^Mn7W!uR1W)%fu zV@t8ME&t^s!I=D?ix;wX^4p^JoE7(w&Ta1>NkI$dd3LuzvyFd*mW*HFM2$M}C3C_x z+N;gxtk7F*D{&R-kd+#CU+&3VEKo)KiXxAo0_2DXn8x=d;sy_;zs-*bcQ%b^P#A-i47HIH1-^4o!Q#ytb>HiCiVlMhGkbwmax`dk6_g?b+redUJO zuPd!?*W0+C-`n{UCKdMe=WJn0eBifPxl-0;oc7ntwtr0V`sNX8lr?1M2n zXUPe2D*u(X>JgJL?8Iw5ijAw-;!c}7o(|f}@u~0a*4$c(5u8vYtC0rw0-NM=TLy%(qmX}r+-%~ zv*Kcdrwu$h*<=C~RHc7kJp)u`TUWtI3I|V*hK~KB{!}|(GWXk4lWjD6)HDO<7ifbt zC;L4PkJdJIkE89H-h-L(R`ciJ1+tE(cr>(=y$CI=i1m`*?AB$)`Rg|%;Hk8Z!x)2S z85f>@ZbOTw!DDeo*6UH1jCZ}1;xX^2cy9?v9gN3DKjHNc-|=(W+)&St>-g-?rRdo6 zTq3cJw;5_Y(iO8g6z|*HFXyn~h$HPzfPVjA=zLf>iXiLvV>-D+HXhTfynG2#wL7z) z?winMdujb?WBfXd$?VD3XZDK{g}YqN<7N4sddW-Qc8Q@4c8;TjJAkfJUoM9C-S&pz zQ7rV|Z?6#8s8;=aUMl-xTqljZomFt>(tb$;T~CzY^rXoZ(<|4wXfFQNnA5Be>zzP) z=5@Z)O}a&&R0d>uBW;^s8%tcyab24oh0dpo?EaaF%==$6>$1zXO{?yy`j2>c;~mXB zG2qp+!=s(G>E{_oV5mD-dvNl6>@&ez`%|_1*v%dOUBH}em*#96giNQ5P4u#*DM`=T z!OHXWD{1*FsDd1IqxHGfYcD&%2n$OpYd$A!<+R0xTp>8#fz|$#r%okM0snf!6LP}o zB)o6tGUuN@RegNj^E$eUl>HH@?jEWYUrZ7!x_*M>ORMULQ*|v8){RJ~@WlLVchT4(#%V$r5cQArhhx-Z(*E z9~ADlL*@CMcNbzC!eu| z;BcxrGdsL~mnC$aUv%65TgK*A$o1rz&!g>m2k0ZGo2(?eD8Sr;i1Kq@Z_#)2k`rtm#%&r4C#l`TpwbhvW)jwDBIb?#P#AeLT*1S(X3HB$W2fcTJvtU+y~h%By#>a=t^TTf=VV*S)v{k|{Fm?! 
zudlyvP+F?kP~SWc$j=Y5Olo0CiRJnp5@c)k*g2%F4K8Ksyw1Msu#Nk?os#Q-V-^fN zx@QT@7Px{tgk_w8fYo)s`MXcM!@S2w%NjE)JJ1LMK^6ZA=X+N!`Bck;X9+WWxXq%( z)9=IsvKJ;1BqY7uoyS9iBv$iiJw16KxHqTY`?|4#3%um!@#u@(1a>2{-Wxcq@i~_I zc9+D#jEU*c$zb65!TIcDosWK(E>`jSEn|ZRN8hhwrz~4JMtoYu{_gxlmHv5z)z$g( z*$U0dyOQQdQ6lk{!L$$O75Nco=Bp=;0!MF7QM?2^mJ);x@6Ve3+h<5ltxdk4tg8MY z_5~Dfonye9QY!)pIR_VID;x*`2?b37Ub1COFT&rom+TuR86h0kL;BpJ% zN!9auAdcBm9w5+h8XYNqb`%hV0-hW2B)iP%Cr>m#W|l=n$iJQ_gu2%B02=7>Sl{}s zP($3BPAv+sBau)$KE%9~j~{MpwbzVgckVC8WPUBlc6Q!VWWoZ@0vrefZz}^etC+Wt zTq%J2dh&hjy8$i-yYN90mn*ml$m90>oiU}2O{tXrSt=9JV_?s4(=4X8Rrdh&)Z^GA z%tpKIL?O`3DbX<)m$s);<#GO1SDlkY zjQ?q+AAt}IG5G(vy}y_F_ZyM{XRRKU6D8#TT>O8&H&7TC-Ovof https://skyzh.github.io/mini-lsm - 2024-01-25T02:56:28.231Z + 2024-01-30T08:17:56.731Z https://skyzh.github.io/mini-lsm/00-get-started - 2024-01-25T02:56:28.234Z + 2024-01-30T08:17:56.734Z https://skyzh.github.io/mini-lsm/00-overview - 2024-01-25T02:56:28.232Z + 2024-01-30T08:17:56.732Z https://skyzh.github.io/mini-lsm/00-preface - 2024-01-25T02:56:28.230Z + 2024-01-30T08:17:56.730Z https://skyzh.github.io/mini-lsm/00-v1 - 2024-01-25T02:56:28.256Z + 2024-01-30T08:17:56.762Z https://skyzh.github.io/mini-lsm/01-block - 2024-01-25T02:56:28.257Z + 2024-01-30T08:17:56.763Z https://skyzh.github.io/mini-lsm/02-sst - 2024-01-25T02:56:28.258Z + 2024-01-30T08:17:56.764Z https://skyzh.github.io/mini-lsm/03-memtable - 2024-01-25T02:56:28.259Z + 2024-01-30T08:17:56.765Z https://skyzh.github.io/mini-lsm/04-engine - 2024-01-25T02:56:28.260Z + 2024-01-30T08:17:56.766Z https://skyzh.github.io/mini-lsm/05-compaction - 2024-01-25T02:56:28.261Z + 2024-01-30T08:17:56.767Z https://skyzh.github.io/mini-lsm/06-recovery - 2024-01-25T02:56:28.262Z + 2024-01-30T08:17:56.768Z https://skyzh.github.io/mini-lsm/07-bloom-filter - 2024-01-25T02:56:28.263Z + 2024-01-30T08:17:56.768Z https://skyzh.github.io/mini-lsm/08-key-compression - 2024-01-25T02:56:28.264Z + 2024-01-30T08:17:56.769Z https://skyzh.github.io/mini-lsm/09-whats-next 
- 2024-01-25T02:56:28.265Z + 2024-01-30T08:17:56.770Z https://skyzh.github.io/mini-lsm/week1-01-memtable - 2024-01-25T02:56:28.237Z + 2024-01-30T08:17:56.736Z https://skyzh.github.io/mini-lsm/week1-02-merge-iterator - 2024-01-25T02:56:28.238Z + 2024-01-30T08:17:56.738Z https://skyzh.github.io/mini-lsm/week1-03-block - 2024-01-25T02:56:28.239Z + 2024-01-30T08:17:56.739Z https://skyzh.github.io/mini-lsm/week1-04-sst - 2024-01-25T02:56:28.240Z + 2024-01-30T08:17:56.740Z https://skyzh.github.io/mini-lsm/week1-05-read-path - 2024-01-25T02:56:28.242Z + 2024-01-30T08:17:56.741Z https://skyzh.github.io/mini-lsm/week1-06-write-path - 2024-01-25T02:56:28.243Z + 2024-01-30T08:17:56.743Z https://skyzh.github.io/mini-lsm/week1-07-sst-optimizations - 2024-01-25T02:56:28.244Z + 2024-01-30T08:17:56.744Z https://skyzh.github.io/mini-lsm/week1-overview - 2024-01-25T02:56:28.235Z + 2024-01-30T08:17:56.735Z https://skyzh.github.io/mini-lsm/week2-01-compaction - 2024-01-25T02:56:28.246Z + 2024-01-30T08:17:56.746Z https://skyzh.github.io/mini-lsm/week2-02-simple - 2024-01-25T02:56:28.248Z + 2024-01-30T08:17:56.747Z https://skyzh.github.io/mini-lsm/week2-03-tiered - 2024-01-25T02:56:28.249Z + 2024-01-30T08:17:56.748Z https://skyzh.github.io/mini-lsm/week2-04-leveled - 2024-01-25T02:56:28.250Z + 2024-01-30T08:17:56.749Z https://skyzh.github.io/mini-lsm/week2-05-manifest - 2024-01-25T02:56:28.251Z + 2024-01-30T08:17:56.750Z https://skyzh.github.io/mini-lsm/week2-06-wal - 2024-01-25T02:56:28.252Z + 2024-01-30T08:17:56.751Z https://skyzh.github.io/mini-lsm/week2-07-snacks - 2024-01-25T02:56:28.253Z + 2024-01-30T08:17:56.752Z https://skyzh.github.io/mini-lsm/week2-overview - 2024-01-25T02:56:28.245Z + 2024-01-30T08:17:56.745Z + + + https://skyzh.github.io/mini-lsm/week3-01-ts-key-refactor + 2024-01-30T08:17:56.754Z + + + https://skyzh.github.io/mini-lsm/week3-02-snapshot-read-part-1 + 2024-01-30T08:17:56.755Z + + + https://skyzh.github.io/mini-lsm/week3-03-snapshot-read-part-2 + 
2024-01-30T08:17:56.756Z + + + https://skyzh.github.io/mini-lsm/week3-04-watermark + 2024-01-30T08:17:56.757Z + + + https://skyzh.github.io/mini-lsm/week3-05-txn-occ + 2024-01-30T08:17:56.758Z + + + https://skyzh.github.io/mini-lsm/week3-06-serializable + 2024-01-30T08:17:56.759Z + + + https://skyzh.github.io/mini-lsm/week3-07-compaction-filter + 2024-01-30T08:17:56.760Z https://skyzh.github.io/mini-lsm/week3-overview - 2024-01-25T02:56:28.254Z + 2024-01-30T08:17:56.753Z https://skyzh.github.io/mini-lsm/week4-overview - 2024-01-25T02:56:28.255Z + 2024-01-30T08:17:56.761Z diff --git a/mini-lsm-book/src/week1-01-memtable.md b/mini-lsm-book/src/week1-01-memtable.md index 983d22499..39fcfdb45 100644 --- a/mini-lsm-book/src/week1-01-memtable.md +++ b/mini-lsm-book/src/week1-01-memtable.md @@ -23,7 +23,7 @@ In this task, you will need to modify: src/mem_table.rs ``` -Firstly, let us implement the in-memory structure of an LSM storage engine -- the memtable. We choose [crossbeam's skiplist implementation](link) as the data structure of the memtable as it supports lock-free concurrent read and write. We will not cover in-depth how a skiplist works, and in a nutshell, it is an ordered key-value map that easily allows concurrent read and write. +Firstly, let us implement the in-memory structure of an LSM storage engine -- the memtable. We choose [crossbeam's skiplist implementation](https://docs.rs/crossbeam-skiplist/latest/crossbeam_skiplist/) as the data structure of the memtable as it supports lock-free concurrent read and write. We will not cover in-depth how a skiplist works, and in a nutshell, it is an ordered key-value map that easily allows concurrent read and write. crossbeam-skiplist provides similar interfaces to the Rust std's `BTreeMap`: insert, get, and iter. The only difference is that the modification interfaces (i.e., `insert`) only require an immutable reference to the skiplist, instead of a mutable one. 
Therefore, in your implementation, you should not take any mutex when implementing the memtable structure. @@ -39,7 +39,6 @@ In this task, you will need to modify: ``` src/lsm_storage.rs -src/mem_table.rs ``` Now, we will add our first data structure, the memtable, to the LSM state. In `LsmStorageState::create`, you will find that when a LSM structure is created, we will initialize a memtable of id 0. This is the **mutable memtable** in the initial state. At any point of the time, the engine will have only one single mutable memtable. A memtable usually has a size limit (i.e., 256MB), and it will be frozen to an immutable memtable when it reaches the size limit. @@ -67,7 +66,7 @@ src/mem_table.rs A memtable cannot continuously grow in size, and we will need to freeze them (and later flush to the disk) when it reaches the size limit. You may find the memtable size limit, which is **equal to the SST size limit** (not `num_memtables_limit`), in the `LsmStorageOptions`. This is not a hard limit and you should freeze the memtable at best effort. -In this task, you will need to compute the approximate memtable size when put/delete a key in the memtable. This can be computed by simply adding the total number of bytes of keys and values when `put` is called. Is a key is put twice, though the skiplist only contains the latest value, you may count it twice in the approximate memtable size. Once a memtable reaches the limit, you should call `force_freeze_memtable` to freeze the memtable and create a new one. +In this task, you will need to compute the approximate memtable size when put/delete a key in the memtable. This can be computed by simply adding the total number of bytes of keys and values when `put` is called. If a key is put twice, though the skiplist only contains the latest value, you may count it twice in the approximate memtable size. Once a memtable reaches the limit, you should call `force_freeze_memtable` to freeze the memtable and create a new one. 
Because there could be multiple threads getting data into the storage engine, `force_freeze_memtable` might be called concurrently from multiple threads. You will need to think about how to avoid race conditions in this case. diff --git a/mini-lsm-book/src/week1-02-merge-iterator.md b/mini-lsm-book/src/week1-02-merge-iterator.md index d10bb6a93..df6896512 100644 --- a/mini-lsm-book/src/week1-02-merge-iterator.md +++ b/mini-lsm-book/src/week1-02-merge-iterator.md @@ -103,7 +103,7 @@ Starting this section, we will use `Key` to represent LSM key types and disti In this task, you will need to modify: ``` -src/iterators/lsm_iterator.rs +src/lsm_iterator.rs ``` We use the `LsmIterator` structure to represent the internal LSM iterators. You will need to modify this structure multiple times throughout the tutorial when more iterators are added into the system. For now, because we only have multiple memtables, it should be defined as: @@ -123,7 +123,7 @@ Then, we want to provide extra safety on the iterator to avoid users from misusi In this task, you will need to modify: ``` -src/iterators/lsm_storage.rs +src/lsm_storage.rs ``` We are finally there -- with all iterators you have implemented, you can finally implement the `scan` interface of the LSM engine. You can simply construct an LSM iterator with the memtable iterators (remember to put the latest memtable at the front of the merge iterator), and your storage engine will be able to handle the scan request. diff --git a/mini-lsm-book/src/week1-03-block.md b/mini-lsm-book/src/week1-03-block.md index 087e66c5c..9a90f1db9 100644 --- a/mini-lsm-book/src/week1-03-block.md +++ b/mini-lsm-book/src/week1-03-block.md @@ -77,7 +77,7 @@ In this task, you will need to modify: src/block/iterator.rs ``` -Now that we have an encoded block, we will need to implement the `StorageIterator` interface, so that the user can lookup/scan keys in the block. 
+Now that we have an encoded block, we will need to implement the `BlockIterator` interface, so that the user can lookup/scan keys in the block. `BlockIterator` can be created with an `Arc`. If `create_and_seek_to_first` is called, it will be positioned at the first key in the block. If `create_and_seek_to_key` is called, the iterator will be positioned at the first key that is `>=` the provided key. For example, if `1, 3, 5` is in a block. diff --git a/mini-lsm-book/src/week1-05-read-path.md b/mini-lsm-book/src/week1-05-read-path.md index 3c9c6ccc6..dc5d71907 100644 --- a/mini-lsm-book/src/week1-05-read-path.md +++ b/mini-lsm-book/src/week1-05-read-path.md @@ -84,5 +84,6 @@ We do not provide reference answers to the questions, and feel free to discuss a ## Bonus Tasks * **The Cost of Dynamic Dispatch.** Implement a `Box` version of merge iterators and benchmark to see the performance differences. +* **Parallel Seek.** Creating a merge iterator requires loading the first block of all underlying SSTs (when you create `SSTIterator`). You may parallelize the process of creating iterators. {{#include copyright.md}} diff --git a/mini-lsm-book/src/week1-07-sst-optimizations.md b/mini-lsm-book/src/week1-07-sst-optimizations.md index 6bab55de4..7abdd797c 100644 --- a/mini-lsm-book/src/week1-07-sst-optimizations.md +++ b/mini-lsm-book/src/week1-07-sst-optimizations.md @@ -23,9 +23,11 @@ Bloom filters are probabilistic data structures that maintains a set of keys. Yo You usually need to have a hash function in order to construct a bloom filter, and a key can have multiple hashes. Let us take a look at the below example. Assume that we already have hashes of some keys and the bloom filter has 7 bits. 
+[Note: If you want to understand bloom filters better, look [here](https://samwho.dev/bloom-filters/)] + ```plaintext hash1 = ((character - a) * 13) % 7 -hash1 = ((character - a) * 11) % 7 +hash2 = ((character - a) * 11) % 7 b -> 6 4 c -> 5 1 d -> 4 5 @@ -34,7 +36,7 @@ g -> 1 3 h -> 0 0 ``` -If we insert b, c, d into the 6-bit bloom filter, we will get: +If we insert b, c, d into the 7-bit bloom filter, we will get: ``` bit 0123456 diff --git a/mini-lsm-book/src/week2-01-compaction.md b/mini-lsm-book/src/week2-01-compaction.md index 74ed2a48c..c8bc8d696 100644 --- a/mini-lsm-book/src/week2-01-compaction.md +++ b/mini-lsm-book/src/week2-01-compaction.md @@ -15,6 +15,12 @@ cargo x copy-test --week 2 --day 1 cargo x scheck ``` +

+ +It might be helpful to take a look at [week 2 overview](./week2-overview.md) before reading this chapter to have a general overview of compactions. + +
+ ## Task 1: Compaction Implementation In this task, you will implement the core logic of doing a compaction -- merge sort a set of SST files into a sorted run. You will need to modify: diff --git a/mini-lsm-book/src/week2-02-simple.md b/mini-lsm-book/src/week2-02-simple.md index 1921259e1..dbf877040 100644 --- a/mini-lsm-book/src/week2-02-simple.md +++ b/mini-lsm-book/src/week2-02-simple.md @@ -14,6 +14,12 @@ cargo x copy-test --week 2 --day 2 cargo x scheck ``` +
+ +It might be helpful to take a look at [week 2 overview](./week2-overview.md) before reading this chapter to have a general overview of compactions. + +
+ ## Task 1: Simple Leveled Compaction In this chapter, we are going to implement our first compaction strategy -- simple leveled compaction. In this task, you will need to modify: diff --git a/mini-lsm-book/src/week2-03-tiered.md b/mini-lsm-book/src/week2-03-tiered.md index 68f506f90..b5b619e12 100644 --- a/mini-lsm-book/src/week2-03-tiered.md +++ b/mini-lsm-book/src/week2-03-tiered.md @@ -16,6 +16,12 @@ cargo x copy-test --week 2 --day 3 cargo x scheck ``` +
+ +It might be helpful to take a look at [week 2 overview](./week2-overview.md) before reading this chapter to have a general overview of compactions. + +
+ ## Task 1: Universal Compaction In this chapter, you will implement RocksDB's universal compaction, which is of the tiered compaction family compaction strategies. Similar to the simple leveled compaction strategy, we only use number of files as the indicator in this compaction strategy. And when we trigger the compaction jobs, we always include a full sorted run (tier) in the compaction job. @@ -28,7 +34,7 @@ In this task, you will need to modify: src/compact/tiered.rs ``` -In universal compaction, we do not use L0 SSTs in the LSM state. Instead, we directly flush new SSTs to a single sorted run (called tier). In the LSM state, `levels` will now include all tiers, where the lowest index is the latest SST flushed. The compaction simulator generates tier id based on the first SST id, and you should do the same in your implementation. +In universal compaction, we do not use L0 SSTs in the LSM state. Instead, we directly flush new SSTs to a single sorted run (called tier). In the LSM state, `levels` will now include all tiers, where **the lowest index is the latest SST flushed**. Each element in the `levels` vector stores a tuple: level ID (used as tier ID) and the SSTs in that level. Every time you flush L0 SSTs, you should flush the SST into a tier placed at the front of the vector. The compaction simulator generates tier id based on the first SST id, and you should do the same in your implementation. Universal compaction will only trigger tasks when the number of tiers (sorted runs) is larger than `num_tiers`. Otherwise, it does not trigger any compaction. @@ -130,6 +136,7 @@ As tiered compaction does not use the L0 level of the LSM state, you should dire * What happens if compaction speed cannot keep up with the SST flushes? * What might needs to be considered if the system schedules multiple compaction tasks in parallel? * SSDs also write its own logs (basically it is a log-structured storage). 
If the SSD has a write amplification of 2x, what is the end-to-end write amplification of the whole system? Related: [ZNS: Avoiding the Block Interface Tax for Flash-based SSDs](https://www.usenix.org/conference/atc21/presentation/bjorling). +* Consider the case that the user chooses to keep a large number of sorted runs (i.e., 300) for tiered compaction. To make the read path faster, is it a good idea to keep some data structure that helps reduce the time complexity (i.e., to `O(log n)`) of finding SSTs to read in each layer for some key ranges? Note that normally, you will need to do a binary search in each sorted run to find the key ranges that you will need to read. (Check out Neon's [layer map](https://neon.tech/blog/persistent-structures-in-neons-wal-indexing) implementation!) We do not provide reference answers to the questions, and feel free to discuss about them in the Discord community. diff --git a/mini-lsm-book/src/week2-04-leveled.md b/mini-lsm-book/src/week2-04-leveled.md index cbf629bae..371041591 100644 --- a/mini-lsm-book/src/week2-04-leveled.md +++ b/mini-lsm-book/src/week2-04-leveled.md @@ -14,6 +14,12 @@ cargo x copy-test --week 2 --day 4 cargo x scheck ``` +
+ +It might be helpful to take a look at [week 2 overview](./week2-overview.md) before reading this chapter to have a general overview of compactions. + +
+ ## Task 1: Leveled Compaction In chapter 2 day 2, you have implemented the simple leveled compaction strategies. However, the implementation has a few problems: @@ -43,21 +49,25 @@ You will need to compute the target sizes of the levels. Assume `base_level_size [0 0 0 0 0 200MB] ``` -When the levels grow in size as more SSTs get compacted to that level, we will compute the target size based on the size of the last level. When the actual size of SST files in the last level reaches 200MB, for example, 300MB, we will compute the target size of the other levels by dividing the `level_size_multiplier`. Assume `level_size_multiplier=10`. +Before the bottom level exceeds `base_level_size_mb`, all other intermediate levels will have target sizes of 0. The idea is that when the total amount of data is small, it's wasteful to create intermediate levels. + +When the bottom level reaches or exceeds `base_level_size_mb`, we will compute the target size of the other levels by dividing the `level_size_multiplier` from the size. Assume the bottom level contains 300MB of data, and `level_size_multiplier=10`. ``` 0 0 0 0 30MB 300MB ``` -We will only keep at most *one* level below `base_level_size_mb`, and in this case, it is L5. Assume we now have 30GB files in the last level, the target sizes will be, +In addition, at most *one* level can have a positive target size below `base_level_size_mb`. Assume we now have 30GB files in the last level, the target sizes will be, ``` 0 0 30MB 300MB 3GB 30GB ``` +Notice in this case L1 and L2 have target size of 0, and L3 is the only level with a positive target size below `base_level_size_mb`. + ### Task 1.2: Decide Base Level -Now, let us solve the problem that SSTs may be compacted across empty levels in the simple leveled compaction strategy. When we compact L0 SSTs with lower levels, we do not directly put it to L1. Instead, we compact it with the first level with `target size > 0``. 
For example, when the target level sizes are: +Now, let us solve the problem that SSTs may be compacted across empty levels in the simple leveled compaction strategy. When we compact L0 SSTs with lower levels, we do not directly put it to L1. Instead, we compact it with the first level with `target size > 0`. For example, when the target level sizes are: ``` 0 0 0 0 30MB 300MB @@ -89,7 +99,7 @@ The number of levels in the compaction simulator is 4. Therefore, the SSTs shoul ### Task 1.3: Decide Level Priorities -Now that we will need to handle compactions below L0. L0 compaction always has the top priority, that you should compact L0 with other levels first if it reaches the threshold. After that, we can compute the compaction priorities of each level by `current_size / target_size`. We only compact levels with this ratio `> 1.0` The one with the largest ratio will be chosen for compaction with the lower level. For example, if we have: +Now that we will need to handle compactions below L0. L0 compaction always has the top priority, thus you should compact L0 with other levels first if it reaches the threshold. After that, we can compute the compaction priorities of each level by `current_size / target_size`. We only compact levels with this ratio `> 1.0`. The one with the largest ratio will be chosen for compaction with the lower level. For example, if we have: ``` L3: 200MB, target_size=20MB diff --git a/mini-lsm-book/src/week2-05-manifest.md b/mini-lsm-book/src/week2-05-manifest.md index c4d898c06..2361ed0ac 100644 --- a/mini-lsm-book/src/week2-05-manifest.md +++ b/mini-lsm-book/src/week2-05-manifest.md @@ -7,6 +7,13 @@ In this chapter, you will: * Implement encoding and decoding of the manifest file. * Recover from the manifest when the system restarts. 
+To copy the test cases into the starter code and run them, + +``` +cargo x copy-test --week 2 --day 5 +cargo x scheck +``` + ## Task 1: Manifest Encoding The system uses a manifest file to record all operations happened in the engine. Currently, there are only two types of them: compaction and SST flush. When the engine restarts, it will read the manifest file, reconstruct the state, and load the SST files on the disk. @@ -87,5 +94,6 @@ get 1500 ## Bonus Tasks * **Manifest Compaction.** When the number of logs in the manifest file gets too large, you can rewrite the manifest file to only store the current snapshot and append new logs to that file. +* **Parallel Open.** After you collect the list of SSTs to open, you can open and decode them in parallel, instead of doing it one by one, therefore accelerating the recovery process. {{#include copyright.md}} diff --git a/mini-lsm-book/src/week2-06-wal.md b/mini-lsm-book/src/week2-06-wal.md index fb4ebc998..b9500f970 100644 --- a/mini-lsm-book/src/week2-06-wal.md +++ b/mini-lsm-book/src/week2-06-wal.md @@ -7,6 +7,13 @@ In this chapter, you will: * Implement encoding and decoding of the write-ahead log file. * Recover memtables from the WALs when the system restarts. +To copy the test cases into the starter code and run them, + +``` +cargo x copy-test --week 2 --day 6 +cargo x scheck +``` + ## Task 1: WAL Encoding In this task, you will need to modify: diff --git a/mini-lsm-book/src/week3-04-watermark.md b/mini-lsm-book/src/week3-04-watermark.md index 75fc879cc..cc95c615f 100644 --- a/mini-lsm-book/src/week3-04-watermark.md +++ b/mini-lsm-book/src/week3-04-watermark.md @@ -74,6 +74,8 @@ Assume these are all keys in the engine. If we do a scan at ts=3, we will get `a * In our implementation, we manage watermarks by ourselves with the lifecycle of `Transaction` (so-called un-managed mode). 
If the user intends to manage key timestamps and the watermarks by themselves (i.e., when they have their own timestamp generator), what do you need to do in the write_batch/get/scan API to validate their requests? Is there any architectural assumption we had that might be hard to maintain in this case? * Why do we need to store an `Arc` of `Transaction` inside a transaction iterator? +* What is the condition to fully remove a key from the SST file? +* For now, we only remove a key when compacting to the bottom-most level. Is there any other prior time that we can remove the key? (Hint: you know the start/end key of each SST in all levels.) ## Bonus Tasks diff --git a/mini-lsm-book/src/week3-06-serializable.md b/mini-lsm-book/src/week3-06-serializable.md index 1aa1bfe82..232642364 100644 --- a/mini-lsm-book/src/week3-06-serializable.md +++ b/mini-lsm-book/src/week3-06-serializable.md @@ -102,11 +102,21 @@ You can skip the check if `write_set` is empty. A read-only transaction can alwa You should also modify the `put`, `delete`, and `write_batch` interface in `LsmStorageInner`. We recommend you define a helper function `write_batch_inner` that processes a write batch. If `options.serializable = true`, `put`, `delete`, and the user-facing `write_batch` should create a transaction instead of directly creating a write batch. Your write batch helper function should also return a `u64` commit timestamp so that `Transaction::Commit` can correctly store the committed transaction data into the MVCC structure. +## Task 4: Garbage Collection + +In this task, you will need to modify: + +``` +src/mvcc/txn.rs +``` + +When you commit a transaction, you can also clean up the committed txn map to remove all transactions below the watermark, as they will not be involved in any future serializable validations. 
+ ## Test Your Understanding * If you have some experience with building a relational database, you may think about the following question: assume that we build a database based on Mini-LSM where we store each row in the relation table as a key-value pair (key: primary key, value: serialized row) and enable serializable verification, does the database system directly gain ANSI serializable isolation level capability? Why or why not? * The thing we implement here is actually write snapshot-isolation (see [A critique of snapshot isolation](https://dl.acm.org/doi/abs/10.1145/2168836.2168853)) that guarantees serializable. Is there any cases where the execution is serializable, but will be rejected by the write snapshot-isolation validation? -* There are databases that claim they have serializable snapshot isolation support by only tracking the keys accessed in gets and scans. Do they really prevent write skews caused by phantoms? (Okay... Actually, I'm talking about [BadgerDB](https://dgraph.io/blog/post/badger-txn/).) +* There are databases that claim they have serializable snapshot isolation support by only tracking the keys accessed in gets and scans (instead of key range). Do they really prevent write skews caused by phantoms? (Okay... Actually, I'm talking about [BadgerDB](https://dgraph.io/blog/post/badger-txn/).) We do not provide reference answers to the questions, and feel free to discuss about them in the Discord community. diff --git a/mini-lsm-book/src/week3-07-compaction-filter.md b/mini-lsm-book/src/week3-07-compaction-filter.md index 82213a63b..a9a5eea0f 100644 --- a/mini-lsm-book/src/week3-07-compaction-filter.md +++ b/mini-lsm-book/src/week3-07-compaction-filter.md @@ -1,4 +1,46 @@ -# Snack Time: Compaction Filter +# Snack Time: Compaction Filters +Congratulations! You made it there! In the previous chapter, you made your LSM engine multi-version capable, and the users can use transaction APIs to interact with your storage engine. 
At the end of this week, we will implement some easy but important features of the storage engine. Welcome to Mini-LSM's week 3 snack time! + +In this chapter, we will generalize our compaction garbage collection logic to become compaction filters. + +For now, our compaction will simply retain the keys above the watermark and the latest version of the keys below the watermark. We can add some magic to the compaction process to help the user collect some unused data automatically as a background job. + +Consider a case that the user uses Mini-LSM to store database tables. Each row in the table is prefixed with the table name. For example, + +``` +table1_key1 -> row +table1_key2 -> row +table1_key3 -> row +table2_key1 -> row +table2_key2 -> row +``` + +Now the user executes `DROP TABLE table1`. The engine will need to clean up all the data beginning with `table1`. + +There are a lot of ways to achieve the goal. The user of Mini-LSM can scan all the keys beginning with `table1` and request the engine to delete them. However, scanning a very large database might be slow, and it will generate the same number of delete tombstones as the existing keys. Therefore, scan-and-delete will not free up the space occupied by the dropped table -- instead, it will add more data to the engine and the space can only be reclaimed when the tombstones reach the bottom level of the engine. + +Or, they can create column families (we will talk about this in *rest of your life* chapter). They store each table in a column family, which is a standalone LSM state, and directly remove the SST files corresponding to the column family when the user drops the table. + +In this tutorial, we will implement the third approach: compaction filters. Compaction filters can be dynamically added to the engine at runtime. During the compaction, if a key matching the compaction filter is found, we can silently remove it in the background. 
Therefore, the user can attach a compaction filter of `prefix=table1` to the engine, and all these keys will be removed during compaction. + +## Task 1: Compaction Filter + +In this task, you will need to modify: + +``` +src/compact.rs +``` + +You can iterate all compaction filters in `LsmStorageInner::compaction_filters`. If the first version of the key below watermark matches the compaction filter, simply remove it instead of keeping it in the SST file. + +To run test cases, + +``` +cargo x copy-test --week 3 --day 7 +cargo x scheck +``` + +You can assume that the user will not get the keys within the prefix filter range. And, they will not scan the keys in the prefix range. Therefore, it is okay to return a wrong value when a user requests the keys in the prefix filter range (i.e., undefined behavior). {{#include copyright.md}} diff --git a/mini-lsm-book/src/week3-overview.md b/mini-lsm-book/src/week3-overview.md index 0bbfac5c6..500c4f15b 100644 --- a/mini-lsm-book/src/week3-overview.md +++ b/mini-lsm-book/src/week3-overview.md @@ -2,6 +2,8 @@ In this part, you will implement MVCC over the LSM engine that you have built in the previous two weeks. We will add timestamp encoding in the keys to maintain multiple versions of a key, and change some part of the engine to ensure old data are either retained or garbage-collected based on whether there are users reading an old version. +The general approach of the MVCC part in this tutorial is inspired and partially based on [BadgerDB](https://github.com/dgraph-io/badger). + The key of MVCC is to store and access multiple versions of a key in the storage engine. Therefore, we will need to change the key format to `user_key + timestamp (u64)`. And on the user interface side, we will need to have new APIs to help users to gain access to a history version. In summary, we will add a monotonically-increasing timestamp to the key. 
In previous parts, we assumed that newer keys are in the upper level of the LSM tree, and older keys are in the lower level of the LSM tree. During compaction, we only keep the latest version of a key if multiple versions are found in multiple levels, and the compaction process will ensure that newer keys will be kept on the upper level by only merging adjacent levels/tiers. In the MVCC implementation, the key with a larger timestamp is the newest key. During compaction, we can only remove the key if no user is accessing an older version of the database. Though not keeping the latest version of key in the upper level may still yield a correct result for the MVCC LSM implementation, in our tutorial, we choose to keep the invariant, and if there are multiple versions of a key, a later version will always appear in a upper level. @@ -16,7 +18,7 @@ put/delete/write_batch(key, timestamp) set_watermark(timestamp) # we will talk about watermarks soon! ``` -**Un-managed Mode APIs** +**Un-managed/Normal Mode APIs** ``` get(key) -> value scan(key_range) -> iterator diff --git a/mini-lsm-mvcc/Cargo.toml b/mini-lsm-mvcc/Cargo.toml index 15213c311..00b2e5bfd 100644 --- a/mini-lsm-mvcc/Cargo.toml +++ b/mini-lsm-mvcc/Cargo.toml @@ -25,6 +25,8 @@ serde_json = { version = "1.0" } serde = { version = "1.0", features = ["derive"] } farmhash = "1" crc32fast = "1.3.2" +nom = "7.1.3" +rustyline = "13.0.0" [dev-dependencies] tempfile = "3" diff --git a/mini-lsm-mvcc/src/compact.rs b/mini-lsm-mvcc/src/compact.rs index 06df389e9..cc52c63fe 100644 --- a/mini-lsm-mvcc/src/compact.rs +++ b/mini-lsm-mvcc/src/compact.rs @@ -19,7 +19,7 @@ use crate::iterators::merge_iterator::MergeIterator; use crate::iterators::two_merge_iterator::TwoMergeIterator; use crate::iterators::StorageIterator; use crate::key::KeySlice; -use crate::lsm_storage::{LsmStorageInner, LsmStorageState}; +use crate::lsm_storage::{CompactionFilter, LsmStorageInner, LsmStorageState}; use crate::manifest::ManifestRecord; use 
crate::table::{SsTable, SsTableBuilder, SsTableIterator}; @@ -122,7 +122,8 @@ impl LsmStorageInner { let watermark = self.mvcc().watermark(); let mut last_key = Vec::::new(); let mut first_key_below_watermark = false; - while iter.is_valid() { + let compaction_filters = self.compaction_filters.lock().clone(); + 'outer: while iter.is_valid() { if builder.is_none() { builder = Some(SsTableBuilder::new(self.options.block_size)); } @@ -144,12 +145,26 @@ impl LsmStorageInner { continue; } - if same_as_last_key && iter.key().ts() <= watermark { - if !first_key_below_watermark { + if iter.key().ts() <= watermark { + if same_as_last_key && !first_key_below_watermark { iter.next()?; continue; } + first_key_below_watermark = false; + + if !compaction_filters.is_empty() { + for filter in &compaction_filters { + match filter { + CompactionFilter::Prefix(x) => { + if iter.key().key_ref().starts_with(x) { + iter.next()?; + continue 'outer; + } + } + } + } + } } let builder_inner = builder.as_mut().unwrap(); diff --git a/mini-lsm-mvcc/src/compact/leveled.rs b/mini-lsm-mvcc/src/compact/leveled.rs index 213661888..043bc9273 100644 --- a/mini-lsm-mvcc/src/compact/leveled.rs +++ b/mini-lsm-mvcc/src/compact/leveled.rs @@ -118,6 +118,7 @@ impl LeveledCompactionController { } } priorities.sort_by(|a, b| a.partial_cmp(b).unwrap().reverse()); + let priority = priorities.first(); if let Some((_, level)) = priority { println!( diff --git a/mini-lsm-mvcc/src/iterators/merge_iterator.rs b/mini-lsm-mvcc/src/iterators/merge_iterator.rs index c4abc8d3e..b1f5bdf77 100644 --- a/mini-lsm-mvcc/src/iterators/merge_iterator.rs +++ b/mini-lsm-mvcc/src/iterators/merge_iterator.rs @@ -37,7 +37,7 @@ impl Ord for HeapWrapper { } /// Merge multiple iterators of the same type. If the same key occurs multiple times in some -/// iterators, perfer the one with smaller index. +/// iterators, prefer the one with smaller index. 
pub struct MergeIterator { iters: BinaryHeap>, current: Option>, diff --git a/mini-lsm-mvcc/src/iterators/two_merge_iterator.rs b/mini-lsm-mvcc/src/iterators/two_merge_iterator.rs index 8488cd282..2ff2ce3fa 100644 --- a/mini-lsm-mvcc/src/iterators/two_merge_iterator.rs +++ b/mini-lsm-mvcc/src/iterators/two_merge_iterator.rs @@ -53,8 +53,10 @@ impl< fn key(&self) -> A::KeyType<'_> { if self.choose_a { + debug_assert!(self.a.is_valid()); self.a.key() } else { + debug_assert!(self.b.is_valid()); self.b.key() } } diff --git a/mini-lsm-mvcc/src/lsm_iterator.rs b/mini-lsm-mvcc/src/lsm_iterator.rs index ad3242447..36a87bfb9 100644 --- a/mini-lsm-mvcc/src/lsm_iterator.rs +++ b/mini-lsm-mvcc/src/lsm_iterator.rs @@ -136,14 +136,14 @@ impl StorageIterator for FusedIterator { } fn key(&self) -> Self::KeyType<'_> { - if self.has_errored || !self.iter.is_valid() { + if !self.is_valid() { panic!("invalid access to the underlying iterator"); } self.iter.key() } fn value(&self) -> &[u8] { - if self.has_errored || !self.iter.is_valid() { + if !self.is_valid() { panic!("invalid access to the underlying iterator"); } self.iter.value() diff --git a/mini-lsm-mvcc/src/lsm_storage.rs b/mini-lsm-mvcc/src/lsm_storage.rs index c284b4a75..25786dfd1 100644 --- a/mini-lsm-mvcc/src/lsm_storage.rs +++ b/mini-lsm-mvcc/src/lsm_storage.rs @@ -149,6 +149,11 @@ fn key_within(user_key: &[u8], table_begin: KeySlice, table_end: KeySlice) -> bo table_begin.key_ref() <= user_key && user_key <= table_end.key_ref() } +#[derive(Clone, Debug)] +pub enum CompactionFilter { + Prefix(Bytes), +} + /// The storage interface of the LSM tree. pub(crate) struct LsmStorageInner { pub(crate) state: Arc>>, @@ -160,6 +165,7 @@ pub(crate) struct LsmStorageInner { pub(crate) compaction_controller: CompactionController, pub(crate) manifest: Option, pub(crate) mvcc: Option, + pub(crate) compaction_filters: Arc>>, } /// A thin wrapper for `LsmStorageInner` and the user interface for MiniLSM. 
@@ -243,6 +249,10 @@ impl MiniLsm { })) } + pub fn add_compaction_filter(&self, compaction_filter: CompactionFilter) { + self.inner.add_compaction_filter(compaction_filter) + } + pub fn get(&self, key: &[u8]) -> Result> { self.inner.get(key) } @@ -431,12 +441,18 @@ impl LsmStorageInner { manifest: Some(manifest), options: options.into(), mvcc: Some(LsmMvccInner::new(last_commit_ts)), + compaction_filters: Arc::new(Mutex::new(Vec::new())), }; storage.sync_dir()?; Ok(storage) } + pub fn add_compaction_filter(&self, compaction_filter: CompactionFilter) { + let mut compaction_filters = self.compaction_filters.lock(); + compaction_filters.push(compaction_filter); + } + pub fn sync(&self) -> Result<()> { self.state.read().memtable.sync_wal() } diff --git a/mini-lsm-mvcc/src/mem_table.rs b/mini-lsm-mvcc/src/mem_table.rs index 5b6dd5008..92d90d764 100644 --- a/mini-lsm-mvcc/src/mem_table.rs +++ b/mini-lsm-mvcc/src/mem_table.rs @@ -152,8 +152,7 @@ impl MemTable { item: (KeyBytes::new(), Bytes::new()), } .build(); - let entry = iter.with_iter_mut(|iter| MemTableIterator::entry_to_item(iter.next())); - iter.with_mut(|x| *x.item = entry); + iter.next().unwrap(); iter } diff --git a/mini-lsm-mvcc/src/tests.rs b/mini-lsm-mvcc/src/tests.rs index ab4e40520..7b12c1134 100644 --- a/mini-lsm-mvcc/src/tests.rs +++ b/mini-lsm-mvcc/src/tests.rs @@ -18,3 +18,4 @@ mod week3_day3; mod week3_day4; mod week3_day5; mod week3_day6; +mod week3_day7; diff --git a/mini-lsm-mvcc/src/tests/week3_day7.rs b/mini-lsm-mvcc/src/tests/week3_day7.rs new file mode 100644 index 000000000..bfbc05d7a --- /dev/null +++ b/mini-lsm-mvcc/src/tests/week3_day7.rs @@ -0,0 +1,70 @@ +use bytes::Bytes; +use tempfile::tempdir; + +use crate::{ + compact::CompactionOptions, + lsm_storage::{CompactionFilter, LsmStorageOptions, MiniLsm, WriteBatchRecord}, +}; + +use super::harness::{check_iter_result_by_key, construct_merge_iterator_over_storage}; + +#[test] +fn test_task3_mvcc_compaction() { + let dir = tempdir().unwrap(); 
+ let options = LsmStorageOptions::default_for_week2_test(CompactionOptions::NoCompaction); + let storage = MiniLsm::open(&dir, options.clone()).unwrap(); + storage + .write_batch(&[ + WriteBatchRecord::Put("table1_a", "1"), + WriteBatchRecord::Put("table1_b", "1"), + WriteBatchRecord::Put("table1_c", "1"), + WriteBatchRecord::Put("table2_a", "1"), + WriteBatchRecord::Put("table2_b", "1"), + WriteBatchRecord::Put("table2_c", "1"), + ]) + .unwrap(); + storage.force_flush().unwrap(); + let snapshot0 = storage.new_txn().unwrap(); + storage + .write_batch(&[ + WriteBatchRecord::Put("table1_a", "2"), + WriteBatchRecord::Del("table1_b"), + WriteBatchRecord::Put("table1_c", "2"), + WriteBatchRecord::Put("table2_a", "2"), + WriteBatchRecord::Del("table2_b"), + WriteBatchRecord::Put("table2_c", "2"), + ]) + .unwrap(); + storage.force_flush().unwrap(); + storage.add_compaction_filter(CompactionFilter::Prefix(Bytes::from("table2_"))); + storage.force_full_compaction().unwrap(); + + let mut iter = construct_merge_iterator_over_storage(&storage.inner.state.read()); + check_iter_result_by_key( + &mut iter, + vec![ + (Bytes::from("table1_a"), Bytes::from("2")), + (Bytes::from("table1_a"), Bytes::from("1")), + (Bytes::from("table1_b"), Bytes::new()), + (Bytes::from("table1_b"), Bytes::from("1")), + (Bytes::from("table1_c"), Bytes::from("2")), + (Bytes::from("table1_c"), Bytes::from("1")), + (Bytes::from("table2_a"), Bytes::from("2")), + (Bytes::from("table2_b"), Bytes::new()), + (Bytes::from("table2_c"), Bytes::from("2")), + ], + ); + + drop(snapshot0); + + storage.force_full_compaction().unwrap(); + + let mut iter = construct_merge_iterator_over_storage(&storage.inner.state.read()); + check_iter_result_by_key( + &mut iter, + vec![ + (Bytes::from("table1_a"), Bytes::from("2")), + (Bytes::from("table1_c"), Bytes::from("2")), + ], + ); +} diff --git a/mini-lsm-starter/src/iterators/merge_iterator.rs b/mini-lsm-starter/src/iterators/merge_iterator.rs index 1e14a6723..8496e0389 100644 
--- a/mini-lsm-starter/src/iterators/merge_iterator.rs +++ b/mini-lsm-starter/src/iterators/merge_iterator.rs @@ -37,7 +37,7 @@ impl Ord for HeapWrapper { } /// Merge multiple iterators of the same type. If the same key occurs multiple times in some -/// iterators, perfer the one with smaller index. +/// iterators, prefer the one with smaller index. pub struct MergeIterator { iters: BinaryHeap>, current: Option>, diff --git a/mini-lsm-starter/src/lsm_iterator.rs b/mini-lsm-starter/src/lsm_iterator.rs index 230b867f6..a436453d1 100644 --- a/mini-lsm-starter/src/lsm_iterator.rs +++ b/mini-lsm-starter/src/lsm_iterator.rs @@ -122,14 +122,14 @@ impl StorageIterator for LsmIterator { /// `is_valid` should return false, and `next` should always return an error. pub struct FusedIterator { iter: I, - has_error: bool, + has_errored: bool, } impl FusedIterator { pub fn new(iter: I) -> Self { Self { iter, - has_error: false, + has_errored: false, } } } @@ -138,7 +138,7 @@ impl StorageIterator for FusedIterator { type KeyType<'a> = I::KeyType<'a> where Self: 'a; fn is_valid(&self) -> bool { - !self.has_error && self.iter.is_valid() + !self.has_errored && self.iter.is_valid() } fn key(&self) -> Self::KeyType<'_> { diff --git a/mini-lsm-starter/src/lsm_storage.rs b/mini-lsm-starter/src/lsm_storage.rs index a4ea54b43..7a0a16f9b 100644 --- a/mini-lsm-starter/src/lsm_storage.rs +++ b/mini-lsm-starter/src/lsm_storage.rs @@ -120,6 +120,11 @@ impl LsmStorageOptions { } } +#[derive(Clone, Debug)] +pub enum CompactionFilter { + Prefix(Bytes), +} + /// The storage interface of the LSM tree. pub(crate) struct LsmStorageInner { pub(crate) state: Arc>>, @@ -131,6 +136,7 @@ pub(crate) struct LsmStorageInner { pub(crate) compaction_controller: CompactionController, pub(crate) manifest: Option, pub(crate) mvcc: Option, + pub(crate) compaction_filters: Arc>>, } /// A thin wrapper for `LsmStorageInner` and the user interface for MiniLSM. 
@@ -215,6 +221,10 @@ impl MiniLsm { self.inner.write_batch(batch) } + pub fn add_compaction_filter(&self, compaction_filter: CompactionFilter) { + self.inner.add_compaction_filter(compaction_filter) + } + pub fn get(&self, key: &[u8]) -> Result> { self.inner.get(key) } @@ -396,6 +406,7 @@ impl LsmStorageInner { manifest: Some(manifest), options: options.into(), mvcc: Some(LsmMvccInner::new(last_commit_ts)), + compaction_filters: Arc::new(Mutex::new(Vec::new())), }; storage.sync_dir()?; @@ -421,6 +432,11 @@ impl LsmStorageInner { Ok(()) } + pub fn add_compaction_filter(&self, compaction_filter: CompactionFilter) { + let mut compaction_filters = self.compaction_filters.lock(); + compaction_filters.push(compaction_filter); + } + pub(crate) fn path_of_sst_static(path: impl AsRef, id: usize) -> PathBuf { path.as_ref().join(format!("{:05}.sst", id)) } diff --git a/mini-lsm/Cargo.toml b/mini-lsm/Cargo.toml index 093616082..000e05a46 100644 --- a/mini-lsm/Cargo.toml +++ b/mini-lsm/Cargo.toml @@ -25,6 +25,8 @@ serde_json = { version = "1.0" } serde = { version = "1.0", features = ["derive"] } farmhash = "1" crc32fast = "1.3.2" +nom = "7.1.3" +rustyline = "13.0.0" [dev-dependencies] tempfile = "3" diff --git a/mini-lsm/src/iterators/merge_iterator.rs b/mini-lsm/src/iterators/merge_iterator.rs index c4abc8d3e..b1f5bdf77 100644 --- a/mini-lsm/src/iterators/merge_iterator.rs +++ b/mini-lsm/src/iterators/merge_iterator.rs @@ -37,7 +37,7 @@ impl Ord for HeapWrapper { } /// Merge multiple iterators of the same type. If the same key occurs multiple times in some -/// iterators, perfer the one with smaller index. +/// iterators, prefer the one with smaller index. 
pub struct MergeIterator { iters: BinaryHeap>, current: Option>, diff --git a/mini-lsm/src/lsm_iterator.rs b/mini-lsm/src/lsm_iterator.rs index 044769cf6..448989d47 100644 --- a/mini-lsm/src/lsm_iterator.rs +++ b/mini-lsm/src/lsm_iterator.rs @@ -106,14 +106,14 @@ impl StorageIterator for FusedIterator { } fn key(&self) -> Self::KeyType<'_> { - if self.has_errored || !self.iter.is_valid() { + if !self.is_valid() { panic!("invalid access to the underlying iterator"); } self.iter.key() } fn value(&self) -> &[u8] { - if self.has_errored || !self.iter.is_valid() { + if !self.is_valid() { panic!("invalid access to the underlying iterator"); } self.iter.value() diff --git a/mini-lsm/src/lsm_storage.rs b/mini-lsm/src/lsm_storage.rs index 738fdbde5..0ee8497af 100644 --- a/mini-lsm/src/lsm_storage.rs +++ b/mini-lsm/src/lsm_storage.rs @@ -148,6 +148,11 @@ fn key_within(user_key: &[u8], table_begin: KeySlice, table_end: KeySlice) -> bo table_begin.raw_ref() <= user_key && user_key <= table_end.raw_ref() } +#[derive(Clone, Debug)] +pub enum CompactionFilter { + Prefix(Bytes), +} + /// The storage interface of the LSM tree. pub(crate) struct LsmStorageInner { pub(crate) state: Arc>>, @@ -160,6 +165,8 @@ pub(crate) struct LsmStorageInner { pub(crate) manifest: Option, #[allow(dead_code)] pub(crate) mvcc: Option, + #[allow(dead_code)] + pub(crate) compaction_filters: Arc>>, } /// A thin wrapper for `LsmStorageInner` and the user interface for MiniLSM. 
@@ -243,6 +250,10 @@ impl MiniLsm { })) } + pub fn add_compaction_filter(&self, compaction_filter: CompactionFilter) { + self.inner.add_compaction_filter(compaction_filter) + } + pub fn get(&self, key: &[u8]) -> Result> { self.inner.get(key) } @@ -418,6 +429,7 @@ impl LsmStorageInner { manifest: Some(manifest), options: options.into(), mvcc: None, + compaction_filters: Arc::new(Mutex::new(Vec::new())), }; storage.sync_dir()?; @@ -428,6 +440,11 @@ impl LsmStorageInner { self.state.read().memtable.sync_wal() } + pub fn add_compaction_filter(&self, compaction_filter: CompactionFilter) { + let mut compaction_filters = self.compaction_filters.lock(); + compaction_filters.push(compaction_filter); + } + /// Get a key from the storage. In day 7, this can be further optimized by using a bloom filter. pub fn get(&self, key: &[u8]) -> Result> { let snapshot = { diff --git a/mini-lsm/src/mem_table.rs b/mini-lsm/src/mem_table.rs index d5efc5152..89da268f5 100644 --- a/mini-lsm/src/mem_table.rs +++ b/mini-lsm/src/mem_table.rs @@ -119,8 +119,7 @@ impl MemTable { item: (Bytes::new(), Bytes::new()), } .build(); - let entry = iter.with_iter_mut(|iter| MemTableIterator::entry_to_item(iter.next())); - iter.with_mut(|x| *x.item = entry); + iter.next().unwrap(); iter } diff --git a/mini-lsm/src/tests/harness.rs b/mini-lsm/src/tests/harness.rs index 4b0d7e30d..b41745b2f 100644 --- a/mini-lsm/src/tests/harness.rs +++ b/mini-lsm/src/tests/harness.rs @@ -295,6 +295,11 @@ pub fn check_compaction_ratio(storage: Arc) { }; level_size.push(size); } + let extra_iterators = if TS_ENABLED { + 1 /* txn local iterator for OCC */ + } else { + 0 + }; let num_iters = storage .scan(Bound::Unbounded, Bound::Unbounded) .unwrap() @@ -326,8 +331,8 @@ pub fn check_compaction_ratio(storage: Arc) { ); } assert!( - num_iters <= l0_sst_num + num_memtables + max_levels, - "did you use concat iterators?" 
+ num_iters <= l0_sst_num + num_memtables + max_levels + extra_iterators, + "we found {num_iters} iterators in your implementation, (l0_sst_num={l0_sst_num}, num_memtables={num_memtables}, max_levels={max_levels}) did you use concat iterators?" ); } CompactionOptions::Leveled(LeveledCompactionOptions { @@ -338,23 +343,24 @@ pub fn check_compaction_ratio(storage: Arc) { }) => { assert!(l0_sst_num < level0_file_num_compaction_trigger); assert!(level_size.len() <= max_levels); - for idx in 1..level_size.len() { - let prev_size = level_size[idx - 1]; - let this_size = level_size[idx]; + let last_level_size = *level_size.last().unwrap(); + let mut multiplier = 1.0; + for idx in (1..level_size.len()).rev() { + multiplier *= level_size_multiplier as f64; + let this_size = level_size[idx - 1]; assert!( // do not add hard requirement on level size multiplier considering bloom filters... - this_size as f64 / prev_size as f64 >= (level_size_multiplier as f64 - 0.5), - "L{}/L{}, {}/{}<<{}", - state.levels[idx].0, + this_size as f64 / last_level_size as f64 <= 1.0 / multiplier + 0.5, + "L{}/L_max, {}/{}>>1.0/{}", state.levels[idx - 1].0, this_size, - prev_size, - level_size_multiplier + last_level_size, + multiplier ); } assert!( - num_iters <= l0_sst_num + num_memtables + max_levels, - "did you use concat iterators?" + num_iters <= l0_sst_num + num_memtables + max_levels + extra_iterators, + "we found {num_iters} iterators in your implementation, (l0_sst_num={l0_sst_num}, num_memtables={num_memtables}, max_levels={max_levels}) did you use concat iterators?" ); } CompactionOptions::Tiered(TieredCompactionOptions { @@ -395,8 +401,8 @@ pub fn check_compaction_ratio(storage: Arc) { sum_size += this_size; } assert!( - num_iters <= num_memtables + num_tiers, - "did you use concat iterators?" 
+ num_iters <= num_memtables + num_tiers + extra_iterators, + "we found {num_iters} iterators in your implementation, (num_memtables={num_memtables}, num_tiers={num_tiers}) did you use concat iterators?" ); } } diff --git a/mini-lsm/src/tests/week1_day2.rs b/mini-lsm/src/tests/week1_day2.rs index 6c1b002e1..92b5730a8 100644 --- a/mini-lsm/src/tests/week1_day2.rs +++ b/mini-lsm/src/tests/week1_day2.rs @@ -229,7 +229,11 @@ fn test_task2_merge_error() { ], 1, ); - let iter = MergeIterator::::create(vec![Box::new(i1), Box::new(i2)]); + let iter = MergeIterator::::create(vec![ + Box::new(i1.clone()), + Box::new(i1), + Box::new(i2), + ]); // your implementation should correctly throw an error instead of panic expect_iter_error(iter); } diff --git a/mini-lsm/src/tests/week1_day6.rs b/mini-lsm/src/tests/week1_day6.rs index a435d75dd..cb6a9c836 100644 --- a/mini-lsm/src/tests/week1_day6.rs +++ b/mini-lsm/src/tests/week1_day6.rs @@ -193,5 +193,5 @@ fn test_task3_sst_filter() { Bound::Excluded(format!("{:05}", 6000).as_bytes()), ) .unwrap(); - assert!(min_num < iter.num_active_iterators() && iter.num_active_iterators() < max_num); + assert!(min_num <= iter.num_active_iterators() && iter.num_active_iterators() < max_num); } From 96035adf412e80463f5254686da8097b0e8c1972 Mon Sep 17 00:00:00 2001 From: husharp Date: Tue, 27 Feb 2024 18:21:08 +0800 Subject: [PATCH 22/22] support compaction filter Signed-off-by: husharp --- mini-lsm-starter/src/compact.rs | 19 ++++++- mini-lsm-starter/src/lsm_iterator.rs | 4 +- mini-lsm-starter/src/tests.rs | 1 + mini-lsm-starter/src/tests/harness.rs | 34 +++++++----- mini-lsm-starter/src/tests/week3_day7.rs | 70 ++++++++++++++++++++++++ 5 files changed, 110 insertions(+), 18 deletions(-) create mode 100644 mini-lsm-starter/src/tests/week3_day7.rs diff --git a/mini-lsm-starter/src/compact.rs b/mini-lsm-starter/src/compact.rs index 554eee7d1..354bad4d3 100644 --- a/mini-lsm-starter/src/compact.rs +++ b/mini-lsm-starter/src/compact.rs @@ -22,7 +22,7 
@@ use crate::iterators::two_merge_iterator::TwoMergeIterator; use crate::iterators::StorageIterator; use crate::key::KeySlice; use crate::lsm_iterator::FusedIterator; -use crate::lsm_storage::{LsmStorageInner, LsmStorageState}; +use crate::lsm_storage::{CompactionFilter, LsmStorageInner, LsmStorageState}; use crate::manifest::ManifestRecord; use crate::table::{SsTable, SsTableBuilder, SsTableIterator}; @@ -127,7 +127,9 @@ impl LsmStorageInner { // All ts (strictly) below this ts can be garbage collected. let gc_ts = self.mvcc().watermark(); let mut last_gc_key = Vec::::new(); - while iter.is_valid() { + // add compaction filters, ref https://skyzh.github.io/mini-lsm/week3-07-compaction-filter.html + let compaction_filters = self.compaction_filters.lock().clone(); + 'outer: while iter.is_valid() { if builder.is_none() { builder = Some(SsTableBuilder::new(self.options.block_size)); } @@ -145,6 +147,19 @@ impl LsmStorageInner { iter.next()?; continue; } + + if !compaction_filters.is_empty() { + for filter in &compaction_filters { + match filter { + CompactionFilter::Prefix(prefix) => { + if iter.key().key_ref().starts_with(prefix) { + iter.next()?; + continue 'outer; + } + } + } + } + } } let same_as_last_key = iter.key().key_ref() == last_key; diff --git a/mini-lsm-starter/src/lsm_iterator.rs b/mini-lsm-starter/src/lsm_iterator.rs index a436453d1..843518148 100644 --- a/mini-lsm-starter/src/lsm_iterator.rs +++ b/mini-lsm-starter/src/lsm_iterator.rs @@ -150,11 +150,11 @@ impl StorageIterator for FusedIterator { } fn next(&mut self) -> Result<()> { - if self.has_error { + if self.has_errored { bail!("Iterator has already returned an error") } if let Some(err) = self.iter.next().err() { - self.has_error = true; + self.has_errored = true; return Err(err); } Ok(()) diff --git a/mini-lsm-starter/src/tests.rs b/mini-lsm-starter/src/tests.rs index 761a0c026..466d6402c 100644 --- a/mini-lsm-starter/src/tests.rs +++ b/mini-lsm-starter/src/tests.rs @@ -21,3 +21,4 @@ mod 
week3_day3; mod week3_day4; mod week3_day5; mod week3_day6; +mod week3_day7; diff --git a/mini-lsm-starter/src/tests/harness.rs b/mini-lsm-starter/src/tests/harness.rs index 4b0d7e30d..b41745b2f 100644 --- a/mini-lsm-starter/src/tests/harness.rs +++ b/mini-lsm-starter/src/tests/harness.rs @@ -295,6 +295,11 @@ pub fn check_compaction_ratio(storage: Arc) { }; level_size.push(size); } + let extra_iterators = if TS_ENABLED { + 1 /* txn local iterator for OCC */ + } else { + 0 + }; let num_iters = storage .scan(Bound::Unbounded, Bound::Unbounded) .unwrap() @@ -326,8 +331,8 @@ pub fn check_compaction_ratio(storage: Arc) { ); } assert!( - num_iters <= l0_sst_num + num_memtables + max_levels, - "did you use concat iterators?" + num_iters <= l0_sst_num + num_memtables + max_levels + extra_iterators, + "we found {num_iters} iterators in your implementation, (l0_sst_num={l0_sst_num}, num_memtables={num_memtables}, max_levels={max_levels}) did you use concat iterators?" ); } CompactionOptions::Leveled(LeveledCompactionOptions { @@ -338,23 +343,24 @@ pub fn check_compaction_ratio(storage: Arc) { }) => { assert!(l0_sst_num < level0_file_num_compaction_trigger); assert!(level_size.len() <= max_levels); - for idx in 1..level_size.len() { - let prev_size = level_size[idx - 1]; - let this_size = level_size[idx]; + let last_level_size = *level_size.last().unwrap(); + let mut multiplier = 1.0; + for idx in (1..level_size.len()).rev() { + multiplier *= level_size_multiplier as f64; + let this_size = level_size[idx - 1]; assert!( // do not add hard requirement on level size multiplier considering bloom filters... 
- this_size as f64 / prev_size as f64 >= (level_size_multiplier as f64 - 0.5), - "L{}/L{}, {}/{}<<{}", - state.levels[idx].0, + this_size as f64 / last_level_size as f64 <= 1.0 / multiplier + 0.5, + "L{}/L_max, {}/{}>>1.0/{}", state.levels[idx - 1].0, this_size, - prev_size, - level_size_multiplier + last_level_size, + multiplier ); } assert!( - num_iters <= l0_sst_num + num_memtables + max_levels, - "did you use concat iterators?" + num_iters <= l0_sst_num + num_memtables + max_levels + extra_iterators, + "we found {num_iters} iterators in your implementation, (l0_sst_num={l0_sst_num}, num_memtables={num_memtables}, max_levels={max_levels}) did you use concat iterators?" ); } CompactionOptions::Tiered(TieredCompactionOptions { @@ -395,8 +401,8 @@ pub fn check_compaction_ratio(storage: Arc) { sum_size += this_size; } assert!( - num_iters <= num_memtables + num_tiers, - "did you use concat iterators?" + num_iters <= num_memtables + num_tiers + extra_iterators, + "we found {num_iters} iterators in your implementation, (num_memtables={num_memtables}, num_tiers={num_tiers}) did you use concat iterators?" 
); } } diff --git a/mini-lsm-starter/src/tests/week3_day7.rs b/mini-lsm-starter/src/tests/week3_day7.rs new file mode 100644 index 000000000..bfbc05d7a --- /dev/null +++ b/mini-lsm-starter/src/tests/week3_day7.rs @@ -0,0 +1,70 @@ +use bytes::Bytes; +use tempfile::tempdir; + +use crate::{ + compact::CompactionOptions, + lsm_storage::{CompactionFilter, LsmStorageOptions, MiniLsm, WriteBatchRecord}, +}; + +use super::harness::{check_iter_result_by_key, construct_merge_iterator_over_storage}; + +#[test] +fn test_task3_mvcc_compaction() { + let dir = tempdir().unwrap(); + let options = LsmStorageOptions::default_for_week2_test(CompactionOptions::NoCompaction); + let storage = MiniLsm::open(&dir, options.clone()).unwrap(); + storage + .write_batch(&[ + WriteBatchRecord::Put("table1_a", "1"), + WriteBatchRecord::Put("table1_b", "1"), + WriteBatchRecord::Put("table1_c", "1"), + WriteBatchRecord::Put("table2_a", "1"), + WriteBatchRecord::Put("table2_b", "1"), + WriteBatchRecord::Put("table2_c", "1"), + ]) + .unwrap(); + storage.force_flush().unwrap(); + let snapshot0 = storage.new_txn().unwrap(); + storage + .write_batch(&[ + WriteBatchRecord::Put("table1_a", "2"), + WriteBatchRecord::Del("table1_b"), + WriteBatchRecord::Put("table1_c", "2"), + WriteBatchRecord::Put("table2_a", "2"), + WriteBatchRecord::Del("table2_b"), + WriteBatchRecord::Put("table2_c", "2"), + ]) + .unwrap(); + storage.force_flush().unwrap(); + storage.add_compaction_filter(CompactionFilter::Prefix(Bytes::from("table2_"))); + storage.force_full_compaction().unwrap(); + + let mut iter = construct_merge_iterator_over_storage(&storage.inner.state.read()); + check_iter_result_by_key( + &mut iter, + vec![ + (Bytes::from("table1_a"), Bytes::from("2")), + (Bytes::from("table1_a"), Bytes::from("1")), + (Bytes::from("table1_b"), Bytes::new()), + (Bytes::from("table1_b"), Bytes::from("1")), + (Bytes::from("table1_c"), Bytes::from("2")), + (Bytes::from("table1_c"), Bytes::from("1")), + (Bytes::from("table2_a"), 
Bytes::from("2")), + (Bytes::from("table2_b"), Bytes::new()), + (Bytes::from("table2_c"), Bytes::from("2")), + ], + ); + + drop(snapshot0); + + storage.force_full_compaction().unwrap(); + + let mut iter = construct_merge_iterator_over_storage(&storage.inner.state.read()); + check_iter_result_by_key( + &mut iter, + vec![ + (Bytes::from("table1_a"), Bytes::from("2")), + (Bytes::from("table1_c"), Bytes::from("2")), + ], + ); +}