diff --git a/Cargo.lock b/Cargo.lock
index 66dd13e06..3619b2711 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -80,6 +80,12 @@ version = "1.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
+[[package]]
+name = "bitflags"
+version = "2.4.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ed570934406eb16438a4e976b1b4500774099c13b8cb96eec99f620f05090ddf"
+
[[package]]
name = "bumpalo"
version = "3.11.1"
@@ -175,6 +181,15 @@ version = "0.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "702fc72eb24e5a1e48ce58027a675bc24edd52096d5397d4aea7c6dd9eca0bd1"
+[[package]]
+name = "clipboard-win"
+version = "5.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3ec832972fefb8cf9313b45a0d1945e29c9c251f1d4c6eafc5fe2124c02d2e81"
+dependencies = [
+ "error-code",
+]
+
[[package]]
name = "colorchoice"
version = "1.0.0"
@@ -268,6 +283,22 @@ version = "0.3.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a357d28ed41a50f9c765dbfe56cbc04a64e53e5fc58ba79fbc34c10ef3df831f"
+[[package]]
+name = "endian-type"
+version = "0.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c34f04666d835ff5d62e058c3995147c06f42fe86ff053337632bca83e42702d"
+
+[[package]]
+name = "errno"
+version = "0.3.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a258e46cdc063eb8519c00b9fc845fc47bcfca4130e2f08e88665ceda8474245"
+dependencies = [
+ "libc",
+ "windows-sys 0.52.0",
+]
+
[[package]]
name = "error-chain"
version = "0.12.4"
@@ -277,6 +308,12 @@ dependencies = [
"version_check",
]
+[[package]]
+name = "error-code"
+version = "3.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "281e452d3bad4005426416cdba5ccfd4f5c1280e10099e21db27f7c1c28347fc"
+
[[package]]
name = "farmhash"
version = "1.1.5"
@@ -292,6 +329,17 @@ dependencies = [
"instant",
]
+[[package]]
+name = "fd-lock"
+version = "4.0.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7e5768da2206272c81ef0b5e951a41862938a6070da63bcea197899942d3b947"
+dependencies = [
+ "cfg-if",
+ "rustix",
+ "windows-sys 0.52.0",
+]
+
[[package]]
name = "getrandom"
version = "0.2.8"
@@ -324,6 +372,15 @@ dependencies = [
"libc",
]
+[[package]]
+name = "home"
+version = "0.5.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e3d1354bf6b7235cb4a0576c2619fd4ed18183f689b12b006a0ee7329eeff9a5"
+dependencies = [
+ "windows-sys 0.52.0",
+]
+
[[package]]
name = "instant"
version = "0.1.12"
@@ -365,9 +422,15 @@ checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
[[package]]
name = "libc"
-version = "0.2.139"
+version = "0.2.153"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd"
+
+[[package]]
+name = "linux-raw-sys"
+version = "0.4.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "201de327520df007757c1f0adce6e827fe8562fbc28bfd9c15571c66ca1f5f79"
+checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c"
[[package]]
name = "lock_api"
@@ -426,9 +489,11 @@ dependencies = [
"crossbeam-skiplist",
"farmhash",
"moka",
+ "nom",
"ouroboros",
"parking_lot",
"rand",
+ "rustyline",
"serde",
"serde_json",
"tempfile",
@@ -448,9 +513,11 @@ dependencies = [
"crossbeam-skiplist",
"farmhash",
"moka",
+ "nom",
"ouroboros",
"parking_lot",
"rand",
+ "rustyline",
"serde",
"serde_json",
"tempfile",
@@ -464,14 +531,17 @@ dependencies = [
"arc-swap",
"bytes",
"clap",
+ "crc32fast",
"crossbeam-channel",
"crossbeam-epoch",
"crossbeam-skiplist",
"farmhash",
"moka",
+ "nom",
"ouroboros",
"parking_lot",
"rand",
+ "rustyline",
"serde",
"serde_json",
"tempfile",
@@ -487,6 +557,12 @@ dependencies = [
"duct",
]
+[[package]]
+name = "minimal-lexical"
+version = "0.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a"
+
[[package]]
name = "moka"
version = "0.9.6"
@@ -510,6 +586,36 @@ dependencies = [
"uuid",
]
+[[package]]
+name = "nibble_vec"
+version = "0.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "77a5d83df9f36fe23f0c3648c6bbb8b0298bb5f1939c8f2704431371f4b84d43"
+dependencies = [
+ "smallvec",
+]
+
+[[package]]
+name = "nix"
+version = "0.27.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2eb04e9c688eff1c89d72b407f168cf79bb9e867a9d3323ed6c01519eb9cc053"
+dependencies = [
+ "bitflags 2.4.2",
+ "cfg-if",
+ "libc",
+]
+
+[[package]]
+name = "nom"
+version = "7.1.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a"
+dependencies = [
+ "memchr",
+ "minimal-lexical",
+]
+
[[package]]
name = "num_cpus"
version = "1.15.0"
@@ -618,7 +724,7 @@ version = "0.9.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2d9cc634bc78768157b5cbfe988ffcd1dcba95cd2b2f03a88316c08c6d00ed63"
dependencies = [
- "bitflags",
+ "bitflags 1.3.2",
"memchr",
"unicase",
]
@@ -648,6 +754,16 @@ dependencies = [
"proc-macro2",
]
+[[package]]
+name = "radix_trie"
+version = "0.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c069c179fcdc6a2fe24d8d18305cf085fdbd4f922c041943e203685d6a1c58fd"
+dependencies = [
+ "endian-type",
+ "nibble_vec",
+]
+
[[package]]
name = "rand"
version = "0.8.5"
@@ -684,7 +800,7 @@ version = "10.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a6823ea29436221176fe662da99998ad3b4db2c7f31e7b6f5fe43adccd6320bb"
dependencies = [
- "bitflags",
+ "bitflags 1.3.2",
]
[[package]]
@@ -693,7 +809,7 @@ version = "0.2.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fb5a58c1855b4b6819d59012155603f0b22ad30cad752600aadfcb695265519a"
dependencies = [
- "bitflags",
+ "bitflags 1.3.2",
]
[[package]]
@@ -714,6 +830,41 @@ dependencies = [
"semver",
]
+[[package]]
+name = "rustix"
+version = "0.38.31"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6ea3e1a662af26cd7a3ba09c0297a31af215563ecf42817c98df621387f4e949"
+dependencies = [
+ "bitflags 2.4.2",
+ "errno",
+ "libc",
+ "linux-raw-sys",
+ "windows-sys 0.52.0",
+]
+
+[[package]]
+name = "rustyline"
+version = "13.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "02a2d683a4ac90aeef5b1013933f6d977bd37d51ff3f4dad829d4931a7e6be86"
+dependencies = [
+ "bitflags 2.4.2",
+ "cfg-if",
+ "clipboard-win",
+ "fd-lock",
+ "home",
+ "libc",
+ "log",
+ "memchr",
+ "nix",
+ "radix_trie",
+ "unicode-segmentation",
+ "unicode-width",
+ "utf8parse",
+ "winapi",
+]
+
[[package]]
name = "ryu"
version = "1.0.12"
@@ -920,6 +1071,12 @@ version = "1.0.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "84a22b9f218b40614adcb3f4ff08b703773ad44fa9423e4e0d346d5db86e4ebc"
+[[package]]
+name = "unicode-segmentation"
+version = "1.10.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1dd624098567895118886609431a7c3b8f516e41d30e0643f03d94592a147e36"
+
[[package]]
name = "unicode-width"
version = "0.1.10"
diff --git a/README.md b/README.md
index 754ee0fb6..d59582ca3 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,5 @@
+
+
# LSM in a Week
[](https://github.com/skyzh/mini-lsm/actions/workflows/main.yml)
@@ -26,7 +28,7 @@ You should modify code in `mini-lsm-starter` directory.
```
cargo x install-tools
-cargo copy-test --week 1 --day 1
+cargo x copy-test --week 1 --day 1
cargo x scheck
cargo run --bin mini-lsm-cli
cargo run --bin compaction-simulator
@@ -45,7 +47,7 @@ cargo x book
If you changed public API in the reference solution, you might also need to synchronize it to the starter crate.
To do this, use `cargo x sync`.
-## Structure
+## Code Structure
* mini-lsm: the final solution code for <= week 2
* mini-lsm-mvcc: the final solution code for week 3 MVCC
@@ -70,28 +72,41 @@ cargo run --bin compaction-simulator-ref
cargo run --bin compaction-simulator-mvcc-ref
```
-## Progress
+## Tutorial Structure
-We are working on chapter 3 and more test cases for all existing contents.
+We have 3 weeks + 1 extra week (in progress) for this tutorial.
* Week 1: Storage Format + Engine Skeleton
* Week 2: Compaction and Persistence
* Week 3: Multi-Version Concurrency Control
-* The Extra Week / Rest of Your Life: Optimizations (unlikely to be available in 2024...)
-
-✅: Finished \
-🚧: WIP and will likely be available soon
-
-| Week + Chapter | Topic | Solution | Starter Code | Writeup |
-| -------------- | ----------------------------------------------- | -------- | ------------ | ------- |
-| 3.1 | Timestamp Key Encoding | ✅ | ✅ | ✅ |
-| 3.2 | Snapshot Read - Blocks, Memtables, and SSTs | ✅ | ✅ | ✅ |
-| 3.3 | Snapshot Read - Engine Read Path | ✅ | ✅ | ✅ |
-| 3.4 | Watermark and Garbage Collection | ✅ | ✅ | ✅ |
-| 3.5 | Transactions and Optimistic Concurrency Control | ✅ | ✅ | ✅ |
-| 3.6 | Serializable Snapshot Isolation | ✅ | ✅ | ✅ |
-| 3.7 | Compaction Filter | 🚧 | | |
+* The Extra Week / Rest of Your Life: Optimizations (unlikely to be available in 2024...)
+
+
+
+| Week + Chapter | Topic |
+| -------------- | ----------------------------------------------------------- |
+| 1.1 | Memtable |
+| 1.2 | Merge Iterator |
+| 1.3 | Block |
+| 1.4 | Sorted String Table (SST) |
+| 1.5 | Read Path |
+| 1.6 | Write Path |
+| 1.7 | SST Optimizations: Prefix Key Encoding + Bloom Filters |
+| 2.1 | Compaction Implementation |
+| 2.2 | Simple Compaction Strategy (Traditional Leveled Compaction) |
+| 2.3 | Tiered Compaction Strategy (RocksDB Universal Compaction) |
+| 2.4 | Leveled Compaction Strategy (RocksDB Leveled Compaction) |
+| 2.5 | Manifest |
+| 2.6 | Write-Ahead Log (WAL) |
+| 2.7 | Batch Write and Checksums |
+| 3.1 | Timestamp Key Encoding |
+| 3.2 | Snapshot Read - Memtables and Timestamps |
+| 3.3 | Snapshot Read - Transaction API |
+| 3.4 | Watermark and Garbage Collection |
+| 3.5 | Transactions and Optimistic Concurrency Control |
+| 3.6 | Serializable Snapshot Isolation |
+| 3.7 | Compaction Filters |
## License
-The Mini-LSM starter code and solution are under Apache 2.0 license. The author reserves the full copyright of the tutorial materials (markdown files and figures).
+The Mini-LSM starter code and solution are under [Apache 2.0 license](LICENSE). The author reserves the full copyright of the tutorial materials (markdown files and figures).
diff --git a/mini-lsm-book/src/00-overview.md b/mini-lsm-book/src/00-overview.md
index d80ed7af8..8314938aa 100644
--- a/mini-lsm-book/src/00-overview.md
+++ b/mini-lsm-book/src/00-overview.md
@@ -1,11 +1,23 @@
# Mini-LSM Course Overview
+## Tutorial Structure
+
+
+
+We have three parts (weeks) for this tutorial. In the first week, we will focus on the storage structure and the storage format of an LSM storage engine. In the second week, we will deeply dive into compactions and implement persistence support for the storage engine. In the third week, we will implement multi-version concurrency control.
+
+* [The First Week: Mini-LSM](./week1-overview.md)
+* [The Second Week: Compaction and Persistence](./week2-overview.md)
+* [The Third Week: Multi-Version Concurrency Control](./week3-overview.md)
+
+Please look at [Environment Setup](./00-get-started.md) to set up the environment.
+
## Overview of LSM
-An LSM storage engine generally contains 3 parts:
+An LSM storage engine generally contains three parts:
1. Write-ahead log to persist temporary data for recovery.
-2. SSTs on the disk for maintaining a tree structure.
+2. SSTs on the disk to maintain an LSM-tree structure.
3. Mem-tables in memory for batching small writes.
The storage engine generally provides the following interfaces:
@@ -19,24 +31,20 @@ To ensure persistence,
* `Sync()`: ensure all the operations before `sync` are persisted to the disk.
-Some engines choose to combine `Put` and `Delete` into a single operation called `WriteBatch`, which accepts a batch
-of key value pairs.
+Some engines choose to combine `Put` and `Delete` into a single operation called `WriteBatch`, which accepts a batch of key-value pairs.
-In this tutorial, we assume the LSM tree is using leveled compaction algorithm, which is commonly used in real-world
-systems.
+In this tutorial, we assume the LSM tree is using a leveled compaction algorithm, which is commonly used in real-world systems.
### Write Path

-The write path of LSM contains 4 steps:
+The write path of LSM contains four steps:
-1. Write the key-value pair to write-ahead log, so that it can be recovered after the storage engine crashes.
-2. Write the key-value pair to memtable. After (1) and (2) completes, we can notify the user that the write operation
- is completed.
-3. When a memtable is full, we will freeze them into immutable memtables, and will flush them to the disk as SST files in the background.
-4. We will compact some files in some level into lower levels to maintain a good shape for the LSM tree, so that read
- amplification is low.
+1. Write the key-value pair to the write-ahead log so that it can be recovered after the storage engine crashes.
+2. Write the key-value pair to memtable. After (1) and (2) are completed, we can notify the user that the write operation is completed.
+3. (In the background) When a mem-table is full, we will freeze them into immutable mem-tables and flush them to the disk as SST files in the background.
+4. (In the background) The engine will compact some files in some levels into lower levels to maintain a good shape for the LSM tree so that the read amplification is low.
### Read Path
@@ -44,21 +52,9 @@ The write path of LSM contains 4 steps:
When we want to read a key,
-1. We will first probe all the memtables from latest to oldest.
+1. We will first probe all the mem-tables from the latest to the oldest.
2. If the key is not found, we will then search the entire LSM tree containing SSTs to find the data.
There are two types of read: lookup and scan. Lookup finds one key in the LSM tree, while scan iterates all keys within a range in the storage engine. We will cover both of them throughout the tutorial.
-## Tutorial Structure
-
-
-
-We have 3 parts (weeks) for this tutorial. In the first week, we will focus on the storage structure and the storage format of an LSM storage engine. In the second week, we will dive into compactions in depth and implement persistence support for the storage engine. In the third week, we will implement multi-version concurrency control.
-
-* [The First Week: Mini-LSM](./week1-overview.md)
-* [The Second Week: Compaction and Persistence](./week2-overview.md)
-* [The Third Week: Multi-Version Concurrency Control](./week3-overview.md)
-
-To set up the environment, please take a look at [Environment Setup](./00-get-started.md).
-
{{#include copyright.md}}
diff --git a/mini-lsm-book/src/00-preface.md b/mini-lsm-book/src/00-preface.md
index 041caee41..6c8fb1c1d 100644
--- a/mini-lsm-book/src/00-preface.md
+++ b/mini-lsm-book/src/00-preface.md
@@ -1,82 +1,74 @@
# Preface
-
+
-In this tutorial, you will learn how to build a simple LSM-Tree storage engine in the Rust programming language.
+This course teaches you how to build a simple LSM-Tree storage engine in Rust.
## What is LSM, and Why LSM?
-Log-structured merge tree is a data structure to maintain key-value pairs. This data structure is widely used in
+Log-structured merge trees are data structures that maintain key-value pairs. This data structure is widely used in
distributed database systems like [TiDB](https://www.pingcap.com) and [CockroachDB](https://www.cockroachlabs.com) as
their underlying storage engine. [RocksDB](http://rocksdb.org), based on [LevelDB](https://github.com/google/leveldb),
-is an implementation of LSM-Tree storage engine. It provides a wide range of key-value access functionalities and is
-used in a lot of production systems.
+is an implementation of LSM-Tree storage engines. It provides many key-value access functionalities and is
+used in many production systems.
Generally speaking, LSM Tree is an append-friendly data structure. It is more intuitive to compare LSM to other
-key-value data structure like RB-Tree and B-Tree. For RB-Tree and B-Tree, all data operations are in-place. That is to
-say, when you update the value corresponding to the key, the value will be overwritten at its original memory or disk
-space. But in an LSM Tree, all write operations, i.e., insertions, updates, deletions, are performed in somewhere else.
-These operations will be batched into SST (sorted string table) files and be written to the disk. Once written to the
-disk, the file will not be changed. These operations are applied lazily on disk with a special task called compaction.
-The compaction job will merge multiple SST files and remove unused data.
+key-value data structures like RB-Tree and B-Tree. For RB-Tree and B-Tree, all data operations are in place. That is to
+say, when you want to update the value corresponding to the key, the engine will overwrite its original memory or disk
+space with the new value. But in an LSM Tree, all write operations, i.e., insertions, updates, deletions, are lazily applied to the storage.
+The engine batches these operations into SST (sorted string table) files and writes them to the disk. Once written to the
+disk, the engine will not directly modify them. In a particular background task called compaction, the engine will merge these files to apply the updates and deletions.
-This architectural design makes LSM tree easy to work with.
+This architectural design makes LSM trees easy to work with.
-1. Data are immutable on persistent storage, which means that it is easier to offload the background tasks (compaction)
- to remote servers. It is also feasible to directly store and serve data from cloud-native storage systems like S3.
-2. An LSM tree can balance between read, write and space amplification by changing the compaction algorithm. The data
- structure itself is super versatile and can be optimized for different workloads.
+1. Data are immutable on persistent storage. Concurrency control is more straightforward. Offloading the background tasks (compaction) to remote servers is possible. Storing and serving data directly from cloud-native storage systems like S3 is also feasible.
+2. Changing the compaction algorithm allows the storage engine to balance between read, write, and space amplification. The data structure is versatile, and by adjusting the compaction parameters, we can optimize the LSM structure for different workloads.
-In this tutorial, we will learn how to build an LSM-Tree-based storage engine in the Rust programming language.
+This course will teach you how to build an LSM-tree-based storage engine in the Rust programming language.
## Prerequisites
-* You should know the basics of the Rust programming language. Reading [the Rust book](https://doc.rust-lang.org/book/)
- is enough.
-* You should know the basic concepts of key-value storage engines, i.e., why we need somehow complex design to achieve
- persistence. If you have no experience with database systems and storage systems before, you can implement Bitcask
- in [PingCAP Talent Plan](https://github.com/pingcap/talent-plan/tree/master/courses/rust/projects/project-2).
-* Knowing the basics of an LSM tree is not a requirement but we recommend you to read something about it, e.g., the
- overall idea of LevelDB. This would familiarize you with concepts like mutable and immutable mem-tables, SST,
- compaction, WAL, etc.
+* You should know the basics of the Rust programming language. Reading [the Rust book](https://doc.rust-lang.org/book/) is enough.
+* You should know the basic concepts of key-value storage engines, i.e., why we need a complex design to achieve persistence. If you have no experience with database systems and storage systems before, you can implement Bitcask in [PingCAP Talent Plan](https://github.com/pingcap/talent-plan/tree/master/courses/rust/projects/project-2).
+* Knowing the basics of an LSM tree is not a requirement, but we recommend you read something about it, e.g., the overall idea of LevelDB. Knowing them beforehand would familiarize you with concepts like mutable and immutable mem-tables, SST, compaction, WAL, etc.
-## What should you expect from this tutorial...
+## What should you expect from this tutorial
-After learning this course, you should have a deep understanding of how a LSM-based storage system works, gain hands-on experience of designing such systems, and apply what you have learned in your study and career. You will understand the design tradeoffs in such storage systems and find optimal ways to design a LSM-based storage system to meet your workload requirements/goals. This is a very in-depth tutorial that covers all the important implementation details and design choices of modern storage systems (i.e., RocksDB) based on the author's experience in several LSM-like storage systems, and you will be able to directly apply what you have learned in both industry and academia.
+After taking this course, you should deeply understand how an LSM-based storage system works, gain hands-on experience in designing such systems, and apply what you have learned in your study and career. You will understand the design tradeoffs in such storage systems and find optimal ways to design an LSM-based storage system to meet your workload requirements/goals. This very in-depth tutorial covers all the essential implementation details and design choices of modern storage systems (i.e., RocksDB) based on the author's experience in several LSM-like storage systems, and you will be able to directly apply what you have learned in both industry and academia.
### Structure
-The tutorial is a large course that is split into several parts (weeks). Each week usually has seven chapters, and each of the chapter can be finished within 2-3 hours. The first six chapters of each part will instruct you to build a working system, and the last chapter of each week will be a *snack time* chapter that implements some easy things over what you have built in the previous six days. In each chapter, there will be required tasks, *check you understanding* questions, and bonus tasks.
+The tutorial is an extensive course with several parts (weeks). Each week has seven chapters; you can finish each within 2 to 3 hours. The first six chapters of each part will instruct you to build a working system, and the last chapter of each week will be a *snack time* chapter that implements some easy things over what you have built in the previous six days. Each chapter will have required tasks, *check your understanding* questions, and bonus tasks.
### Testing
-We provide full test suite and some cli tools for you to validate if your solution is correct. Note that the test suite is not exhaustive, and your solution might not be 100% correct after passing all test cases. You might need to fix earlier bugs when implementing later parts of the system. We recommend you to think thoroughly about your implementation, especially when there are multi-thread operations and race conditions.
+We provide a full test suite and some CLI tools for you to validate if your solution is correct. Note that the test suite is not exhaustive, and your solution might not be 100% correct after passing all test cases. You might need to fix earlier bugs when implementing later parts of the system. We recommend you think thoroughly about your implementation, especially when there are multi-thread operations and race conditions.
### Solution
We have a solution that implements all the functionalities as required in the tutorial in the mini-lsm main repo. At the same time, we also have a mini-lsm solution checkpoint repo where each commit corresponds to a chapter in the tutorial.
-Keeping such checkpoint repo up-to-date to the mini-lsm tutorial is hard because each bug fix or new feature will need to go through all commits (or checkpoints). Therefore, this repo might not be using the latest starter code or incorporating the latest features from the mini-lsm tutorial.
+Keeping such a checkpoint repo up-to-date with the mini-lsm tutorial is challenging because each bug fix or new feature must go through all commits (or checkpoints). Therefore, this repo might not use the latest starter code or incorporate the latest features from the mini-lsm tutorial.
-**TL;DR: We do not guarantee the solution checkpoint repo contains a correct solution, passes all tests, or has the correct doc comments.** For a correct implementation and the solution after implementing all things, please take a look at the solution in the main repo instead. [https://github.com/skyzh/mini-lsm/tree/main/mini-lsm](https://github.com/skyzh/mini-lsm/tree/main/mini-lsm).
+**TL;DR: We do not guarantee the solution checkpoint repo contains a correct solution, passes all tests, or has the correct doc comments.** For a correct implementation and the solution after implementing everything, please look at the solution in the main repo instead. [https://github.com/skyzh/mini-lsm/tree/main/mini-lsm](https://github.com/skyzh/mini-lsm/tree/main/mini-lsm).
-If you are stuck at some part of the tutorial or do not know where to implement a functionality, you can refer to this repo for help. You may compare the diff between commits to know what has been changed. Some functions in the mini-lsm tutorial might be changed multiple times throughout the chapters, and you can know what exactly are expected to be implemented for each chapter in this repo.
+If you are stuck at some part of the tutorial or need help determining where to implement functionality, you can refer to this repo for help. You may compare the diff between commits to know what has been changed. You might need to modify some functions in the mini-lsm tutorial multiple times throughout the chapters, and you can understand what exactly is expected to be implemented for each chapter in this repo.
You may access the solution checkpoint repo at [https://github.com/skyzh/mini-lsm-solution-checkpoint](https://github.com/skyzh/mini-lsm-solution-checkpoint).
### Feedbacks
-Your feedback is greatly appreciated. We have rewritten the whole course from scratch in 2024 based on the feedbacks from the students. We hope you can share your learning experience and help us continuously improve the tutorial. Welcome to the [Discord community](https://skyzh.dev/join/discord) and share your experience.
+Your feedback is greatly appreciated. We have rewritten the whole course from scratch in 2024 based on the feedback from the students. Please share your learning experience and help us continuously improve the tutorial. Welcome to the [Discord community](https://skyzh.dev/join/discord) and share your experience.
-The long story of why we rewrote it: The tutorial was originally planned as a general guidance that students start from an empty directory and implement whatever they want based on the specification we had. We had minimal tests that checks if the behavior is correct. However, the original tutorial is too open-ended that caused huge obstacles with the learning experience. As students do not have an overview of the whole system beforehand and the instructions are kind of vague, sometimes it is hard for the students to know why a design decision is made and what they need to achieve a goal. And some part of the course is too compact that it is impossible to deliver expected contents within just one chapter. Therefore, we completely redesigned the course to have a easier learning curve and clearer learning goals. The original one-week tutorial is now split into two weeks (first week on storage format, and second week on deep-dive compaction), with an extra part on MVCC. We hope you find this course interesting and helpful in your study and career. We would like to thank everyone who commented in [Feedback after coding day 1](https://github.com/skyzh/mini-lsm/issues/11) and [Hello, when is the next update plan for the tutorial?](https://github.com/skyzh/mini-lsm/issues/7) -- your feedback greatly helped us improve the course.
+The long story of why we rewrote it: The tutorial was originally planned as a general guidance that students start from an empty directory and implement whatever they want based on the specifications we had. We had minimal tests that checked if the behavior was correct. However, the original tutorial was too open-ended, which caused huge obstacles to the learning experience. As students do not have an overview of the whole system beforehand and the instructions are vague, sometimes it is hard for them to know why a design decision is made and what they need to achieve a goal. Some parts of the course were so compact that delivering the expected contents within just one chapter was impossible. Therefore, we completely redesigned the course for an easier learning curve and clearer learning goals. The original one-week tutorial is now split into two weeks (the first week on storage format and the second week on deep-dive compaction), with an extra part on MVCC. We hope you find this course interesting and helpful in your study and career. We want to thank everyone who commented in [Feedback after coding day 1](https://github.com/skyzh/mini-lsm/issues/11) and [Hello, when is the next update plan for the tutorial?](https://github.com/skyzh/mini-lsm/issues/7) -- Your feedback greatly helped us improve the course.
### License
-The source code of this course is licensed under Apache 2.0, while the author owns the full copyright of the tutorial itself (markdown files + figures).
+The source code of this course is licensed under Apache 2.0, while the author owns the complete copyright of the tutorial itself (markdown files + figures).
### Will this tutorial be free forever?
-Yes! Everything publicly available now will be free forever and will receive lifetime updates and bug fixes. Meanwhile, we might provide paid code review and office hour services in the future. For the DLC part (*rest of your life* chapters), we do not have plans to finish them as of 2024, and have not decided whether they will be public available or not.
+Yes! Everything publicly available now will be free forever and receive lifetime updates and bug fixes. Meanwhile, we might provide paid code review and office hour services. For the DLC part (*rest of your life* chapters), we do not have plans to finish them as of 2024 and have yet to decide whether they will be publicly available.
## Community
@@ -86,11 +78,11 @@ You may join skyzh's Discord server and study with the mini-lsm community.
## Get Started
-Now, you may go ahead and get an overview of the LSM structure in [Mini-LSM Course Overview](./00-overview.md).
+Now, you can get an overview of the LSM structure in [Mini-LSM Course Overview](./00-overview.md).
## About the Author
-As of writing (at the beginning of 2024), Chi obtained his master's degree in Computer Science from Carnegie Mellon University and his bachelor's degree from Shanghai Jiao Tong University. He has been working on a variety of database systems including [TiKV][db1], [AgateDB][db2], [TerarkDB][db3], [RisingWave][db4], and [Neon][db5]. Since 2022, he worked as a teaching assistant for [CMU's Database Systems course](https://15445.courses.cs.cmu) for three semesters on the BusTub educational system, where he added a lot of new features and more challenges to the course (check out the re-designed [query execution](https://15445.courses.cs.cmu.edu/fall2022/project3/) project and the super challenging [multi-version concurrency control](https://15445.courses.cs.cmu.edu/fall2023/project4/) project). Besides working on the BusTub educational system, he is also a maintainer of the [RisingLight](https://github.com/risinglightdb/risinglight) educational database system. Chi is interested in exploring how the Rust programming language can fit in the database world. Check out his previous tutorial on building a vectorized expression framework [type-exercise-in-rust](https://github.com/skyzh/type-exercise-in-rust) and on building a vector database [write-you-a-vector-db](https://github.com/skyzh/write-you-a-vector-db) if you are also interested in that topic.
+As of writing (at the beginning of 2024), Chi obtained his master's degree in Computer Science from Carnegie Mellon University and his bachelor's degree from Shanghai Jiao Tong University. He has been working on a variety of database systems, including [TiKV][db1], [AgateDB][db2], [TerarkDB][db3], [RisingWave][db4], and [Neon][db5]. Since 2022, he has worked as a teaching assistant for [CMU's Database Systems course](https://15445.courses.cs.cmu) for three semesters on the BusTub educational system, where he added a lot of new features and more challenges to the course (check out the redesigned [query execution](https://15445.courses.cs.cmu.edu/fall2022/project3/) project and the super challenging [multi-version concurrency control](https://15445.courses.cs.cmu.edu/fall2023/project4/) project). Besides working on the BusTub educational system, he also maintains the [RisingLight](https://github.com/risinglightdb/risinglight) educational database system. Chi is interested in exploring how the Rust programming language can fit into the database world. Check out his previous tutorial on building a vectorized expression framework [type-exercise-in-rust](https://github.com/skyzh/type-exercise-in-rust) and on building a vector database [write-you-a-vector-db](https://github.com/skyzh/write-you-a-vector-db) if you are also interested in that topic.
[db1]: https://github.com/tikv/tikv
[db2]: https://github.com/tikv/agatedb
diff --git a/mini-lsm-book/src/SUMMARY.md b/mini-lsm-book/src/SUMMARY.md
index 38d8eeab5..ed66d7d8e 100644
--- a/mini-lsm-book/src/SUMMARY.md
+++ b/mini-lsm-book/src/SUMMARY.md
@@ -27,9 +27,9 @@
- [Snapshots - Memtables and Timestamps](./week3-02-snapshot-read-part-1.md)
- [Snapshots - Transaction API](./week3-03-snapshot-read-part-2.md)
- [Watermark and GC](./week3-04-watermark.md)
- - [Transaction and OCC (WIP)](./week3-05-txn-occ.md)
- - [Serializable Snapshot Isolation (WIP)](./week3-06-serializable.md)
- - [Snack Time: Compaction Filter (WIP)](./week3-07-compaction-filter.md)
+ - [Transaction and OCC](./week3-05-txn-occ.md)
+ - [Serializable Snapshot Isolation](./week3-06-serializable.md)
+ - [Snack Time: Compaction Filters](./week3-07-compaction-filter.md)
- [The Rest of Your Life (TBD)](./week4-overview.md)
---
diff --git a/mini-lsm-book/src/mini-lsm-logo.png b/mini-lsm-book/src/mini-lsm-logo.png
new file mode 100644
index 000000000..ab6459804
Binary files /dev/null and b/mini-lsm-book/src/mini-lsm-logo.png differ
diff --git a/mini-lsm-book/src/sitemap.txt b/mini-lsm-book/src/sitemap.txt
index 815caec44..a3ef27d52 100644
--- a/mini-lsm-book/src/sitemap.txt
+++ b/mini-lsm-book/src/sitemap.txt
@@ -28,5 +28,12 @@ https://skyzh.github.io/mini-lsm/week2-05-manifest
https://skyzh.github.io/mini-lsm/week2-06-wal
https://skyzh.github.io/mini-lsm/week2-07-snacks
https://skyzh.github.io/mini-lsm/week2-overview
+https://skyzh.github.io/mini-lsm/week3-01-ts-key-refactor
+https://skyzh.github.io/mini-lsm/week3-02-snapshot-read-part-1
+https://skyzh.github.io/mini-lsm/week3-03-snapshot-read-part-2
+https://skyzh.github.io/mini-lsm/week3-04-watermark
+https://skyzh.github.io/mini-lsm/week3-05-txn-occ
+https://skyzh.github.io/mini-lsm/week3-06-serializable
+https://skyzh.github.io/mini-lsm/week3-07-compaction-filter
https://skyzh.github.io/mini-lsm/week3-overview
https://skyzh.github.io/mini-lsm/week4-overview
diff --git a/mini-lsm-book/src/sitemap.xml b/mini-lsm-book/src/sitemap.xml
index 3200ba1cd..7d84a32b7 100644
--- a/mini-lsm-book/src/sitemap.xml
+++ b/mini-lsm-book/src/sitemap.xml
@@ -2,130 +2,158 @@
https://skyzh.github.io/mini-lsm
- 2024-01-25T02:56:28.231Z
+ 2024-01-30T08:17:56.731Zhttps://skyzh.github.io/mini-lsm/00-get-started
- 2024-01-25T02:56:28.234Z
+ 2024-01-30T08:17:56.734Zhttps://skyzh.github.io/mini-lsm/00-overview
- 2024-01-25T02:56:28.232Z
+ 2024-01-30T08:17:56.732Zhttps://skyzh.github.io/mini-lsm/00-preface
- 2024-01-25T02:56:28.230Z
+ 2024-01-30T08:17:56.730Zhttps://skyzh.github.io/mini-lsm/00-v1
- 2024-01-25T02:56:28.256Z
+ 2024-01-30T08:17:56.762Zhttps://skyzh.github.io/mini-lsm/01-block
- 2024-01-25T02:56:28.257Z
+ 2024-01-30T08:17:56.763Zhttps://skyzh.github.io/mini-lsm/02-sst
- 2024-01-25T02:56:28.258Z
+ 2024-01-30T08:17:56.764Zhttps://skyzh.github.io/mini-lsm/03-memtable
- 2024-01-25T02:56:28.259Z
+ 2024-01-30T08:17:56.765Zhttps://skyzh.github.io/mini-lsm/04-engine
- 2024-01-25T02:56:28.260Z
+ 2024-01-30T08:17:56.766Zhttps://skyzh.github.io/mini-lsm/05-compaction
- 2024-01-25T02:56:28.261Z
+ 2024-01-30T08:17:56.767Zhttps://skyzh.github.io/mini-lsm/06-recovery
- 2024-01-25T02:56:28.262Z
+ 2024-01-30T08:17:56.768Zhttps://skyzh.github.io/mini-lsm/07-bloom-filter
- 2024-01-25T02:56:28.263Z
+ 2024-01-30T08:17:56.768Zhttps://skyzh.github.io/mini-lsm/08-key-compression
- 2024-01-25T02:56:28.264Z
+ 2024-01-30T08:17:56.769Zhttps://skyzh.github.io/mini-lsm/09-whats-next
- 2024-01-25T02:56:28.265Z
+ 2024-01-30T08:17:56.770Zhttps://skyzh.github.io/mini-lsm/week1-01-memtable
- 2024-01-25T02:56:28.237Z
+ 2024-01-30T08:17:56.736Zhttps://skyzh.github.io/mini-lsm/week1-02-merge-iterator
- 2024-01-25T02:56:28.238Z
+ 2024-01-30T08:17:56.738Zhttps://skyzh.github.io/mini-lsm/week1-03-block
- 2024-01-25T02:56:28.239Z
+ 2024-01-30T08:17:56.739Zhttps://skyzh.github.io/mini-lsm/week1-04-sst
- 2024-01-25T02:56:28.240Z
+ 2024-01-30T08:17:56.740Zhttps://skyzh.github.io/mini-lsm/week1-05-read-path
- 2024-01-25T02:56:28.242Z
+ 2024-01-30T08:17:56.741Zhttps://skyzh.github.io/mini-lsm/week1-06-write-path
- 2024-01-25T02:56:28.243Z
+ 2024-01-30T08:17:56.743Zhttps://skyzh.github.io/mini-lsm/week1-07-sst-optimizations
- 2024-01-25T02:56:28.244Z
+ 2024-01-30T08:17:56.744Zhttps://skyzh.github.io/mini-lsm/week1-overview
- 2024-01-25T02:56:28.235Z
+ 2024-01-30T08:17:56.735Zhttps://skyzh.github.io/mini-lsm/week2-01-compaction
- 2024-01-25T02:56:28.246Z
+ 2024-01-30T08:17:56.746Zhttps://skyzh.github.io/mini-lsm/week2-02-simple
- 2024-01-25T02:56:28.248Z
+ 2024-01-30T08:17:56.747Zhttps://skyzh.github.io/mini-lsm/week2-03-tiered
- 2024-01-25T02:56:28.249Z
+ 2024-01-30T08:17:56.748Zhttps://skyzh.github.io/mini-lsm/week2-04-leveled
- 2024-01-25T02:56:28.250Z
+ 2024-01-30T08:17:56.749Zhttps://skyzh.github.io/mini-lsm/week2-05-manifest
- 2024-01-25T02:56:28.251Z
+ 2024-01-30T08:17:56.750Zhttps://skyzh.github.io/mini-lsm/week2-06-wal
- 2024-01-25T02:56:28.252Z
+ 2024-01-30T08:17:56.751Zhttps://skyzh.github.io/mini-lsm/week2-07-snacks
- 2024-01-25T02:56:28.253Z
+ 2024-01-30T08:17:56.752Zhttps://skyzh.github.io/mini-lsm/week2-overview
- 2024-01-25T02:56:28.245Z
+ 2024-01-30T08:17:56.745Z
+
+
+ https://skyzh.github.io/mini-lsm/week3-01-ts-key-refactor
+ 2024-01-30T08:17:56.754Z
+
+
+ https://skyzh.github.io/mini-lsm/week3-02-snapshot-read-part-1
+ 2024-01-30T08:17:56.755Z
+
+
+ https://skyzh.github.io/mini-lsm/week3-03-snapshot-read-part-2
+ 2024-01-30T08:17:56.756Z
+
+
+ https://skyzh.github.io/mini-lsm/week3-04-watermark
+ 2024-01-30T08:17:56.757Z
+
+
+ https://skyzh.github.io/mini-lsm/week3-05-txn-occ
+ 2024-01-30T08:17:56.758Z
+
+
+ https://skyzh.github.io/mini-lsm/week3-06-serializable
+ 2024-01-30T08:17:56.759Z
+
+
+ https://skyzh.github.io/mini-lsm/week3-07-compaction-filter
+ 2024-01-30T08:17:56.760Zhttps://skyzh.github.io/mini-lsm/week3-overview
- 2024-01-25T02:56:28.254Z
+ 2024-01-30T08:17:56.753Zhttps://skyzh.github.io/mini-lsm/week4-overview
- 2024-01-25T02:56:28.255Z
+ 2024-01-30T08:17:56.761Z
diff --git a/mini-lsm-book/src/week1-01-memtable.md b/mini-lsm-book/src/week1-01-memtable.md
index 983d22499..39fcfdb45 100644
--- a/mini-lsm-book/src/week1-01-memtable.md
+++ b/mini-lsm-book/src/week1-01-memtable.md
@@ -23,7 +23,7 @@ In this task, you will need to modify:
src/mem_table.rs
```
-Firstly, let us implement the in-memory structure of an LSM storage engine -- the memtable. We choose [crossbeam's skiplist implementation](link) as the data structure of the memtable as it supports lock-free concurrent read and write. We will not cover in-depth how a skiplist works, and in a nutshell, it is an ordered key-value map that easily allows concurrent read and write.
+Firstly, let us implement the in-memory structure of an LSM storage engine -- the memtable. We choose [crossbeam's skiplist implementation](https://docs.rs/crossbeam-skiplist/latest/crossbeam_skiplist/) as the data structure of the memtable as it supports lock-free concurrent read and write. We will not cover in-depth how a skiplist works, and in a nutshell, it is an ordered key-value map that easily allows concurrent read and write.
crossbeam-skiplist provides similar interfaces to the Rust std's `BTreeMap`: insert, get, and iter. The only difference is that the modification interfaces (i.e., `insert`) only require an immutable reference to the skiplist, instead of a mutable one. Therefore, in your implementation, you should not take any mutex when implementing the memtable structure.
@@ -39,7 +39,6 @@ In this task, you will need to modify:
```
src/lsm_storage.rs
-src/mem_table.rs
```
Now, we will add our first data structure, the memtable, to the LSM state. In `LsmStorageState::create`, you will find that when a LSM structure is created, we will initialize a memtable of id 0. This is the **mutable memtable** in the initial state. At any point of the time, the engine will have only one single mutable memtable. A memtable usually has a size limit (i.e., 256MB), and it will be frozen to an immutable memtable when it reaches the size limit.
@@ -67,7 +66,7 @@ src/mem_table.rs
A memtable cannot continuously grow in size, and we will need to freeze them (and later flush to the disk) when it reaches the size limit. You may find the memtable size limit, which is **equal to the SST size limit** (not `num_memtables_limit`), in the `LsmStorageOptions`. This is not a hard limit and you should freeze the memtable at best effort.
-In this task, you will need to compute the approximate memtable size when put/delete a key in the memtable. This can be computed by simply adding the total number of bytes of keys and values when `put` is called. Is a key is put twice, though the skiplist only contains the latest value, you may count it twice in the approximate memtable size. Once a memtable reaches the limit, you should call `force_freeze_memtable` to freeze the memtable and create a new one.
+In this task, you will need to compute the approximate memtable size when put/delete a key in the memtable. This can be computed by simply adding the total number of bytes of keys and values when `put` is called. If a key is put twice, though the skiplist only contains the latest value, you may count it twice in the approximate memtable size. Once a memtable reaches the limit, you should call `force_freeze_memtable` to freeze the memtable and create a new one.
Because there could be multiple threads getting data into the storage engine, `force_freeze_memtable` might be called concurrently from multiple threads. You will need to think about how to avoid race conditions in this case.
diff --git a/mini-lsm-book/src/week1-02-merge-iterator.md b/mini-lsm-book/src/week1-02-merge-iterator.md
index d10bb6a93..df6896512 100644
--- a/mini-lsm-book/src/week1-02-merge-iterator.md
+++ b/mini-lsm-book/src/week1-02-merge-iterator.md
@@ -103,7 +103,7 @@ Starting this section, we will use `Key` to represent LSM key types and disti
In this task, you will need to modify:
```
-src/iterators/lsm_iterator.rs
+src/lsm_iterator.rs
```
We use the `LsmIterator` structure to represent the internal LSM iterators. You will need to modify this structure multiple times throughout the tutorial when more iterators are added into the system. For now, because we only have multiple memtables, it should be defined as:
@@ -123,7 +123,7 @@ Then, we want to provide extra safety on the iterator to avoid users from misusi
In this task, you will need to modify:
```
-src/iterators/lsm_storage.rs
+src/lsm_storage.rs
```
We are finally there -- with all iterators you have implemented, you can finally implement the `scan` interface of the LSM engine. You can simply construct an LSM iterator with the memtable iterators (remember to put the latest memtable at the front of the merge iterator), and your storage engine will be able to handle the scan request.
diff --git a/mini-lsm-book/src/week1-03-block.md b/mini-lsm-book/src/week1-03-block.md
index 087e66c5c..9a90f1db9 100644
--- a/mini-lsm-book/src/week1-03-block.md
+++ b/mini-lsm-book/src/week1-03-block.md
@@ -77,7 +77,7 @@ In this task, you will need to modify:
src/block/iterator.rs
```
-Now that we have an encoded block, we will need to implement the `StorageIterator` interface, so that the user can lookup/scan keys in the block.
+Now that we have an encoded block, we will need to implement the `BlockIterator` interface, so that the user can lookup/scan keys in the block.
`BlockIterator` can be created with an `Arc`. If `create_and_seek_to_first` is called, it will be positioned at the first key in the block. If `create_and_seek_to_key` is called, the iterator will be positioned at the first key that is `>=` the provided key. For example, if `1, 3, 5` is in a block.
diff --git a/mini-lsm-book/src/week1-05-read-path.md b/mini-lsm-book/src/week1-05-read-path.md
index 3c9c6ccc6..dc5d71907 100644
--- a/mini-lsm-book/src/week1-05-read-path.md
+++ b/mini-lsm-book/src/week1-05-read-path.md
@@ -84,5 +84,6 @@ We do not provide reference answers to the questions, and feel free to discuss a
## Bonus Tasks
* **The Cost of Dynamic Dispatch.** Implement a `Box` version of merge iterators and benchmark to see the performance differences.
+* **Parallel Seek.** Creating a merge iterator requires loading the first block of all underlying SSTs (when you create `SSTIterator`). You may parallelize the process of creating iterators.
{{#include copyright.md}}
diff --git a/mini-lsm-book/src/week1-07-sst-optimizations.md b/mini-lsm-book/src/week1-07-sst-optimizations.md
index 6bab55de4..7abdd797c 100644
--- a/mini-lsm-book/src/week1-07-sst-optimizations.md
+++ b/mini-lsm-book/src/week1-07-sst-optimizations.md
@@ -23,9 +23,11 @@ Bloom filters are probabilistic data structures that maintains a set of keys. Yo
You usually need to have a hash function in order to construct a bloom filter, and a key can have multiple hashes. Let us take a look at the below example. Assume that we already have hashes of some keys and the bloom filter has 7 bits.
+[Note: If you want to understand bloom filters better, look [here](https://samwho.dev/bloom-filters/)]
+
```plaintext
hash1 = ((character - a) * 13) % 7
-hash1 = ((character - a) * 11) % 7
+hash2 = ((character - a) * 11) % 7
b -> 6 4
c -> 5 1
d -> 4 5
@@ -34,7 +36,7 @@ g -> 1 3
h -> 0 0
```
-If we insert b, c, d into the 6-bit bloom filter, we will get:
+If we insert b, c, d into the 7-bit bloom filter, we will get:
```
bit 0123456
diff --git a/mini-lsm-book/src/week2-01-compaction.md b/mini-lsm-book/src/week2-01-compaction.md
index 74ed2a48c..c8bc8d696 100644
--- a/mini-lsm-book/src/week2-01-compaction.md
+++ b/mini-lsm-book/src/week2-01-compaction.md
@@ -15,6 +15,12 @@ cargo x copy-test --week 2 --day 1
cargo x scheck
```
+
+
+It might be helpful to take a look at [week 2 overview](./week2-overview.md) before reading this chapter to have a general overview of compactions.
+
+
+
## Task 1: Compaction Implementation
In this task, you will implement the core logic of doing a compaction -- merge sort a set of SST files into a sorted run. You will need to modify:
diff --git a/mini-lsm-book/src/week2-02-simple.md b/mini-lsm-book/src/week2-02-simple.md
index 1921259e1..dbf877040 100644
--- a/mini-lsm-book/src/week2-02-simple.md
+++ b/mini-lsm-book/src/week2-02-simple.md
@@ -14,6 +14,12 @@ cargo x copy-test --week 2 --day 2
cargo x scheck
```
+
+
+It might be helpful to take a look at [week 2 overview](./week2-overview.md) before reading this chapter to have a general overview of compactions.
+
+
+
## Task 1: Simple Leveled Compaction
In this chapter, we are going to implement our first compaction strategy -- simple leveled compaction. In this task, you will need to modify:
diff --git a/mini-lsm-book/src/week2-03-tiered.md b/mini-lsm-book/src/week2-03-tiered.md
index 68f506f90..b5b619e12 100644
--- a/mini-lsm-book/src/week2-03-tiered.md
+++ b/mini-lsm-book/src/week2-03-tiered.md
@@ -16,6 +16,12 @@ cargo x copy-test --week 2 --day 3
cargo x scheck
```
+
+
+It might be helpful to take a look at [week 2 overview](./week2-overview.md) before reading this chapter to have a general overview of compactions.
+
+
+
## Task 1: Universal Compaction
In this chapter, you will implement RocksDB's universal compaction, which is of the tiered compaction family compaction strategies. Similar to the simple leveled compaction strategy, we only use number of files as the indicator in this compaction strategy. And when we trigger the compaction jobs, we always include a full sorted run (tier) in the compaction job.
@@ -28,7 +34,7 @@ In this task, you will need to modify:
src/compact/tiered.rs
```
-In universal compaction, we do not use L0 SSTs in the LSM state. Instead, we directly flush new SSTs to a single sorted run (called tier). In the LSM state, `levels` will now include all tiers, where the lowest index is the latest SST flushed. The compaction simulator generates tier id based on the first SST id, and you should do the same in your implementation.
+In universal compaction, we do not use L0 SSTs in the LSM state. Instead, we directly flush new SSTs to a single sorted run (called tier). In the LSM state, `levels` will now include all tiers, where **the lowest index is the latest SST flushed**. Each element in the `levels` vector stores a tuple: level ID (used as tier ID) and the SSTs in that level. Every time you flush L0 SSTs, you should flush the SST into a tier placed at the front of the vector. The compaction simulator generates tier id based on the first SST id, and you should do the same in your implementation.
Universal compaction will only trigger tasks when the number of tiers (sorted runs) is larger than `num_tiers`. Otherwise, it does not trigger any compaction.
@@ -130,6 +136,7 @@ As tiered compaction does not use the L0 level of the LSM state, you should dire
* What happens if compaction speed cannot keep up with the SST flushes?
* What might needs to be considered if the system schedules multiple compaction tasks in parallel?
* SSDs also write its own logs (basically it is a log-structured storage). If the SSD has a write amplification of 2x, what is the end-to-end write amplification of the whole system? Related: [ZNS: Avoiding the Block Interface Tax for Flash-based SSDs](https://www.usenix.org/conference/atc21/presentation/bjorling).
+* Consider the case that the user chooses to keep a large number of sorted runs (i.e., 300) for tiered compaction. To make the read path faster, is it a good idea to keep some data structure that helps reduce the time complexity (i.e., to `O(log n)`) of finding SSTs to read in each layer for some key ranges? Note that normally, you will need to do a binary search in each sorted run to find the key ranges that you will need to read. (Check out Neon's [layer map](https://neon.tech/blog/persistent-structures-in-neons-wal-indexing) implementation!)
We do not provide reference answers to the questions, and feel free to discuss about them in the Discord community.
diff --git a/mini-lsm-book/src/week2-04-leveled.md b/mini-lsm-book/src/week2-04-leveled.md
index cbf629bae..371041591 100644
--- a/mini-lsm-book/src/week2-04-leveled.md
+++ b/mini-lsm-book/src/week2-04-leveled.md
@@ -14,6 +14,12 @@ cargo x copy-test --week 2 --day 4
cargo x scheck
```
+
+
+It might be helpful to take a look at [week 2 overview](./week2-overview.md) before reading this chapter to have a general overview of compactions.
+
+
+
## Task 1: Leveled Compaction
In chapter 2 day 2, you have implemented the simple leveled compaction strategies. However, the implementation has a few problems:
@@ -43,21 +49,25 @@ You will need to compute the target sizes of the levels. Assume `base_level_size
[0 0 0 0 0 200MB]
```
-When the levels grow in size as more SSTs get compacted to that level, we will compute the target size based on the size of the last level. When the actual size of SST files in the last level reaches 200MB, for example, 300MB, we will compute the target size of the other levels by dividing the `level_size_multiplier`. Assume `level_size_multiplier=10`.
+Before the bottom level exceeds `base_level_size_mb`, all other intermediate levels will have target sizes of 0. The idea is that when the total amount of data is small, it's wasteful to create intermediate levels.
+
+When the bottom level reaches or exceeds `base_level_size_mb`, we will compute the target size of the other levels by dividing the size by `level_size_multiplier`. Assume the bottom level contains 300MB of data, and `level_size_multiplier=10`.
```
0 0 0 0 30MB 300MB
```
-We will only keep at most *one* level below `base_level_size_mb`, and in this case, it is L5. Assume we now have 30GB files in the last level, the target sizes will be,
+In addition, at most *one* level can have a positive target size below `base_level_size_mb`. Assume we now have 30GB files in the last level, the target sizes will be,
```
0 0 30MB 300MB 3GB 30GB
```
+Notice in this case L1 and L2 have target size of 0, and L3 is the only level with a positive target size below `base_level_size_mb`.
+
### Task 1.2: Decide Base Level
-Now, let us solve the problem that SSTs may be compacted across empty levels in the simple leveled compaction strategy. When we compact L0 SSTs with lower levels, we do not directly put it to L1. Instead, we compact it with the first level with `target size > 0``. For example, when the target level sizes are:
+Now, let us solve the problem that SSTs may be compacted across empty levels in the simple leveled compaction strategy. When we compact L0 SSTs with lower levels, we do not directly put it to L1. Instead, we compact it with the first level with `target size > 0`. For example, when the target level sizes are:
```
0 0 0 0 30MB 300MB
@@ -89,7 +99,7 @@ The number of levels in the compaction simulator is 4. Therefore, the SSTs shoul
### Task 1.3: Decide Level Priorities
-Now that we will need to handle compactions below L0. L0 compaction always has the top priority, that you should compact L0 with other levels first if it reaches the threshold. After that, we can compute the compaction priorities of each level by `current_size / target_size`. We only compact levels with this ratio `> 1.0` The one with the largest ratio will be chosen for compaction with the lower level. For example, if we have:
+Now that we will need to handle compactions below L0. L0 compaction always has the top priority, thus you should compact L0 with other levels first if it reaches the threshold. After that, we can compute the compaction priorities of each level by `current_size / target_size`. We only compact levels with this ratio `> 1.0`. The one with the largest ratio will be chosen for compaction with the lower level. For example, if we have:
```
L3: 200MB, target_size=20MB
diff --git a/mini-lsm-book/src/week2-05-manifest.md b/mini-lsm-book/src/week2-05-manifest.md
index c4d898c06..2361ed0ac 100644
--- a/mini-lsm-book/src/week2-05-manifest.md
+++ b/mini-lsm-book/src/week2-05-manifest.md
@@ -7,6 +7,13 @@ In this chapter, you will:
* Implement encoding and decoding of the manifest file.
* Recover from the manifest when the system restarts.
+To copy the test cases into the starter code and run them,
+
+```
+cargo x copy-test --week 2 --day 5
+cargo x scheck
+```
+
## Task 1: Manifest Encoding
The system uses a manifest file to record all operations happened in the engine. Currently, there are only two types of them: compaction and SST flush. When the engine restarts, it will read the manifest file, reconstruct the state, and load the SST files on the disk.
@@ -87,5 +94,6 @@ get 1500
## Bonus Tasks
* **Manifest Compaction.** When the number of logs in the manifest file gets too large, you can rewrite the manifest file to only store the current snapshot and append new logs to that file.
+* **Parallel Open.** After you collect the list of SSTs to open, you can open and decode them in parallel, instead of doing it one by one, therefore accelerating the recovery process.
{{#include copyright.md}}
diff --git a/mini-lsm-book/src/week2-06-wal.md b/mini-lsm-book/src/week2-06-wal.md
index fb4ebc998..b9500f970 100644
--- a/mini-lsm-book/src/week2-06-wal.md
+++ b/mini-lsm-book/src/week2-06-wal.md
@@ -7,6 +7,13 @@ In this chapter, you will:
* Implement encoding and decoding of the write-ahead log file.
* Recover memtables from the WALs when the system restarts.
+To copy the test cases into the starter code and run them,
+
+```
+cargo x copy-test --week 2 --day 6
+cargo x scheck
+```
+
## Task 1: WAL Encoding
In this task, you will need to modify:
diff --git a/mini-lsm-book/src/week3-04-watermark.md b/mini-lsm-book/src/week3-04-watermark.md
index 75fc879cc..cc95c615f 100644
--- a/mini-lsm-book/src/week3-04-watermark.md
+++ b/mini-lsm-book/src/week3-04-watermark.md
@@ -74,6 +74,8 @@ Assume these are all keys in the engine. If we do a scan at ts=3, we will get `a
* In our implementation, we manage watermarks by ourselves with the lifecycle of `Transaction` (so-called un-managed mode). If the user intends to manage key timestamps and the watermarks by themselves (i.e., when they have their own timestamp generator), what do you need to do in the write_batch/get/scan API to validate their requests? Is there any architectural assumption we had that might be hard to maintain in this case?
* Why do we need to store an `Arc` of `Transaction` inside a transaction iterator?
+* What is the condition to fully remove a key from the SST file?
+* For now, we only remove a key when compacting to the bottom-most level. Is there any other prior time that we can remove the key? (Hint: you know the start/end key of each SST in all levels.)
## Bonus Tasks
diff --git a/mini-lsm-book/src/week3-06-serializable.md b/mini-lsm-book/src/week3-06-serializable.md
index 1aa1bfe82..232642364 100644
--- a/mini-lsm-book/src/week3-06-serializable.md
+++ b/mini-lsm-book/src/week3-06-serializable.md
@@ -102,11 +102,21 @@ You can skip the check if `write_set` is empty. A read-only transaction can alwa
You should also modify the `put`, `delete`, and `write_batch` interface in `LsmStorageInner`. We recommend you define a helper function `write_batch_inner` that processes a write batch. If `options.serializable = true`, `put`, `delete`, and the user-facing `write_batch` should create a transaction instead of directly creating a write batch. Your write batch helper function should also return a `u64` commit timestamp so that `Transaction::Commit` can correctly store the committed transaction data into the MVCC structure.
+## Task 4: Garbage Collection
+
+In this task, you will need to modify:
+
+```
+src/mvcc/txn.rs
+```
+
+When you commit a transaction, you can also clean up the committed txn map to remove all transactions below the watermark, as they will not be involved in any future serializable validations.
+
## Test Your Understanding
* If you have some experience with building a relational database, you may think about the following question: assume that we build a database based on Mini-LSM where we store each row in the relation table as a key-value pair (key: primary key, value: serialized row) and enable serializable verification, does the database system directly gain ANSI serializable isolation level capability? Why or why not?
* The thing we implement here is actually write snapshot-isolation (see [A critique of snapshot isolation](https://dl.acm.org/doi/abs/10.1145/2168836.2168853)) that guarantees serializable. Is there any cases where the execution is serializable, but will be rejected by the write snapshot-isolation validation?
-* There are databases that claim they have serializable snapshot isolation support by only tracking the keys accessed in gets and scans. Do they really prevent write skews caused by phantoms? (Okay... Actually, I'm talking about [BadgerDB](https://dgraph.io/blog/post/badger-txn/).)
+* There are databases that claim they have serializable snapshot isolation support by only tracking the keys accessed in gets and scans (instead of key range). Do they really prevent write skews caused by phantoms? (Okay... Actually, I'm talking about [BadgerDB](https://dgraph.io/blog/post/badger-txn/).)
We do not provide reference answers to the questions, and feel free to discuss about them in the Discord community.
diff --git a/mini-lsm-book/src/week3-07-compaction-filter.md b/mini-lsm-book/src/week3-07-compaction-filter.md
index 82213a63b..a9a5eea0f 100644
--- a/mini-lsm-book/src/week3-07-compaction-filter.md
+++ b/mini-lsm-book/src/week3-07-compaction-filter.md
@@ -1,4 +1,46 @@
-# Snack Time: Compaction Filter
+# Snack Time: Compaction Filters
+Congratulations! You made it here! In the previous chapter, you made your LSM engine multi-version capable, and the users can use transaction APIs to interact with your storage engine. At the end of this week, we will implement some easy but important features of the storage engine. Welcome to Mini-LSM's week 3 snack time!
+
+In this chapter, we will generalize our compaction garbage collection logic to become compaction filters.
+
+For now, our compaction will simply retain the keys above the watermark and the latest version of the keys below the watermark. We can add some magic to the compaction process to help the user collect some unused data automatically as a background job.
+
+Consider a case where the user uses Mini-LSM to store database tables. Each row in the table is prefixed with the table name. For example,
+
+```
+table1_key1 -> row
+table1_key2 -> row
+table1_key3 -> row
+table2_key1 -> row
+table2_key2 -> row
+```
+
+Now the user executes `DROP TABLE table1`. The engine will need to clean up all the data beginning with `table1`.
+
+There are a lot of ways to achieve the goal. The user of Mini-LSM can scan all the keys beginning with `table1` and request the engine to delete them. However, scanning a very large database might be slow, and it will generate the same number of delete tombstones as the existing keys. Therefore, scan-and-delete will not free up the space occupied by the dropped table -- instead, it will add more data to the engine and the space can only be reclaimed when the tombstones reach the bottom level of the engine.
+
+Or, they can create column families (we will talk about this in the *rest of your life* chapter). They store each table in a column family, which is a standalone LSM state, and directly remove the SST files corresponding to the column family when the user drops the table.
+
+In this tutorial, we will implement the third approach: compaction filters. Compaction filters can be dynamically added to the engine at runtime. During the compaction, if a key matching the compaction filter is found, we can silently remove it in the background. Therefore, the user can attach a compaction filter of `prefix=table1` to the engine, and all these keys will be removed during compaction.
+
+## Task 1: Compaction Filter
+
+In this task, you will need to modify:
+
+```
+src/compact.rs
+```
+
+You can iterate all compaction filters in `LsmStorageInner::compaction_filters`. If the first version of the key below watermark matches the compaction filter, simply remove it instead of keeping it in the SST file.
+
+To run test cases,
+
+```
+cargo x copy-test --week 3 --day 7
+cargo x scheck
+```
+
+You can assume that the user will not get the keys within the prefix filter range. And, they will not scan the keys in the prefix range. Therefore, it is okay to return a wrong value when a user requests the keys in the prefix filter range (i.e., undefined behavior).
{{#include copyright.md}}
diff --git a/mini-lsm-book/src/week3-overview.md b/mini-lsm-book/src/week3-overview.md
index 0bbfac5c6..500c4f15b 100644
--- a/mini-lsm-book/src/week3-overview.md
+++ b/mini-lsm-book/src/week3-overview.md
@@ -2,6 +2,8 @@
In this part, you will implement MVCC over the LSM engine that you have built in the previous two weeks. We will add timestamp encoding in the keys to maintain multiple versions of a key, and change some part of the engine to ensure old data are either retained or garbage-collected based on whether there are users reading an old version.
+The general approach of the MVCC part in this tutorial is inspired and partially based on [BadgerDB](https://github.com/dgraph-io/badger).
+
The key of MVCC is to store and access multiple versions of a key in the storage engine. Therefore, we will need to change the key format to `user_key + timestamp (u64)`. And on the user interface side, we will need to have new APIs to help users to gain access to a history version. In summary, we will add a monotonically-increasing timestamp to the key.
In previous parts, we assumed that newer keys are in the upper level of the LSM tree, and older keys are in the lower level of the LSM tree. During compaction, we only keep the latest version of a key if multiple versions are found in multiple levels, and the compaction process will ensure that newer keys will be kept on the upper level by only merging adjacent levels/tiers. In the MVCC implementation, the key with a larger timestamp is the newest key. During compaction, we can only remove the key if no user is accessing an older version of the database. Though not keeping the latest version of key in the upper level may still yield a correct result for the MVCC LSM implementation, in our tutorial, we choose to keep the invariant, and if there are multiple versions of a key, a later version will always appear in a upper level.
@@ -16,7 +18,7 @@ put/delete/write_batch(key, timestamp)
set_watermark(timestamp) # we will talk about watermarks soon!
```
-**Un-managed Mode APIs**
+**Un-managed/Normal Mode APIs**
```
get(key) -> value
scan(key_range) -> iterator
diff --git a/mini-lsm-mvcc/Cargo.toml b/mini-lsm-mvcc/Cargo.toml
index 15213c311..00b2e5bfd 100644
--- a/mini-lsm-mvcc/Cargo.toml
+++ b/mini-lsm-mvcc/Cargo.toml
@@ -25,6 +25,8 @@ serde_json = { version = "1.0" }
serde = { version = "1.0", features = ["derive"] }
farmhash = "1"
crc32fast = "1.3.2"
+nom = "7.1.3"
+rustyline = "13.0.0"
[dev-dependencies]
tempfile = "3"
diff --git a/mini-lsm-mvcc/src/compact.rs b/mini-lsm-mvcc/src/compact.rs
index 06df389e9..cc52c63fe 100644
--- a/mini-lsm-mvcc/src/compact.rs
+++ b/mini-lsm-mvcc/src/compact.rs
@@ -19,7 +19,7 @@ use crate::iterators::merge_iterator::MergeIterator;
use crate::iterators::two_merge_iterator::TwoMergeIterator;
use crate::iterators::StorageIterator;
use crate::key::KeySlice;
-use crate::lsm_storage::{LsmStorageInner, LsmStorageState};
+use crate::lsm_storage::{CompactionFilter, LsmStorageInner, LsmStorageState};
use crate::manifest::ManifestRecord;
use crate::table::{SsTable, SsTableBuilder, SsTableIterator};
@@ -122,7 +122,8 @@ impl LsmStorageInner {
let watermark = self.mvcc().watermark();
let mut last_key = Vec::::new();
let mut first_key_below_watermark = false;
- while iter.is_valid() {
+ let compaction_filters = self.compaction_filters.lock().clone();
+ 'outer: while iter.is_valid() {
if builder.is_none() {
builder = Some(SsTableBuilder::new(self.options.block_size));
}
@@ -144,12 +145,26 @@ impl LsmStorageInner {
continue;
}
- if same_as_last_key && iter.key().ts() <= watermark {
- if !first_key_below_watermark {
+ if iter.key().ts() <= watermark {
+ if same_as_last_key && !first_key_below_watermark {
iter.next()?;
continue;
}
+
first_key_below_watermark = false;
+
+ if !compaction_filters.is_empty() {
+ for filter in &compaction_filters {
+ match filter {
+ CompactionFilter::Prefix(x) => {
+ if iter.key().key_ref().starts_with(x) {
+ iter.next()?;
+ continue 'outer;
+ }
+ }
+ }
+ }
+ }
}
let builder_inner = builder.as_mut().unwrap();
diff --git a/mini-lsm-mvcc/src/compact/leveled.rs b/mini-lsm-mvcc/src/compact/leveled.rs
index 213661888..043bc9273 100644
--- a/mini-lsm-mvcc/src/compact/leveled.rs
+++ b/mini-lsm-mvcc/src/compact/leveled.rs
@@ -118,6 +118,7 @@ impl LeveledCompactionController {
}
}
priorities.sort_by(|a, b| a.partial_cmp(b).unwrap().reverse());
+
let priority = priorities.first();
if let Some((_, level)) = priority {
println!(
diff --git a/mini-lsm-mvcc/src/iterators/merge_iterator.rs b/mini-lsm-mvcc/src/iterators/merge_iterator.rs
index c4abc8d3e..b1f5bdf77 100644
--- a/mini-lsm-mvcc/src/iterators/merge_iterator.rs
+++ b/mini-lsm-mvcc/src/iterators/merge_iterator.rs
@@ -37,7 +37,7 @@ impl Ord for HeapWrapper {
}
/// Merge multiple iterators of the same type. If the same key occurs multiple times in some
-/// iterators, perfer the one with smaller index.
+/// iterators, prefer the one with smaller index.
pub struct MergeIterator {
iters: BinaryHeap>,
current: Option>,
diff --git a/mini-lsm-mvcc/src/iterators/two_merge_iterator.rs b/mini-lsm-mvcc/src/iterators/two_merge_iterator.rs
index 8488cd282..2ff2ce3fa 100644
--- a/mini-lsm-mvcc/src/iterators/two_merge_iterator.rs
+++ b/mini-lsm-mvcc/src/iterators/two_merge_iterator.rs
@@ -53,8 +53,10 @@ impl<
fn key(&self) -> A::KeyType<'_> {
if self.choose_a {
+ debug_assert!(self.a.is_valid());
self.a.key()
} else {
+ debug_assert!(self.b.is_valid());
self.b.key()
}
}
diff --git a/mini-lsm-mvcc/src/lsm_iterator.rs b/mini-lsm-mvcc/src/lsm_iterator.rs
index ad3242447..36a87bfb9 100644
--- a/mini-lsm-mvcc/src/lsm_iterator.rs
+++ b/mini-lsm-mvcc/src/lsm_iterator.rs
@@ -136,14 +136,14 @@ impl StorageIterator for FusedIterator {
}
fn key(&self) -> Self::KeyType<'_> {
- if self.has_errored || !self.iter.is_valid() {
+ if !self.is_valid() {
panic!("invalid access to the underlying iterator");
}
self.iter.key()
}
fn value(&self) -> &[u8] {
- if self.has_errored || !self.iter.is_valid() {
+ if !self.is_valid() {
panic!("invalid access to the underlying iterator");
}
self.iter.value()
diff --git a/mini-lsm-mvcc/src/lsm_storage.rs b/mini-lsm-mvcc/src/lsm_storage.rs
index c284b4a75..25786dfd1 100644
--- a/mini-lsm-mvcc/src/lsm_storage.rs
+++ b/mini-lsm-mvcc/src/lsm_storage.rs
@@ -149,6 +149,11 @@ fn key_within(user_key: &[u8], table_begin: KeySlice, table_end: KeySlice) -> bo
table_begin.key_ref() <= user_key && user_key <= table_end.key_ref()
}
+#[derive(Clone, Debug)]
+pub enum CompactionFilter {
+ Prefix(Bytes),
+}
+
/// The storage interface of the LSM tree.
pub(crate) struct LsmStorageInner {
pub(crate) state: Arc>>,
@@ -160,6 +165,7 @@ pub(crate) struct LsmStorageInner {
pub(crate) compaction_controller: CompactionController,
pub(crate) manifest: Option,
pub(crate) mvcc: Option,
+ pub(crate) compaction_filters: Arc>>,
}
/// A thin wrapper for `LsmStorageInner` and the user interface for MiniLSM.
@@ -243,6 +249,10 @@ impl MiniLsm {
}))
}
+ pub fn add_compaction_filter(&self, compaction_filter: CompactionFilter) {
+ self.inner.add_compaction_filter(compaction_filter)
+ }
+
pub fn get(&self, key: &[u8]) -> Result