diff --git a/Cargo.lock b/Cargo.lock index 66dd13e06..3619b2711 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -80,6 +80,12 @@ version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" +[[package]] +name = "bitflags" +version = "2.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed570934406eb16438a4e976b1b4500774099c13b8cb96eec99f620f05090ddf" + [[package]] name = "bumpalo" version = "3.11.1" @@ -175,6 +181,15 @@ version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "702fc72eb24e5a1e48ce58027a675bc24edd52096d5397d4aea7c6dd9eca0bd1" +[[package]] +name = "clipboard-win" +version = "5.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3ec832972fefb8cf9313b45a0d1945e29c9c251f1d4c6eafc5fe2124c02d2e81" +dependencies = [ + "error-code", +] + [[package]] name = "colorchoice" version = "1.0.0" @@ -268,6 +283,22 @@ version = "0.3.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a357d28ed41a50f9c765dbfe56cbc04a64e53e5fc58ba79fbc34c10ef3df831f" +[[package]] +name = "endian-type" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c34f04666d835ff5d62e058c3995147c06f42fe86ff053337632bca83e42702d" + +[[package]] +name = "errno" +version = "0.3.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a258e46cdc063eb8519c00b9fc845fc47bcfca4130e2f08e88665ceda8474245" +dependencies = [ + "libc", + "windows-sys 0.52.0", +] + [[package]] name = "error-chain" version = "0.12.4" @@ -277,6 +308,12 @@ dependencies = [ "version_check", ] +[[package]] +name = "error-code" +version = "3.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "281e452d3bad4005426416cdba5ccfd4f5c1280e10099e21db27f7c1c28347fc" + [[package]] name = "farmhash" version = "1.1.5" @@ 
-292,6 +329,17 @@ dependencies = [ "instant", ] +[[package]] +name = "fd-lock" +version = "4.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e5768da2206272c81ef0b5e951a41862938a6070da63bcea197899942d3b947" +dependencies = [ + "cfg-if", + "rustix", + "windows-sys 0.52.0", +] + [[package]] name = "getrandom" version = "0.2.8" @@ -324,6 +372,15 @@ dependencies = [ "libc", ] +[[package]] +name = "home" +version = "0.5.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3d1354bf6b7235cb4a0576c2619fd4ed18183f689b12b006a0ee7329eeff9a5" +dependencies = [ + "windows-sys 0.52.0", +] + [[package]] name = "instant" version = "0.1.12" @@ -365,9 +422,15 @@ checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" [[package]] name = "libc" -version = "0.2.139" +version = "0.2.153" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd" + +[[package]] +name = "linux-raw-sys" +version = "0.4.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "201de327520df007757c1f0adce6e827fe8562fbc28bfd9c15571c66ca1f5f79" +checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c" [[package]] name = "lock_api" @@ -426,9 +489,11 @@ dependencies = [ "crossbeam-skiplist", "farmhash", "moka", + "nom", "ouroboros", "parking_lot", "rand", + "rustyline", "serde", "serde_json", "tempfile", @@ -448,9 +513,11 @@ dependencies = [ "crossbeam-skiplist", "farmhash", "moka", + "nom", "ouroboros", "parking_lot", "rand", + "rustyline", "serde", "serde_json", "tempfile", @@ -464,14 +531,17 @@ dependencies = [ "arc-swap", "bytes", "clap", + "crc32fast", "crossbeam-channel", "crossbeam-epoch", "crossbeam-skiplist", "farmhash", "moka", + "nom", "ouroboros", "parking_lot", "rand", + "rustyline", "serde", "serde_json", "tempfile", @@ -487,6 +557,12 @@ dependencies = [ "duct", ] +[[package]] +name 
= "minimal-lexical" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" + [[package]] name = "moka" version = "0.9.6" @@ -510,6 +586,36 @@ dependencies = [ "uuid", ] +[[package]] +name = "nibble_vec" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77a5d83df9f36fe23f0c3648c6bbb8b0298bb5f1939c8f2704431371f4b84d43" +dependencies = [ + "smallvec", +] + +[[package]] +name = "nix" +version = "0.27.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2eb04e9c688eff1c89d72b407f168cf79bb9e867a9d3323ed6c01519eb9cc053" +dependencies = [ + "bitflags 2.4.2", + "cfg-if", + "libc", +] + +[[package]] +name = "nom" +version = "7.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a" +dependencies = [ + "memchr", + "minimal-lexical", +] + [[package]] name = "num_cpus" version = "1.15.0" @@ -618,7 +724,7 @@ version = "0.9.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2d9cc634bc78768157b5cbfe988ffcd1dcba95cd2b2f03a88316c08c6d00ed63" dependencies = [ - "bitflags", + "bitflags 1.3.2", "memchr", "unicase", ] @@ -648,6 +754,16 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "radix_trie" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c069c179fcdc6a2fe24d8d18305cf085fdbd4f922c041943e203685d6a1c58fd" +dependencies = [ + "endian-type", + "nibble_vec", +] + [[package]] name = "rand" version = "0.8.5" @@ -684,7 +800,7 @@ version = "10.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a6823ea29436221176fe662da99998ad3b4db2c7f31e7b6f5fe43adccd6320bb" dependencies = [ - "bitflags", + "bitflags 1.3.2", ] [[package]] @@ -693,7 +809,7 @@ version = "0.2.16" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "fb5a58c1855b4b6819d59012155603f0b22ad30cad752600aadfcb695265519a" dependencies = [ - "bitflags", + "bitflags 1.3.2", ] [[package]] @@ -714,6 +830,41 @@ dependencies = [ "semver", ] +[[package]] +name = "rustix" +version = "0.38.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ea3e1a662af26cd7a3ba09c0297a31af215563ecf42817c98df621387f4e949" +dependencies = [ + "bitflags 2.4.2", + "errno", + "libc", + "linux-raw-sys", + "windows-sys 0.52.0", +] + +[[package]] +name = "rustyline" +version = "13.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "02a2d683a4ac90aeef5b1013933f6d977bd37d51ff3f4dad829d4931a7e6be86" +dependencies = [ + "bitflags 2.4.2", + "cfg-if", + "clipboard-win", + "fd-lock", + "home", + "libc", + "log", + "memchr", + "nix", + "radix_trie", + "unicode-segmentation", + "unicode-width", + "utf8parse", + "winapi", +] + [[package]] name = "ryu" version = "1.0.12" @@ -920,6 +1071,12 @@ version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "84a22b9f218b40614adcb3f4ff08b703773ad44fa9423e4e0d346d5db86e4ebc" +[[package]] +name = "unicode-segmentation" +version = "1.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1dd624098567895118886609431a7c3b8f516e41d30e0643f03d94592a147e36" + [[package]] name = "unicode-width" version = "0.1.10" diff --git a/README.md b/README.md index 754ee0fb6..d59582ca3 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,5 @@ +![banner](./mini-lsm-book/src/mini-lsm-logo.png) + # LSM in a Week [![CI (main)](https://github.com/skyzh/mini-lsm/actions/workflows/main.yml/badge.svg)](https://github.com/skyzh/mini-lsm/actions/workflows/main.yml) @@ -26,7 +28,7 @@ You should modify code in `mini-lsm-starter` directory. 
``` cargo x install-tools -cargo copy-test --week 1 --day 1 +cargo x copy-test --week 1 --day 1 cargo x scheck cargo run --bin mini-lsm-cli cargo run --bin compaction-simulator @@ -45,7 +47,7 @@ cargo x book If you changed public API in the reference solution, you might also need to synchronize it to the starter crate. To do this, use `cargo x sync`. -## Structure +## Code Structure * mini-lsm: the final solution code for <= week 2 * mini-lsm-mvcc: the final solution code for week 3 MVCC @@ -70,28 +72,41 @@ cargo run --bin compaction-simulator-ref cargo run --bin compaction-simulator-mvcc-ref ``` -## Progress +## Tutorial Structure -We are working on chapter 3 and more test cases for all existing contents. +We have 3 weeks + 1 extra week (in progress) for this tutorial. * Week 1: Storage Format + Engine Skeleton * Week 2: Compaction and Persistence * Week 3: Multi-Version Concurrency Control -* The Extra Week / Rest of Your Life: Optimizations (unlikely to be available in 2024...) - -✅: Finished \ -🚧: WIP and will likely be available soon - -| Week + Chapter | Topic | Solution | Starter Code | Writeup | -| -------------- | ----------------------------------------------- | -------- | ------------ | ------- | -| 3.1 | Timestamp Key Encoding | ✅ | ✅ | ✅ | -| 3.2 | Snapshot Read - Blocks, Memtables, and SSTs | ✅ | ✅ | ✅ | -| 3.3 | Snapshot Read - Engine Read Path | ✅ | ✅ | ✅ | -| 3.4 | Watermark and Garbage Collection | ✅ | ✅ | ✅ | -| 3.5 | Transactions and Optimistic Concurrency Control | ✅ | ✅ | ✅ | -| 3.6 | Serializable Snapshot Isolation | ✅ | ✅ | ✅ | -| 3.7 | Compaction Filter | 🚧 | | | +* The Extra Week / Rest of Your Life: Optimizations (unlikely to be available in 2024...) 
+ +![Tutorial Roadmap](./mini-lsm-book/src/lsm-tutorial/00-full-overview.svg) + +| Week + Chapter | Topic | +| -------------- | ----------------------------------------------------------- | +| 1.1 | Memtable | +| 1.2 | Merge Iterator | +| 1.3 | Block | +| 1.4 | Sorted String Table (SST) | +| 1.5 | Read Path | +| 1.6 | Write Path | +| 1.7 | SST Optimizations: Prefix Key Encoding + Bloom Filters | +| 2.1 | Compaction Implementation | +| 2.2 | Simple Compaction Strategy (Traditional Leveled Compaction) | +| 2.3 | Tiered Compaction Strategy (RocksDB Universal Compaction) | +| 2.4 | Leveled Compaction Strategy (RocksDB Leveled Compaction) | +| 2.5 | Manifest | +| 2.6 | Write-Ahead Log (WAL) | +| 2.7 | Batch Write and Checksums | +| 3.1 | Timestamp Key Encoding | +| 3.2 | Snapshot Read - Memtables and Timestamps | +| 3.3 | Snapshot Read - Transaction API | +| 3.4 | Watermark and Garbage Collection | +| 3.5 | Transactions and Optimistic Concurrency Control | +| 3.6 | Serializable Snapshot Isolation | +| 3.7 | Compaction Filters | ## License -The Mini-LSM starter code and solution are under Apache 2.0 license. The author reserves the full copyright of the tutorial materials (markdown files and figures). +The Mini-LSM starter code and solution are under [Apache 2.0 license](LICENSE). The author reserves the full copyright of the tutorial materials (markdown files and figures). diff --git a/mini-lsm-book/src/00-overview.md b/mini-lsm-book/src/00-overview.md index d80ed7af8..8314938aa 100644 --- a/mini-lsm-book/src/00-overview.md +++ b/mini-lsm-book/src/00-overview.md @@ -1,11 +1,23 @@ # Mini-LSM Course Overview +## Tutorial Structure + +![Tutorial Overview](lsm-tutorial/00-full-overview.svg) + +We have three parts (weeks) for this tutorial. In the first week, we will focus on the storage structure and the storage format of an LSM storage engine. In the second week, we will deeply dive into compactions and implement persistence support for the storage engine. 
In the third week, we will implement multi-version concurrency control. + +* [The First Week: Mini-LSM](./week1-overview.md) +* [The Second Week: Compaction and Persistence](./week2-overview.md) +* [The Third Week: Multi-Version Concurrency Control](./week3-overview.md) + +Please look at [Environment Setup](./00-get-started.md) to set up the environment. + ## Overview of LSM -An LSM storage engine generally contains 3 parts: +An LSM storage engine generally contains three parts: 1. Write-ahead log to persist temporary data for recovery. -2. SSTs on the disk for maintaining a tree structure. +2. SSTs on the disk to maintain an LSM-tree structure. 3. Mem-tables in memory for batching small writes. The storage engine generally provides the following interfaces: @@ -19,24 +31,20 @@ To ensure persistence, * `Sync()`: ensure all the operations before `sync` are persisted to the disk. -Some engines choose to combine `Put` and `Delete` into a single operation called `WriteBatch`, which accepts a batch -of key value pairs. +Some engines choose to combine `Put` and `Delete` into a single operation called `WriteBatch`, which accepts a batch of key-value pairs. -In this tutorial, we assume the LSM tree is using leveled compaction algorithm, which is commonly used in real-world -systems. +In this tutorial, we assume the LSM tree is using a leveled compaction algorithm, which is commonly used in real-world systems. ### Write Path ![Write Path](lsm-tutorial/00-lsm-write-flow.svg) -The write path of LSM contains 4 steps: +The write path of LSM contains four steps: -1. Write the key-value pair to write-ahead log, so that it can be recovered after the storage engine crashes. -2. Write the key-value pair to memtable. After (1) and (2) completes, we can notify the user that the write operation - is completed. -3. When a memtable is full, we will freeze them into immutable memtables, and will flush them to the disk as SST files in the background. -4. 
We will compact some files in some level into lower levels to maintain a good shape for the LSM tree, so that read - amplification is low. +1. Write the key-value pair to the write-ahead log so that it can be recovered after the storage engine crashes. +2. Write the key-value pair to the mem-table. After (1) and (2) are completed, we can notify the user that the write operation is completed. +3. (In the background) When a mem-table is full, we will freeze it into an immutable mem-table and flush it to the disk as an SST file. +4. (In the background) The engine will compact some files in some levels into lower levels to maintain a good shape for the LSM tree so that the read amplification is low. ### Read Path @@ -44,21 +52,9 @@ The write path of LSM contains 4 steps: When we want to read a key, -1. We will first probe all the memtables from latest to oldest. +1. We will first probe all the mem-tables from the latest to the oldest. 2. If the key is not found, we will then search the entire LSM tree containing SSTs to find the data. There are two types of read: lookup and scan. Lookup finds one key in the LSM tree, while scan iterates all keys within a range in the storage engine. We will cover both of them throughout the tutorial. -## Tutorial Structure - -![Tutorial Overview](lsm-tutorial/00-full-overview.svg) - -We have 3 parts (weeks) for this tutorial. In the first week, we will focus on the storage structure and the storage format of an LSM storage engine. In the second week, we will dive into compactions in depth and implement persistence support for the storage engine. In the third week, we will implement multi-version concurrency control. - -* [The First Week: Mini-LSM](./week1-overview.md) -* [The Second Week: Compaction and Persistence](./week2-overview.md) -* [The Third Week: Multi-Version Concurrency Control](./week3-overview.md) - -To set up the environment, please take a look at [Environment Setup](./00-get-started.md).
- {{#include copyright.md}} diff --git a/mini-lsm-book/src/00-preface.md b/mini-lsm-book/src/00-preface.md index 041caee41..6c8fb1c1d 100644 --- a/mini-lsm-book/src/00-preface.md +++ b/mini-lsm-book/src/00-preface.md @@ -1,82 +1,74 @@ # Preface -![Tutorial Overview](lsm-tutorial/00-full-overview.svg) +![Banner](./mini-lsm-logo.png) -In this tutorial, you will learn how to build a simple LSM-Tree storage engine in the Rust programming language. +This course teaches you how to build a simple LSM-Tree storage engine in Rust. ## What is LSM, and Why LSM? -Log-structured merge tree is a data structure to maintain key-value pairs. This data structure is widely used in +Log-structured merge trees are data structures that maintain key-value pairs. This data structure is widely used in distributed database systems like [TiDB](https://www.pingcap.com) and [CockroachDB](https://www.cockroachlabs.com) as their underlying storage engine. [RocksDB](http://rocksdb.org), based on [LevelDB](https://github.com/google/leveldb), -is an implementation of LSM-Tree storage engine. It provides a wide range of key-value access functionalities and is -used in a lot of production systems. +is an implementation of LSM-Tree storage engines. It provides many key-value access functionalities and is +used in many production systems. Generally speaking, LSM Tree is an append-friendly data structure. It is more intuitive to compare LSM to other -key-value data structure like RB-Tree and B-Tree. For RB-Tree and B-Tree, all data operations are in-place. That is to -say, when you update the value corresponding to the key, the value will be overwritten at its original memory or disk -space. But in an LSM Tree, all write operations, i.e., insertions, updates, deletions, are performed in somewhere else. -These operations will be batched into SST (sorted string table) files and be written to the disk. Once written to the -disk, the file will not be changed. 
These operations are applied lazily on disk with a special task called compaction. -The compaction job will merge multiple SST files and remove unused data. +key-value data structures like RB-Tree and B-Tree. For RB-Tree and B-Tree, all data operations are in place. That is to +say, when you want to update the value corresponding to the key, the engine will overwrite its original memory or disk +space with the new value. But in an LSM Tree, all write operations, i.e., insertions, updates, deletions, are lazily applied to the storage. +The engine batches these operations into SST (sorted string table) files and writes them to the disk. Once written to the +disk, the engine will not directly modify them. In a particular background task called compaction, the engine will merge these files to apply the updates and deletions. -This architectural design makes LSM tree easy to work with. +This architectural design makes LSM trees easy to work with. -1. Data are immutable on persistent storage, which means that it is easier to offload the background tasks (compaction) - to remote servers. It is also feasible to directly store and serve data from cloud-native storage systems like S3. -2. An LSM tree can balance between read, write and space amplification by changing the compaction algorithm. The data - structure itself is super versatile and can be optimized for different workloads. +1. Data are immutable on persistent storage. Concurrency control is more straightforward. Offloading the background tasks (compaction) to remote servers is possible. Storing and serving data directly from cloud-native storage systems like S3 is also feasible. +2. Changing the compaction algorithm allows the storage engine to balance between read, write, and space amplification. The data structure is versatile, and by adjusting the compaction parameters, we can optimize the LSM structure for different workloads. 
-In this tutorial, we will learn how to build an LSM-Tree-based storage engine in the Rust programming language. +This course will teach you how to build an LSM-tree-based storage engine in the Rust programming language. ## Prerequisites -* You should know the basics of the Rust programming language. Reading [the Rust book](https://doc.rust-lang.org/book/) - is enough. -* You should know the basic concepts of key-value storage engines, i.e., why we need somehow complex design to achieve - persistence. If you have no experience with database systems and storage systems before, you can implement Bitcask - in [PingCAP Talent Plan](https://github.com/pingcap/talent-plan/tree/master/courses/rust/projects/project-2). -* Knowing the basics of an LSM tree is not a requirement but we recommend you to read something about it, e.g., the - overall idea of LevelDB. This would familiarize you with concepts like mutable and immutable mem-tables, SST, - compaction, WAL, etc. +* You should know the basics of the Rust programming language. Reading [the Rust book](https://doc.rust-lang.org/book/) is enough. +* You should know the basic concepts of key-value storage engines, i.e., why we need a complex design to achieve persistence. If you have no experience with database systems and storage systems before, you can implement Bitcask in [PingCAP Talent Plan](https://github.com/pingcap/talent-plan/tree/master/courses/rust/projects/project-2). +* Knowing the basics of an LSM tree is not a requirement, but we recommend you read something about it, e.g., the overall idea of LevelDB. Knowing them beforehand would familiarize you with concepts like mutable and immutable mem-tables, SST, compaction, WAL, etc. -## What should you expect from this tutorial... 
+## What should you expect from this tutorial -After learning this course, you should have a deep understanding of how a LSM-based storage system works, gain hands-on experience of designing such systems, and apply what you have learned in your study and career. You will understand the design tradeoffs in such storage systems and find optimal ways to design a LSM-based storage system to meet your workload requirements/goals. This is a very in-depth tutorial that covers all the important implementation details and design choices of modern storage systems (i.e., RocksDB) based on the author's experience in several LSM-like storage systems, and you will be able to directly apply what you have learned in both industry and academia. +After taking this course, you should deeply understand how an LSM-based storage system works, gain hands-on experience in designing such systems, and apply what you have learned in your study and career. You will understand the design tradeoffs in such storage systems and find optimal ways to design an LSM-based storage system to meet your workload requirements/goals. This very in-depth tutorial covers all the essential implementation details and design choices of modern storage systems (i.e., RocksDB) based on the author's experience in several LSM-like storage systems, and you will be able to directly apply what you have learned in both industry and academia. ### Structure -The tutorial is a large course that is split into several parts (weeks). Each week usually has seven chapters, and each of the chapter can be finished within 2-3 hours. The first six chapters of each part will instruct you to build a working system, and the last chapter of each week will be a *snack time* chapter that implements some easy things over what you have built in the previous six days. In each chapter, there will be required tasks, *check you understanding* questions, and bonus tasks. +The tutorial is an extensive course with several parts (weeks). 
Each week has seven chapters; you can finish each within 2 to 3 hours. The first six chapters of each part will instruct you to build a working system, and the last chapter of each week will be a *snack time* chapter that implements some easy things over what you have built in the previous six days. Each chapter will have required tasks, *check your understanding* questions, and bonus tasks. ### Testing -We provide full test suite and some cli tools for you to validate if your solution is correct. Note that the test suite is not exhaustive, and your solution might not be 100% correct after passing all test cases. You might need to fix earlier bugs when implementing later parts of the system. We recommend you to think thoroughly about your implementation, especially when there are multi-thread operations and race conditions. +We provide a full test suite and some CLI tools for you to validate if your solution is correct. Note that the test suite is not exhaustive, and your solution might not be 100% correct after passing all test cases. You might need to fix earlier bugs when implementing later parts of the system. We recommend you think thoroughly about your implementation, especially when there are multi-thread operations and race conditions. ### Solution We have a solution that implements all the functionalities as required in the tutorial in the mini-lsm main repo. At the same time, we also have a mini-lsm solution checkpoint repo where each commit corresponds to a chapter in the tutorial. -Keeping such checkpoint repo up-to-date to the mini-lsm tutorial is hard because each bug fix or new feature will need to go through all commits (or checkpoints). Therefore, this repo might not be using the latest starter code or incorporating the latest features from the mini-lsm tutorial. +Keeping such a checkpoint repo up-to-date with the mini-lsm tutorial is challenging because each bug fix or new feature must go through all commits (or checkpoints). 
Therefore, this repo might not use the latest starter code or incorporate the latest features from the mini-lsm tutorial. -**TL;DR: We do not guarantee the solution checkpoint repo contains a correct solution, passes all tests, or has the correct doc comments.** For a correct implementation and the solution after implementing all things, please take a look at the solution in the main repo instead. [https://github.com/skyzh/mini-lsm/tree/main/mini-lsm](https://github.com/skyzh/mini-lsm/tree/main/mini-lsm). +**TL;DR: We do not guarantee the solution checkpoint repo contains a correct solution, passes all tests, or has the correct doc comments.** For a correct implementation and the solution after implementing everything, please look at the solution in the main repo instead. [https://github.com/skyzh/mini-lsm/tree/main/mini-lsm](https://github.com/skyzh/mini-lsm/tree/main/mini-lsm). -If you are stuck at some part of the tutorial or do not know where to implement a functionality, you can refer to this repo for help. You may compare the diff between commits to know what has been changed. Some functions in the mini-lsm tutorial might be changed multiple times throughout the chapters, and you can know what exactly are expected to be implemented for each chapter in this repo. +If you are stuck at some part of the tutorial or need help determining where to implement functionality, you can refer to this repo for help. You may compare the diff between commits to know what has been changed. You might need to modify some functions in the mini-lsm tutorial multiple times throughout the chapters, and you can understand what exactly is expected to be implemented for each chapter in this repo. You may access the solution checkpoint repo at [https://github.com/skyzh/mini-lsm-solution-checkpoint](https://github.com/skyzh/mini-lsm-solution-checkpoint). ### Feedbacks -Your feedback is greatly appreciated. 
We have rewritten the whole course from scratch in 2024 based on the feedbacks from the students. We hope you can share your learning experience and help us continuously improve the tutorial. Welcome to the [Discord community](https://skyzh.dev/join/discord) and share your experience. +Your feedback is greatly appreciated. We have rewritten the whole course from scratch in 2024 based on the feedback from the students. Please share your learning experience and help us continuously improve the tutorial. Welcome to the [Discord community](https://skyzh.dev/join/discord) and share your experience. -The long story of why we rewrote it: The tutorial was originally planned as a general guidance that students start from an empty directory and implement whatever they want based on the specification we had. We had minimal tests that checks if the behavior is correct. However, the original tutorial is too open-ended that caused huge obstacles with the learning experience. As students do not have an overview of the whole system beforehand and the instructions are kind of vague, sometimes it is hard for the students to know why a design decision is made and what they need to achieve a goal. And some part of the course is too compact that it is impossible to deliver expected contents within just one chapter. Therefore, we completely redesigned the course to have a easier learning curve and clearer learning goals. The original one-week tutorial is now split into two weeks (first week on storage format, and second week on deep-dive compaction), with an extra part on MVCC. We hope you find this course interesting and helpful in your study and career. We would like to thank everyone who commented in [Feedback after coding day 1](https://github.com/skyzh/mini-lsm/issues/11) and [Hello, when is the next update plan for the tutorial?](https://github.com/skyzh/mini-lsm/issues/7) -- your feedback greatly helped us improve the course. 
+The long story of why we rewrote it: The tutorial was originally planned as general guidance in which students start from an empty directory and implement whatever they want based on the specifications we had. We had minimal tests that checked if the behavior was correct. However, the original tutorial was too open-ended, which caused huge obstacles to the learning experience. As students did not have an overview of the whole system beforehand and the instructions were vague, it was sometimes hard for them to know why a design decision was made and what they needed to do to achieve a goal. Some parts of the course were so compact that delivering the expected contents within just one chapter was impossible. Therefore, we completely redesigned the course for an easier learning curve and clearer learning goals. The original one-week tutorial is now split into two weeks (the first week on storage format and the second week on deep-dive compaction), with an extra part on MVCC. We hope you find this course interesting and helpful in your study and career. We want to thank everyone who commented in [Feedback after coding day 1](https://github.com/skyzh/mini-lsm/issues/11) and [Hello, when is the next update plan for the tutorial?](https://github.com/skyzh/mini-lsm/issues/7) -- your feedback greatly helped us improve the course. ### License -The source code of this course is licensed under Apache 2.0, while the author owns the full copyright of the tutorial itself (markdown files + figures). +The source code of this course is licensed under Apache 2.0, while the author owns the complete copyright of the tutorial itself (markdown files + figures). ### Will this tutorial be free forever? -Yes! Everything publicly available now will be free forever and will receive lifetime updates and bug fixes. Meanwhile, we might provide paid code review and office hour services in the future.
For the DLC part (*rest of your life* chapters), we do not have plans to finish them as of 2024, and have not decided whether they will be public available or not. +Yes! Everything publicly available now will be free forever and receive lifetime updates and bug fixes. Meanwhile, we might provide paid code review and office hour services. For the DLC part (*rest of your life* chapters), we do not have plans to finish them as of 2024 and have yet to decide whether they will be publicly available. ## Community @@ -86,11 +78,11 @@ You may join skyzh's Discord server and study with the mini-lsm community. ## Get Started -Now, you may go ahead and get an overview of the LSM structure in [Mini-LSM Course Overview](./00-overview.md). +Now, you can get an overview of the LSM structure in [Mini-LSM Course Overview](./00-overview.md). ## About the Author -As of writing (at the beginning of 2024), Chi obtained his master's degree in Computer Science from Carnegie Mellon University and his bachelor's degree from Shanghai Jiao Tong University. He has been working on a variety of database systems including [TiKV][db1], [AgateDB][db2], [TerarkDB][db3], [RisingWave][db4], and [Neon][db5]. Since 2022, he worked as a teaching assistant for [CMU's Database Systems course](https://15445.courses.cs.cmu) for three semesters on the BusTub educational system, where he added a lot of new features and more challenges to the course (check out the re-designed [query execution](https://15445.courses.cs.cmu.edu/fall2022/project3/) project and the super challenging [multi-version concurrency control](https://15445.courses.cs.cmu.edu/fall2023/project4/) project). Besides working on the BusTub educational system, he is also a maintainer of the [RisingLight](https://github.com/risinglightdb/risinglight) educational database system. Chi is interested in exploring how the Rust programming language can fit in the database world. 
Check out his previous tutorial on building a vectorized expression framework [type-exercise-in-rust](https://github.com/skyzh/type-exercise-in-rust) and on building a vector database [write-you-a-vector-db](https://github.com/skyzh/write-you-a-vector-db) if you are also interested in that topic. +As of writing (at the beginning of 2024), Chi obtained his master's degree in Computer Science from Carnegie Mellon University and his bachelor's degree from Shanghai Jiao Tong University. He has been working on a variety of database systems, including [TiKV][db1], [AgateDB][db2], [TerarkDB][db3], [RisingWave][db4], and [Neon][db5]. Since 2022, he has worked as a teaching assistant for [CMU's Database Systems course](https://15445.courses.cs.cmu.edu) for three semesters on the BusTub educational system, where he added a lot of new features and more challenges to the course (check out the redesigned [query execution](https://15445.courses.cs.cmu.edu/fall2022/project3/) project and the super challenging [multi-version concurrency control](https://15445.courses.cs.cmu.edu/fall2023/project4/) project). Besides working on the BusTub educational system, he also maintains the [RisingLight](https://github.com/risinglightdb/risinglight) educational database system. Chi is interested in exploring how the Rust programming language can fit into the database world. Check out his previous tutorial on building a vectorized expression framework [type-exercise-in-rust](https://github.com/skyzh/type-exercise-in-rust) and on building a vector database [write-you-a-vector-db](https://github.com/skyzh/write-you-a-vector-db) if you are also interested in that topic. 
[db1]: https://github.com/tikv/tikv [db2]: https://github.com/tikv/agatedb diff --git a/mini-lsm-book/src/SUMMARY.md b/mini-lsm-book/src/SUMMARY.md index 38d8eeab5..ed66d7d8e 100644 --- a/mini-lsm-book/src/SUMMARY.md +++ b/mini-lsm-book/src/SUMMARY.md @@ -27,9 +27,9 @@ - [Snapshots - Memtables and Timestamps](./week3-02-snapshot-read-part-1.md) - [Snapshots - Transaction API](./week3-03-snapshot-read-part-2.md) - [Watermark and GC](./week3-04-watermark.md) - - [Transaction and OCC (WIP)](./week3-05-txn-occ.md) - - [Serializable Snapshot Isolation (WIP)](./week3-06-serializable.md) - - [Snack Time: Compaction Filter (WIP)](./week3-07-compaction-filter.md) + - [Transaction and OCC](./week3-05-txn-occ.md) + - [Serializable Snapshot Isolation](./week3-06-serializable.md) + - [Snack Time: Compaction Filters](./week3-07-compaction-filter.md) - [The Rest of Your Life (TBD)](./week4-overview.md) --- diff --git a/mini-lsm-book/src/mini-lsm-logo.png b/mini-lsm-book/src/mini-lsm-logo.png new file mode 100644 index 000000000..ab6459804 Binary files /dev/null and b/mini-lsm-book/src/mini-lsm-logo.png differ diff --git a/mini-lsm-book/src/sitemap.txt b/mini-lsm-book/src/sitemap.txt index 815caec44..a3ef27d52 100644 --- a/mini-lsm-book/src/sitemap.txt +++ b/mini-lsm-book/src/sitemap.txt @@ -28,5 +28,12 @@ https://skyzh.github.io/mini-lsm/week2-05-manifest https://skyzh.github.io/mini-lsm/week2-06-wal https://skyzh.github.io/mini-lsm/week2-07-snacks https://skyzh.github.io/mini-lsm/week2-overview +https://skyzh.github.io/mini-lsm/week3-01-ts-key-refactor +https://skyzh.github.io/mini-lsm/week3-02-snapshot-read-part-1 +https://skyzh.github.io/mini-lsm/week3-03-snapshot-read-part-2 +https://skyzh.github.io/mini-lsm/week3-04-watermark +https://skyzh.github.io/mini-lsm/week3-05-txn-occ +https://skyzh.github.io/mini-lsm/week3-06-serializable +https://skyzh.github.io/mini-lsm/week3-07-compaction-filter https://skyzh.github.io/mini-lsm/week3-overview 
https://skyzh.github.io/mini-lsm/week4-overview diff --git a/mini-lsm-book/src/sitemap.xml b/mini-lsm-book/src/sitemap.xml index 3200ba1cd..7d84a32b7 100644 --- a/mini-lsm-book/src/sitemap.xml +++ b/mini-lsm-book/src/sitemap.xml @@ -2,130 +2,158 @@ https://skyzh.github.io/mini-lsm - 2024-01-25T02:56:28.231Z + 2024-01-30T08:17:56.731Z https://skyzh.github.io/mini-lsm/00-get-started - 2024-01-25T02:56:28.234Z + 2024-01-30T08:17:56.734Z https://skyzh.github.io/mini-lsm/00-overview - 2024-01-25T02:56:28.232Z + 2024-01-30T08:17:56.732Z https://skyzh.github.io/mini-lsm/00-preface - 2024-01-25T02:56:28.230Z + 2024-01-30T08:17:56.730Z https://skyzh.github.io/mini-lsm/00-v1 - 2024-01-25T02:56:28.256Z + 2024-01-30T08:17:56.762Z https://skyzh.github.io/mini-lsm/01-block - 2024-01-25T02:56:28.257Z + 2024-01-30T08:17:56.763Z https://skyzh.github.io/mini-lsm/02-sst - 2024-01-25T02:56:28.258Z + 2024-01-30T08:17:56.764Z https://skyzh.github.io/mini-lsm/03-memtable - 2024-01-25T02:56:28.259Z + 2024-01-30T08:17:56.765Z https://skyzh.github.io/mini-lsm/04-engine - 2024-01-25T02:56:28.260Z + 2024-01-30T08:17:56.766Z https://skyzh.github.io/mini-lsm/05-compaction - 2024-01-25T02:56:28.261Z + 2024-01-30T08:17:56.767Z https://skyzh.github.io/mini-lsm/06-recovery - 2024-01-25T02:56:28.262Z + 2024-01-30T08:17:56.768Z https://skyzh.github.io/mini-lsm/07-bloom-filter - 2024-01-25T02:56:28.263Z + 2024-01-30T08:17:56.768Z https://skyzh.github.io/mini-lsm/08-key-compression - 2024-01-25T02:56:28.264Z + 2024-01-30T08:17:56.769Z https://skyzh.github.io/mini-lsm/09-whats-next - 2024-01-25T02:56:28.265Z + 2024-01-30T08:17:56.770Z https://skyzh.github.io/mini-lsm/week1-01-memtable - 2024-01-25T02:56:28.237Z + 2024-01-30T08:17:56.736Z https://skyzh.github.io/mini-lsm/week1-02-merge-iterator - 2024-01-25T02:56:28.238Z + 2024-01-30T08:17:56.738Z https://skyzh.github.io/mini-lsm/week1-03-block - 2024-01-25T02:56:28.239Z + 2024-01-30T08:17:56.739Z https://skyzh.github.io/mini-lsm/week1-04-sst - 
2024-01-25T02:56:28.240Z + 2024-01-30T08:17:56.740Z https://skyzh.github.io/mini-lsm/week1-05-read-path - 2024-01-25T02:56:28.242Z + 2024-01-30T08:17:56.741Z https://skyzh.github.io/mini-lsm/week1-06-write-path - 2024-01-25T02:56:28.243Z + 2024-01-30T08:17:56.743Z https://skyzh.github.io/mini-lsm/week1-07-sst-optimizations - 2024-01-25T02:56:28.244Z + 2024-01-30T08:17:56.744Z https://skyzh.github.io/mini-lsm/week1-overview - 2024-01-25T02:56:28.235Z + 2024-01-30T08:17:56.735Z https://skyzh.github.io/mini-lsm/week2-01-compaction - 2024-01-25T02:56:28.246Z + 2024-01-30T08:17:56.746Z https://skyzh.github.io/mini-lsm/week2-02-simple - 2024-01-25T02:56:28.248Z + 2024-01-30T08:17:56.747Z https://skyzh.github.io/mini-lsm/week2-03-tiered - 2024-01-25T02:56:28.249Z + 2024-01-30T08:17:56.748Z https://skyzh.github.io/mini-lsm/week2-04-leveled - 2024-01-25T02:56:28.250Z + 2024-01-30T08:17:56.749Z https://skyzh.github.io/mini-lsm/week2-05-manifest - 2024-01-25T02:56:28.251Z + 2024-01-30T08:17:56.750Z https://skyzh.github.io/mini-lsm/week2-06-wal - 2024-01-25T02:56:28.252Z + 2024-01-30T08:17:56.751Z https://skyzh.github.io/mini-lsm/week2-07-snacks - 2024-01-25T02:56:28.253Z + 2024-01-30T08:17:56.752Z https://skyzh.github.io/mini-lsm/week2-overview - 2024-01-25T02:56:28.245Z + 2024-01-30T08:17:56.745Z + + + https://skyzh.github.io/mini-lsm/week3-01-ts-key-refactor + 2024-01-30T08:17:56.754Z + + + https://skyzh.github.io/mini-lsm/week3-02-snapshot-read-part-1 + 2024-01-30T08:17:56.755Z + + + https://skyzh.github.io/mini-lsm/week3-03-snapshot-read-part-2 + 2024-01-30T08:17:56.756Z + + + https://skyzh.github.io/mini-lsm/week3-04-watermark + 2024-01-30T08:17:56.757Z + + + https://skyzh.github.io/mini-lsm/week3-05-txn-occ + 2024-01-30T08:17:56.758Z + + + https://skyzh.github.io/mini-lsm/week3-06-serializable + 2024-01-30T08:17:56.759Z + + + https://skyzh.github.io/mini-lsm/week3-07-compaction-filter + 2024-01-30T08:17:56.760Z https://skyzh.github.io/mini-lsm/week3-overview - 
2024-01-25T02:56:28.254Z + 2024-01-30T08:17:56.753Z https://skyzh.github.io/mini-lsm/week4-overview - 2024-01-25T02:56:28.255Z + 2024-01-30T08:17:56.761Z diff --git a/mini-lsm-book/src/week1-01-memtable.md b/mini-lsm-book/src/week1-01-memtable.md index 983d22499..39fcfdb45 100644 --- a/mini-lsm-book/src/week1-01-memtable.md +++ b/mini-lsm-book/src/week1-01-memtable.md @@ -23,7 +23,7 @@ In this task, you will need to modify: src/mem_table.rs ``` -Firstly, let us implement the in-memory structure of an LSM storage engine -- the memtable. We choose [crossbeam's skiplist implementation](link) as the data structure of the memtable as it supports lock-free concurrent read and write. We will not cover in-depth how a skiplist works, and in a nutshell, it is an ordered key-value map that easily allows concurrent read and write. +Firstly, let us implement the in-memory structure of an LSM storage engine -- the memtable. We choose [crossbeam's skiplist implementation](https://docs.rs/crossbeam-skiplist/latest/crossbeam_skiplist/) as the data structure of the memtable as it supports lock-free concurrent read and write. We will not cover in-depth how a skiplist works, and in a nutshell, it is an ordered key-value map that easily allows concurrent read and write. crossbeam-skiplist provides similar interfaces to the Rust std's `BTreeMap`: insert, get, and iter. The only difference is that the modification interfaces (i.e., `insert`) only require an immutable reference to the skiplist, instead of a mutable one. Therefore, in your implementation, you should not take any mutex when implementing the memtable structure. @@ -39,7 +39,6 @@ In this task, you will need to modify: ``` src/lsm_storage.rs -src/mem_table.rs ``` Now, we will add our first data structure, the memtable, to the LSM state. In `LsmStorageState::create`, you will find that when a LSM structure is created, we will initialize a memtable of id 0. This is the **mutable memtable** in the initial state. 
At any point of the time, the engine will have only one single mutable memtable. A memtable usually has a size limit (i.e., 256MB), and it will be frozen to an immutable memtable when it reaches the size limit. @@ -67,7 +66,7 @@ src/mem_table.rs A memtable cannot continuously grow in size, and we will need to freeze them (and later flush to the disk) when it reaches the size limit. You may find the memtable size limit, which is **equal to the SST size limit** (not `num_memtables_limit`), in the `LsmStorageOptions`. This is not a hard limit and you should freeze the memtable at best effort. -In this task, you will need to compute the approximate memtable size when put/delete a key in the memtable. This can be computed by simply adding the total number of bytes of keys and values when `put` is called. Is a key is put twice, though the skiplist only contains the latest value, you may count it twice in the approximate memtable size. Once a memtable reaches the limit, you should call `force_freeze_memtable` to freeze the memtable and create a new one. +In this task, you will need to compute the approximate memtable size when put/delete a key in the memtable. This can be computed by simply adding the total number of bytes of keys and values when `put` is called. If a key is put twice, though the skiplist only contains the latest value, you may count it twice in the approximate memtable size. Once a memtable reaches the limit, you should call `force_freeze_memtable` to freeze the memtable and create a new one. Because there could be multiple threads getting data into the storage engine, `force_freeze_memtable` might be called concurrently from multiple threads. You will need to think about how to avoid race conditions in this case. 
diff --git a/mini-lsm-book/src/week1-02-merge-iterator.md b/mini-lsm-book/src/week1-02-merge-iterator.md index d10bb6a93..df6896512 100644 --- a/mini-lsm-book/src/week1-02-merge-iterator.md +++ b/mini-lsm-book/src/week1-02-merge-iterator.md @@ -103,7 +103,7 @@ Starting this section, we will use `Key` to represent LSM key types and disti In this task, you will need to modify: ``` -src/iterators/lsm_iterator.rs +src/lsm_iterator.rs ``` We use the `LsmIterator` structure to represent the internal LSM iterators. You will need to modify this structure multiple times throughout the tutorial when more iterators are added into the system. For now, because we only have multiple memtables, it should be defined as: @@ -123,7 +123,7 @@ Then, we want to provide extra safety on the iterator to avoid users from misusi In this task, you will need to modify: ``` -src/iterators/lsm_storage.rs +src/lsm_storage.rs ``` We are finally there -- with all iterators you have implemented, you can finally implement the `scan` interface of the LSM engine. You can simply construct an LSM iterator with the memtable iterators (remember to put the latest memtable at the front of the merge iterator), and your storage engine will be able to handle the scan request. diff --git a/mini-lsm-book/src/week1-03-block.md b/mini-lsm-book/src/week1-03-block.md index 087e66c5c..9a90f1db9 100644 --- a/mini-lsm-book/src/week1-03-block.md +++ b/mini-lsm-book/src/week1-03-block.md @@ -77,7 +77,7 @@ In this task, you will need to modify: src/block/iterator.rs ``` -Now that we have an encoded block, we will need to implement the `StorageIterator` interface, so that the user can lookup/scan keys in the block. +Now that we have an encoded block, we will need to implement the `BlockIterator` interface, so that the user can lookup/scan keys in the block. `BlockIterator` can be created with an `Arc`. If `create_and_seek_to_first` is called, it will be positioned at the first key in the block. 
If `create_and_seek_to_key` is called, the iterator will be positioned at the first key that is `>=` the provided key. For example, if `1, 3, 5` is in a block. diff --git a/mini-lsm-book/src/week1-05-read-path.md b/mini-lsm-book/src/week1-05-read-path.md index 3c9c6ccc6..dc5d71907 100644 --- a/mini-lsm-book/src/week1-05-read-path.md +++ b/mini-lsm-book/src/week1-05-read-path.md @@ -84,5 +84,6 @@ We do not provide reference answers to the questions, and feel free to discuss a ## Bonus Tasks * **The Cost of Dynamic Dispatch.** Implement a `Box` version of merge iterators and benchmark to see the performance differences. +* **Parallel Seek.** Creating a merge iterator requires loading the first block of all underlying SSTs (when you create `SSTIterator`). You may parallelize the process of creating iterators. {{#include copyright.md}} diff --git a/mini-lsm-book/src/week1-07-sst-optimizations.md b/mini-lsm-book/src/week1-07-sst-optimizations.md index 6bab55de4..7abdd797c 100644 --- a/mini-lsm-book/src/week1-07-sst-optimizations.md +++ b/mini-lsm-book/src/week1-07-sst-optimizations.md @@ -23,9 +23,11 @@ Bloom filters are probabilistic data structures that maintains a set of keys. Yo You usually need to have a hash function in order to construct a bloom filter, and a key can have multiple hashes. Let us take a look at the below example. Assume that we already have hashes of some keys and the bloom filter has 7 bits. 
+[Note: If you want to understand bloom filters better, look [here](https://samwho.dev/bloom-filters/)] + ```plaintext hash1 = ((character - a) * 13) % 7 -hash1 = ((character - a) * 11) % 7 +hash2 = ((character - a) * 11) % 7 b -> 6 4 c -> 5 1 d -> 4 5 @@ -34,7 +36,7 @@ g -> 1 3 h -> 0 0 ``` -If we insert b, c, d into the 6-bit bloom filter, we will get: +If we insert b, c, d into the 7-bit bloom filter, we will get: ``` bit 0123456 diff --git a/mini-lsm-book/src/week2-01-compaction.md b/mini-lsm-book/src/week2-01-compaction.md index 74ed2a48c..c8bc8d696 100644 --- a/mini-lsm-book/src/week2-01-compaction.md +++ b/mini-lsm-book/src/week2-01-compaction.md @@ -15,6 +15,12 @@ cargo x copy-test --week 2 --day 1 cargo x scheck ``` +
+ +It might be helpful to take a look at [week 2 overview](./week2-overview.md) before reading this chapter to have a general overview of compactions. + +
+ ## Task 1: Compaction Implementation In this task, you will implement the core logic of doing a compaction -- merge sort a set of SST files into a sorted run. You will need to modify: diff --git a/mini-lsm-book/src/week2-02-simple.md b/mini-lsm-book/src/week2-02-simple.md index 1921259e1..dbf877040 100644 --- a/mini-lsm-book/src/week2-02-simple.md +++ b/mini-lsm-book/src/week2-02-simple.md @@ -14,6 +14,12 @@ cargo x copy-test --week 2 --day 2 cargo x scheck ``` +
+ +It might be helpful to take a look at [week 2 overview](./week2-overview.md) before reading this chapter to have a general overview of compactions. + +
+ ## Task 1: Simple Leveled Compaction In this chapter, we are going to implement our first compaction strategy -- simple leveled compaction. In this task, you will need to modify: diff --git a/mini-lsm-book/src/week2-03-tiered.md b/mini-lsm-book/src/week2-03-tiered.md index 68f506f90..b5b619e12 100644 --- a/mini-lsm-book/src/week2-03-tiered.md +++ b/mini-lsm-book/src/week2-03-tiered.md @@ -16,6 +16,12 @@ cargo x copy-test --week 2 --day 3 cargo x scheck ``` +
+ +It might be helpful to take a look at [week 2 overview](./week2-overview.md) before reading this chapter to have a general overview of compactions. + +
+ ## Task 1: Universal Compaction In this chapter, you will implement RocksDB's universal compaction, which is of the tiered compaction family compaction strategies. Similar to the simple leveled compaction strategy, we only use number of files as the indicator in this compaction strategy. And when we trigger the compaction jobs, we always include a full sorted run (tier) in the compaction job. @@ -28,7 +34,7 @@ In this task, you will need to modify: src/compact/tiered.rs ``` -In universal compaction, we do not use L0 SSTs in the LSM state. Instead, we directly flush new SSTs to a single sorted run (called tier). In the LSM state, `levels` will now include all tiers, where the lowest index is the latest SST flushed. The compaction simulator generates tier id based on the first SST id, and you should do the same in your implementation. +In universal compaction, we do not use L0 SSTs in the LSM state. Instead, we directly flush new SSTs to a single sorted run (called tier). In the LSM state, `levels` will now include all tiers, where **the lowest index is the latest SST flushed**. Each element in the `levels` vector stores a tuple: level ID (used as tier ID) and the SSTs in that level. Every time you flush L0 SSTs, you should flush the SST into a tier placed at the front of the vector. The compaction simulator generates tier id based on the first SST id, and you should do the same in your implementation. Universal compaction will only trigger tasks when the number of tiers (sorted runs) is larger than `num_tiers`. Otherwise, it does not trigger any compaction. @@ -130,6 +136,7 @@ As tiered compaction does not use the L0 level of the LSM state, you should dire * What happens if compaction speed cannot keep up with the SST flushes? * What might needs to be considered if the system schedules multiple compaction tasks in parallel? * SSDs also write its own logs (basically it is a log-structured storage). 
If the SSD has a write amplification of 2x, what is the end-to-end write amplification of the whole system? Related: [ZNS: Avoiding the Block Interface Tax for Flash-based SSDs](https://www.usenix.org/conference/atc21/presentation/bjorling). +* Consider the case that the user chooses to keep a large number of sorted runs (i.e., 300) for tiered compaction. To make the read path faster, is it a good idea to keep some data structure that helps reduce the time complexity (i.e., to `O(log n)`) of finding SSTs to read in each layer for some key ranges? Note that normally, you will need to do a binary search in each sorted run to find the key ranges that you will need to read. (Check out Neon's [layer map](https://neon.tech/blog/persistent-structures-in-neons-wal-indexing) implementation!) We do not provide reference answers to the questions, and feel free to discuss about them in the Discord community. diff --git a/mini-lsm-book/src/week2-04-leveled.md b/mini-lsm-book/src/week2-04-leveled.md index cbf629bae..371041591 100644 --- a/mini-lsm-book/src/week2-04-leveled.md +++ b/mini-lsm-book/src/week2-04-leveled.md @@ -14,6 +14,12 @@ cargo x copy-test --week 2 --day 4 cargo x scheck ``` +
+ +It might be helpful to take a look at [week 2 overview](./week2-overview.md) before reading this chapter to have a general overview of compactions. + +
+ ## Task 1: Leveled Compaction In chapter 2 day 2, you have implemented the simple leveled compaction strategies. However, the implementation has a few problems: @@ -43,21 +49,25 @@ You will need to compute the target sizes of the levels. Assume `base_level_size [0 0 0 0 0 200MB] ``` -When the levels grow in size as more SSTs get compacted to that level, we will compute the target size based on the size of the last level. When the actual size of SST files in the last level reaches 200MB, for example, 300MB, we will compute the target size of the other levels by dividing the `level_size_multiplier`. Assume `level_size_multiplier=10`. +Before the bottom level exceeds `base_level_size_mb`, all other intermediate levels will have target sizes of 0. The idea is that when the total amount of data is small, it's wasteful to create intermediate levels. + +When the bottom level reaches or exceeds `base_level_size_mb`, we will compute the target size of each of the other levels by dividing the size of the level below it by `level_size_multiplier`. Assume the bottom level contains 300MB of data, and `level_size_multiplier=10`. ``` 0 0 0 0 30MB 300MB ``` -We will only keep at most *one* level below `base_level_size_mb`, and in this case, it is L5. Assume we now have 30GB files in the last level, the target sizes will be, +In addition, at most *one* level can have a positive target size below `base_level_size_mb`. Assume we now have 30GB files in the last level, the target sizes will be, ``` 0 0 30MB 300MB 3GB 30GB ``` +Notice in this case L1 and L2 have a target size of 0, and L3 is the only level with a positive target size below `base_level_size_mb`. + ### Task 1.2: Decide Base Level -Now, let us solve the problem that SSTs may be compacted across empty levels in the simple leveled compaction strategy. When we compact L0 SSTs with lower levels, we do not directly put it to L1. Instead, we compact it with the first level with `target size > 0``. 
For example, when the target level sizes are: +Now, let us solve the problem that SSTs may be compacted across empty levels in the simple leveled compaction strategy. When we compact L0 SSTs with lower levels, we do not directly put it to L1. Instead, we compact it with the first level with `target size > 0`. For example, when the target level sizes are: ``` 0 0 0 0 30MB 300MB @@ -89,7 +99,7 @@ The number of levels in the compaction simulator is 4. Therefore, the SSTs shoul ### Task 1.3: Decide Level Priorities -Now that we will need to handle compactions below L0. L0 compaction always has the top priority, that you should compact L0 with other levels first if it reaches the threshold. After that, we can compute the compaction priorities of each level by `current_size / target_size`. We only compact levels with this ratio `> 1.0` The one with the largest ratio will be chosen for compaction with the lower level. For example, if we have: +Now we will need to handle compactions below L0. L0 compaction always has the top priority, so you should compact L0 with other levels first if it reaches the threshold. After that, we can compute the compaction priorities of each level by `current_size / target_size`. We only compact levels with this ratio `> 1.0`. The one with the largest ratio will be chosen for compaction with the lower level. For example, if we have: ``` L3: 200MB, target_size=20MB diff --git a/mini-lsm-book/src/week2-05-manifest.md b/mini-lsm-book/src/week2-05-manifest.md index c4d898c06..2361ed0ac 100644 --- a/mini-lsm-book/src/week2-05-manifest.md +++ b/mini-lsm-book/src/week2-05-manifest.md @@ -7,6 +7,13 @@ In this chapter, you will: * Implement encoding and decoding of the manifest file. * Recover from the manifest when the system restarts. 
+To copy the test cases into the starter code and run them, + +``` +cargo x copy-test --week 2 --day 5 +cargo x scheck +``` + ## Task 1: Manifest Encoding The system uses a manifest file to record all operations happened in the engine. Currently, there are only two types of them: compaction and SST flush. When the engine restarts, it will read the manifest file, reconstruct the state, and load the SST files on the disk. @@ -87,5 +94,6 @@ get 1500 ## Bonus Tasks * **Manifest Compaction.** When the number of logs in the manifest file gets too large, you can rewrite the manifest file to only store the current snapshot and append new logs to that file. +* **Parallel Open.** After you collect the list of SSTs to open, you can open and decode them in parallel, instead of doing it one by one, therefore accelerating the recovery process. {{#include copyright.md}} diff --git a/mini-lsm-book/src/week2-06-wal.md b/mini-lsm-book/src/week2-06-wal.md index fb4ebc998..b9500f970 100644 --- a/mini-lsm-book/src/week2-06-wal.md +++ b/mini-lsm-book/src/week2-06-wal.md @@ -7,6 +7,13 @@ In this chapter, you will: * Implement encoding and decoding of the write-ahead log file. * Recover memtables from the WALs when the system restarts. +To copy the test cases into the starter code and run them, + +``` +cargo x copy-test --week 2 --day 6 +cargo x scheck +``` + ## Task 1: WAL Encoding In this task, you will need to modify: diff --git a/mini-lsm-book/src/week3-04-watermark.md b/mini-lsm-book/src/week3-04-watermark.md index 75fc879cc..cc95c615f 100644 --- a/mini-lsm-book/src/week3-04-watermark.md +++ b/mini-lsm-book/src/week3-04-watermark.md @@ -74,6 +74,8 @@ Assume these are all keys in the engine. If we do a scan at ts=3, we will get `a * In our implementation, we manage watermarks by ourselves with the lifecycle of `Transaction` (so-called un-managed mode). 
If the user intends to manage key timestamps and the watermarks by themselves (i.e., when they have their own timestamp generator), what do you need to do in the write_batch/get/scan API to validate their requests? Is there any architectural assumption we had that might be hard to maintain in this case? * Why do we need to store an `Arc` of `Transaction` inside a transaction iterator? +* What is the condition to fully remove a key from the SST file? +* For now, we only remove a key when compacting to the bottom-most level. Is there any other prior time that we can remove the key? (Hint: you know the start/end key of each SST in all levels.) ## Bonus Tasks diff --git a/mini-lsm-book/src/week3-06-serializable.md b/mini-lsm-book/src/week3-06-serializable.md index 1aa1bfe82..232642364 100644 --- a/mini-lsm-book/src/week3-06-serializable.md +++ b/mini-lsm-book/src/week3-06-serializable.md @@ -102,11 +102,21 @@ You can skip the check if `write_set` is empty. A read-only transaction can alwa You should also modify the `put`, `delete`, and `write_batch` interface in `LsmStorageInner`. We recommend you define a helper function `write_batch_inner` that processes a write batch. If `options.serializable = true`, `put`, `delete`, and the user-facing `write_batch` should create a transaction instead of directly creating a write batch. Your write batch helper function should also return a `u64` commit timestamp so that `Transaction::Commit` can correctly store the committed transaction data into the MVCC structure. +## Task 4: Garbage Collection + +In this task, you will need to modify: + +``` +src/mvcc/txn.rs +``` + +When you commit a transaction, you can also clean up the committed txn map to remove all transactions below the watermark, as they will not be involved in any future serializable validations. 
+ ## Test Your Understanding * If you have some experience with building a relational database, you may think about the following question: assume that we build a database based on Mini-LSM where we store each row in the relation table as a key-value pair (key: primary key, value: serialized row) and enable serializable verification, does the database system directly gain ANSI serializable isolation level capability? Why or why not? * The thing we implement here is actually write snapshot-isolation (see [A critique of snapshot isolation](https://dl.acm.org/doi/abs/10.1145/2168836.2168853)) that guarantees serializable. Is there any cases where the execution is serializable, but will be rejected by the write snapshot-isolation validation? -* There are databases that claim they have serializable snapshot isolation support by only tracking the keys accessed in gets and scans. Do they really prevent write skews caused by phantoms? (Okay... Actually, I'm talking about [BadgerDB](https://dgraph.io/blog/post/badger-txn/).) +* There are databases that claim they have serializable snapshot isolation support by only tracking the keys accessed in gets and scans (instead of key range). Do they really prevent write skews caused by phantoms? (Okay... Actually, I'm talking about [BadgerDB](https://dgraph.io/blog/post/badger-txn/).) We do not provide reference answers to the questions, and feel free to discuss about them in the Discord community. diff --git a/mini-lsm-book/src/week3-07-compaction-filter.md b/mini-lsm-book/src/week3-07-compaction-filter.md index 82213a63b..a9a5eea0f 100644 --- a/mini-lsm-book/src/week3-07-compaction-filter.md +++ b/mini-lsm-book/src/week3-07-compaction-filter.md @@ -1,4 +1,46 @@ -# Snack Time: Compaction Filter +# Snack Time: Compaction Filters +Congratulations! You made it there! In the previous chapter, you made your LSM engine multi-version capable, and the users can use transaction APIs to interact with your storage engine. 
At the end of this week, we will implement some easy but important features of the storage engine. Welcome to Mini-LSM's week 3 snack time! + +In this chapter, we will generalize our compaction garbage collection logic to become compaction filters. + +For now, our compaction will simply retain the keys above the watermark and the latest version of the keys below the watermark. We can add some magic to the compaction process to help the user collect some unused data automatically as a background job. + +Consider a case that the user uses Mini-LSM to store database tables. Each row in the table is prefixed with the table name. For example, + +``` +table1_key1 -> row +table1_key2 -> row +table1_key3 -> row +table2_key1 -> row +table2_key2 -> row +``` + +Now the user executes `DROP TABLE table1`. The engine will need to clean up all the data beginning with `table1`. + +There are a lot of ways to achieve the goal. The user of Mini-LSM can scan all the keys beginning with `table1` and request the engine to delete them. However, scanning a very large database might be slow, and it will generate the same number of delete tombstones as the existing keys. Therefore, scan-and-delete will not free up the space occupied by the dropped table -- instead, it will add more data to the engine and the space can only be reclaimed when the tombstones reach the bottom level of the engine. + +Or, they can create column families (we will talk about this in *rest of your life* chapter). They store each table in a column family, which is a standalone LSM state, and directly remove the SST files corresponding to the column family when the user drops the table. + +In this tutorial, we will implement the third approach: compaction filters. Compaction filters can be dynamically added to the engine at runtime. During the compaction, if a key matching the compaction filter is found, we can silently remove it in the background. 
Therefore, the user can attach a compaction filter of `prefix=table1` to the engine, and all these keys will be removed during compaction. + +## Task 1: Compaction Filter + +In this task, you will need to modify: + +``` +src/compact.rs +``` + +You can iterate over all compaction filters in `LsmStorageInner::compaction_filters`. If the first version of the key below the watermark matches the compaction filter, simply remove it instead of keeping it in the SST file. + +To run test cases, + +``` +cargo x copy-test --week 3 --day 7 +cargo x scheck +``` + +You can assume that the user will not get the keys within the prefix filter range. And, they will not scan the keys in the prefix range. Therefore, it is okay to return a wrong value when a user requests the keys in the prefix filter range (i.e., undefined behavior). {{#include copyright.md}} diff --git a/mini-lsm-book/src/week3-overview.md b/mini-lsm-book/src/week3-overview.md index 0bbfac5c6..500c4f15b 100644 --- a/mini-lsm-book/src/week3-overview.md +++ b/mini-lsm-book/src/week3-overview.md @@ -2,6 +2,8 @@ In this part, you will implement MVCC over the LSM engine that you have built in the previous two weeks. We will add timestamp encoding in the keys to maintain multiple versions of a key, and change some part of the engine to ensure old data are either retained or garbage-collected based on whether there are users reading an old version. +The general approach of the MVCC part in this tutorial is inspired by and partially based on [BadgerDB](https://github.com/dgraph-io/badger). + The key of MVCC is to store and access multiple versions of a key in the storage engine. Therefore, we will need to change the key format to `user_key + timestamp (u64)`. And on the user interface side, we will need to have new APIs to help users to gain access to a history version. In summary, we will add a monotonically-increasing timestamp to the key. 
In previous parts, we assumed that newer keys are in the upper level of the LSM tree, and older keys are in the lower level of the LSM tree. During compaction, we only keep the latest version of a key if multiple versions are found in multiple levels, and the compaction process will ensure that newer keys will be kept on the upper level by only merging adjacent levels/tiers. In the MVCC implementation, the key with a larger timestamp is the newest key. During compaction, we can only remove the key if no user is accessing an older version of the database. Though not keeping the latest version of key in the upper level may still yield a correct result for the MVCC LSM implementation, in our tutorial, we choose to keep the invariant, and if there are multiple versions of a key, a later version will always appear in a upper level. @@ -16,7 +18,7 @@ put/delete/write_batch(key, timestamp) set_watermark(timestamp) # we will talk about watermarks soon! ``` -**Un-managed Mode APIs** +**Un-managed/Normal Mode APIs** ``` get(key) -> value scan(key_range) -> iterator diff --git a/mini-lsm-mvcc/Cargo.toml b/mini-lsm-mvcc/Cargo.toml index 15213c311..00b2e5bfd 100644 --- a/mini-lsm-mvcc/Cargo.toml +++ b/mini-lsm-mvcc/Cargo.toml @@ -25,6 +25,8 @@ serde_json = { version = "1.0" } serde = { version = "1.0", features = ["derive"] } farmhash = "1" crc32fast = "1.3.2" +nom = "7.1.3" +rustyline = "13.0.0" [dev-dependencies] tempfile = "3" diff --git a/mini-lsm-mvcc/src/compact.rs b/mini-lsm-mvcc/src/compact.rs index 06df389e9..cc52c63fe 100644 --- a/mini-lsm-mvcc/src/compact.rs +++ b/mini-lsm-mvcc/src/compact.rs @@ -19,7 +19,7 @@ use crate::iterators::merge_iterator::MergeIterator; use crate::iterators::two_merge_iterator::TwoMergeIterator; use crate::iterators::StorageIterator; use crate::key::KeySlice; -use crate::lsm_storage::{LsmStorageInner, LsmStorageState}; +use crate::lsm_storage::{CompactionFilter, LsmStorageInner, LsmStorageState}; use crate::manifest::ManifestRecord; use 
crate::table::{SsTable, SsTableBuilder, SsTableIterator}; @@ -122,7 +122,8 @@ impl LsmStorageInner { let watermark = self.mvcc().watermark(); let mut last_key = Vec::::new(); let mut first_key_below_watermark = false; - while iter.is_valid() { + let compaction_filters = self.compaction_filters.lock().clone(); + 'outer: while iter.is_valid() { if builder.is_none() { builder = Some(SsTableBuilder::new(self.options.block_size)); } @@ -144,12 +145,26 @@ impl LsmStorageInner { continue; } - if same_as_last_key && iter.key().ts() <= watermark { - if !first_key_below_watermark { + if iter.key().ts() <= watermark { + if same_as_last_key && !first_key_below_watermark { iter.next()?; continue; } + first_key_below_watermark = false; + + if !compaction_filters.is_empty() { + for filter in &compaction_filters { + match filter { + CompactionFilter::Prefix(x) => { + if iter.key().key_ref().starts_with(x) { + iter.next()?; + continue 'outer; + } + } + } + } + } } let builder_inner = builder.as_mut().unwrap(); diff --git a/mini-lsm-mvcc/src/compact/leveled.rs b/mini-lsm-mvcc/src/compact/leveled.rs index 213661888..043bc9273 100644 --- a/mini-lsm-mvcc/src/compact/leveled.rs +++ b/mini-lsm-mvcc/src/compact/leveled.rs @@ -118,6 +118,7 @@ impl LeveledCompactionController { } } priorities.sort_by(|a, b| a.partial_cmp(b).unwrap().reverse()); + let priority = priorities.first(); if let Some((_, level)) = priority { println!( diff --git a/mini-lsm-mvcc/src/iterators/merge_iterator.rs b/mini-lsm-mvcc/src/iterators/merge_iterator.rs index c4abc8d3e..b1f5bdf77 100644 --- a/mini-lsm-mvcc/src/iterators/merge_iterator.rs +++ b/mini-lsm-mvcc/src/iterators/merge_iterator.rs @@ -37,7 +37,7 @@ impl Ord for HeapWrapper { } /// Merge multiple iterators of the same type. If the same key occurs multiple times in some -/// iterators, perfer the one with smaller index. +/// iterators, prefer the one with smaller index. 
pub struct MergeIterator { iters: BinaryHeap>, current: Option>, diff --git a/mini-lsm-mvcc/src/iterators/two_merge_iterator.rs b/mini-lsm-mvcc/src/iterators/two_merge_iterator.rs index 8488cd282..2ff2ce3fa 100644 --- a/mini-lsm-mvcc/src/iterators/two_merge_iterator.rs +++ b/mini-lsm-mvcc/src/iterators/two_merge_iterator.rs @@ -53,8 +53,10 @@ impl< fn key(&self) -> A::KeyType<'_> { if self.choose_a { + debug_assert!(self.a.is_valid()); self.a.key() } else { + debug_assert!(self.b.is_valid()); self.b.key() } } diff --git a/mini-lsm-mvcc/src/lsm_iterator.rs b/mini-lsm-mvcc/src/lsm_iterator.rs index ad3242447..36a87bfb9 100644 --- a/mini-lsm-mvcc/src/lsm_iterator.rs +++ b/mini-lsm-mvcc/src/lsm_iterator.rs @@ -136,14 +136,14 @@ impl StorageIterator for FusedIterator { } fn key(&self) -> Self::KeyType<'_> { - if self.has_errored || !self.iter.is_valid() { + if !self.is_valid() { panic!("invalid access to the underlying iterator"); } self.iter.key() } fn value(&self) -> &[u8] { - if self.has_errored || !self.iter.is_valid() { + if !self.is_valid() { panic!("invalid access to the underlying iterator"); } self.iter.value() diff --git a/mini-lsm-mvcc/src/lsm_storage.rs b/mini-lsm-mvcc/src/lsm_storage.rs index c284b4a75..25786dfd1 100644 --- a/mini-lsm-mvcc/src/lsm_storage.rs +++ b/mini-lsm-mvcc/src/lsm_storage.rs @@ -149,6 +149,11 @@ fn key_within(user_key: &[u8], table_begin: KeySlice, table_end: KeySlice) -> bo table_begin.key_ref() <= user_key && user_key <= table_end.key_ref() } +#[derive(Clone, Debug)] +pub enum CompactionFilter { + Prefix(Bytes), +} + /// The storage interface of the LSM tree. pub(crate) struct LsmStorageInner { pub(crate) state: Arc>>, @@ -160,6 +165,7 @@ pub(crate) struct LsmStorageInner { pub(crate) compaction_controller: CompactionController, pub(crate) manifest: Option, pub(crate) mvcc: Option, + pub(crate) compaction_filters: Arc>>, } /// A thin wrapper for `LsmStorageInner` and the user interface for MiniLSM. 
@@ -243,6 +249,10 @@ impl MiniLsm { })) } + pub fn add_compaction_filter(&self, compaction_filter: CompactionFilter) { + self.inner.add_compaction_filter(compaction_filter) + } + pub fn get(&self, key: &[u8]) -> Result> { self.inner.get(key) } @@ -431,12 +441,18 @@ impl LsmStorageInner { manifest: Some(manifest), options: options.into(), mvcc: Some(LsmMvccInner::new(last_commit_ts)), + compaction_filters: Arc::new(Mutex::new(Vec::new())), }; storage.sync_dir()?; Ok(storage) } + pub fn add_compaction_filter(&self, compaction_filter: CompactionFilter) { + let mut compaction_filters = self.compaction_filters.lock(); + compaction_filters.push(compaction_filter); + } + pub fn sync(&self) -> Result<()> { self.state.read().memtable.sync_wal() } diff --git a/mini-lsm-mvcc/src/mem_table.rs b/mini-lsm-mvcc/src/mem_table.rs index 5b6dd5008..92d90d764 100644 --- a/mini-lsm-mvcc/src/mem_table.rs +++ b/mini-lsm-mvcc/src/mem_table.rs @@ -152,8 +152,7 @@ impl MemTable { item: (KeyBytes::new(), Bytes::new()), } .build(); - let entry = iter.with_iter_mut(|iter| MemTableIterator::entry_to_item(iter.next())); - iter.with_mut(|x| *x.item = entry); + iter.next().unwrap(); iter } diff --git a/mini-lsm-mvcc/src/tests.rs b/mini-lsm-mvcc/src/tests.rs index ab4e40520..7b12c1134 100644 --- a/mini-lsm-mvcc/src/tests.rs +++ b/mini-lsm-mvcc/src/tests.rs @@ -18,3 +18,4 @@ mod week3_day3; mod week3_day4; mod week3_day5; mod week3_day6; +mod week3_day7; diff --git a/mini-lsm-mvcc/src/tests/week3_day7.rs b/mini-lsm-mvcc/src/tests/week3_day7.rs new file mode 100644 index 000000000..bfbc05d7a --- /dev/null +++ b/mini-lsm-mvcc/src/tests/week3_day7.rs @@ -0,0 +1,70 @@ +use bytes::Bytes; +use tempfile::tempdir; + +use crate::{ + compact::CompactionOptions, + lsm_storage::{CompactionFilter, LsmStorageOptions, MiniLsm, WriteBatchRecord}, +}; + +use super::harness::{check_iter_result_by_key, construct_merge_iterator_over_storage}; + +#[test] +fn test_task3_mvcc_compaction() { + let dir = tempdir().unwrap(); 
+ let options = LsmStorageOptions::default_for_week2_test(CompactionOptions::NoCompaction); + let storage = MiniLsm::open(&dir, options.clone()).unwrap(); + storage + .write_batch(&[ + WriteBatchRecord::Put("table1_a", "1"), + WriteBatchRecord::Put("table1_b", "1"), + WriteBatchRecord::Put("table1_c", "1"), + WriteBatchRecord::Put("table2_a", "1"), + WriteBatchRecord::Put("table2_b", "1"), + WriteBatchRecord::Put("table2_c", "1"), + ]) + .unwrap(); + storage.force_flush().unwrap(); + let snapshot0 = storage.new_txn().unwrap(); + storage + .write_batch(&[ + WriteBatchRecord::Put("table1_a", "2"), + WriteBatchRecord::Del("table1_b"), + WriteBatchRecord::Put("table1_c", "2"), + WriteBatchRecord::Put("table2_a", "2"), + WriteBatchRecord::Del("table2_b"), + WriteBatchRecord::Put("table2_c", "2"), + ]) + .unwrap(); + storage.force_flush().unwrap(); + storage.add_compaction_filter(CompactionFilter::Prefix(Bytes::from("table2_"))); + storage.force_full_compaction().unwrap(); + + let mut iter = construct_merge_iterator_over_storage(&storage.inner.state.read()); + check_iter_result_by_key( + &mut iter, + vec![ + (Bytes::from("table1_a"), Bytes::from("2")), + (Bytes::from("table1_a"), Bytes::from("1")), + (Bytes::from("table1_b"), Bytes::new()), + (Bytes::from("table1_b"), Bytes::from("1")), + (Bytes::from("table1_c"), Bytes::from("2")), + (Bytes::from("table1_c"), Bytes::from("1")), + (Bytes::from("table2_a"), Bytes::from("2")), + (Bytes::from("table2_b"), Bytes::new()), + (Bytes::from("table2_c"), Bytes::from("2")), + ], + ); + + drop(snapshot0); + + storage.force_full_compaction().unwrap(); + + let mut iter = construct_merge_iterator_over_storage(&storage.inner.state.read()); + check_iter_result_by_key( + &mut iter, + vec![ + (Bytes::from("table1_a"), Bytes::from("2")), + (Bytes::from("table1_c"), Bytes::from("2")), + ], + ); +} diff --git a/mini-lsm-starter/Cargo.toml b/mini-lsm-starter/Cargo.toml index 25943e510..d824d0300 100644 --- a/mini-lsm-starter/Cargo.toml +++ 
b/mini-lsm-starter/Cargo.toml @@ -19,6 +19,9 @@ crossbeam-channel = "0.5.11" serde_json = { version = "1.0" } serde = { version = "1.0", features = ["derive"] } farmhash = "1" +nom = "7.1.3" +rustyline = "13.0.0" +crc32fast = "1.3.2" [dev-dependencies] tempfile = "3" diff --git a/mini-lsm-starter/src/bin/compaction-simulator.rs b/mini-lsm-starter/src/bin/compaction-simulator.rs index 18eba53d4..978fedeca 100644 --- a/mini-lsm-starter/src/bin/compaction-simulator.rs +++ b/mini-lsm-starter/src/bin/compaction-simulator.rs @@ -289,7 +289,7 @@ fn main() { storage.snapshot = snapshot; storage.remove(&del); println!("--- After Compaction ---"); - if dump_real_id { + if !dump_real_id { storage.dump_real_id(true, false); } else { storage.dump_original_id(true, false); diff --git a/mini-lsm-starter/src/bin/mini-lsm-cli.rs b/mini-lsm-starter/src/bin/mini-lsm-cli.rs index f44ef912d..ee2290cb6 100644 --- a/mini-lsm-starter/src/bin/mini-lsm-cli.rs +++ b/mini-lsm-starter/src/bin/mini-lsm-cli.rs @@ -1,5 +1,4 @@ mod wrapper; -use wrapper::mini_lsm_wrapper; use anyhow::Result; use bytes::Bytes; @@ -10,7 +9,10 @@ use mini_lsm_wrapper::compact::{ }; use mini_lsm_wrapper::iterators::StorageIterator; use mini_lsm_wrapper::lsm_storage::{LsmStorageOptions, MiniLsm}; +use rustyline::DefaultEditor; use std::path::PathBuf; +use std::sync::Arc; +use wrapper::mini_lsm_wrapper; #[derive(Debug, Clone, ValueEnum)] enum CompactionStrategy { @@ -33,6 +35,279 @@ struct Args { serializable: bool, } +struct ReplHandler { + epoch: u64, + lsm: Arc, +} + +impl ReplHandler { + fn handle(&mut self, command: &Command) -> Result<()> { + match command { + Command::Fill { begin, end } => { + for i in *begin..=*end { + self.lsm.put( + format!("{}", i).as_bytes(), + format!("value{}@{}", i, self.epoch).as_bytes(), + )?; + } + + println!( + "{} values filled with epoch {}", + end - begin + 1, + self.epoch + ); + } + Command::Del { key } => { + self.lsm.delete(key.as_bytes())?; + println!("{} deleted", key); + } + 
Command::Get { key } => { + if let Some(value) = self.lsm.get(key.as_bytes())? { + println!("{}={:?}", key, value); + } else { + println!("{} not exist", key); + } + } + Command::Scan { begin, end } => match (begin, end) { + (None, None) => { + let mut iter = self + .lsm + .scan(std::ops::Bound::Unbounded, std::ops::Bound::Unbounded)?; + let mut cnt = 0; + while iter.is_valid() { + println!( + "{:?}={:?}", + Bytes::copy_from_slice(iter.key()), + Bytes::copy_from_slice(iter.value()), + ); + iter.next()?; + cnt += 1; + } + println!(); + println!("{} keys scanned", cnt); + } + (Some(begin), Some(end)) => { + let mut iter = self.lsm.scan( + std::ops::Bound::Included(begin.as_bytes()), + std::ops::Bound::Included(end.as_bytes()), + )?; + let mut cnt = 0; + while iter.is_valid() { + println!( + "{:?}={:?}", + Bytes::copy_from_slice(iter.key()), + Bytes::copy_from_slice(iter.value()), + ); + iter.next()?; + cnt += 1; + } + println!(); + println!("{} keys scanned", cnt); + } + _ => { + println!("invalid command"); + } + }, + Command::Dump => { + self.lsm.dump_structure(); + println!("dump success"); + } + Command::Flush => { + self.lsm.force_flush()?; + println!("flush success"); + } + Command::FullCompaction => { + self.lsm.force_full_compaction()?; + println!("full compaction success"); + } + Command::Quit | Command::Close => std::process::exit(0), + }; + + self.epoch += 1; + + Ok(()) + } +} + +#[derive(Debug)] +enum Command { + Fill { + begin: u64, + end: u64, + }, + Del { + key: String, + }, + Get { + key: String, + }, + Scan { + begin: Option, + end: Option, + }, + + Dump, + Flush, + FullCompaction, + Quit, + Close, +} + +impl Command { + pub fn parse(input: &str) -> Result { + use nom::bytes::complete::*; + use nom::character::complete::*; + + use nom::branch::*; + use nom::combinator::*; + use nom::sequence::*; + + let uint = |i| { + map_res(digit1::<&str, nom::error::Error<_>>, |s: &str| { + s.parse() + .map_err(|_| nom::error::Error::new(s, 
nom::error::ErrorKind::Digit)) + })(i) + }; + + let string = |i| { + map(take_till1(|c: char| c.is_whitespace()), |s: &str| { + s.to_string() + })(i) + }; + + let fill = |i| { + map( + tuple((tag_no_case("fill"), space1, uint, space1, uint)), + |(_, _, key, _, value)| Command::Fill { + begin: key, + end: value, + }, + )(i) + }; + + let del = |i| { + map( + tuple((tag_no_case("del"), space1, string)), + |(_, _, key)| Command::Del { key }, + )(i) + }; + + let get = |i| { + map( + tuple((tag_no_case("get"), space1, string)), + |(_, _, key)| Command::Get { key }, + )(i) + }; + + let scan = |i| { + map( + tuple(( + tag_no_case("scan"), + opt(tuple((space1, string, space1, string))), + )), + |(_, opt_args)| { + let (begin, end) = opt_args + .map_or((None, None), |(_, begin, _, end)| (Some(begin), Some(end))); + Command::Scan { begin, end } + }, + )(i) + }; + + let command = |i| { + alt(( + fill, + del, + get, + scan, + map(tag_no_case("dump"), |_| Command::Dump), + map(tag_no_case("flush"), |_| Command::Flush), + map(tag_no_case("full_compaction"), |_| Command::FullCompaction), + map(tag_no_case("quit"), |_| Command::Quit), + map(tag_no_case("close"), |_| Command::Close), + ))(i) + }; + + command(input) + .map(|(_, c)| c) + .map_err(|e| anyhow::anyhow!("{}", e)) + } +} + +struct Repl { + app_name: String, + description: String, + prompt: String, + + handler: ReplHandler, + + editor: DefaultEditor, +} + +impl Repl { + pub fn run(mut self) -> Result<()> { + self.bootstrap()?; + + loop { + let readline = self.editor.readline(&self.prompt)?; + if readline.trim().is_empty() { + // Skip noop + continue; + } + let command = Command::parse(&readline)?; + self.handler.handle(&command)?; + self.editor.add_history_entry(readline)?; + } + } + + fn bootstrap(&mut self) -> Result<()> { + println!("Welcome to {}!", self.app_name); + println!("{}", self.description); + println!(); + Ok(()) + } +} + +struct ReplBuilder { + app_name: String, + description: String, + prompt: String, +} + 
+impl ReplBuilder { + pub fn new() -> Self { + Self { + app_name: "mini-lsm-cli".to_string(), + description: "A CLI for mini-lsm".to_string(), + prompt: "mini-lsm-cli> ".to_string(), + } + } + + pub fn app_name(mut self, app_name: &str) -> Self { + self.app_name = app_name.to_string(); + self + } + + pub fn description(mut self, description: &str) -> Self { + self.description = description.to_string(); + self + } + + pub fn prompt(mut self, prompt: &str) -> Self { + self.prompt = prompt.to_string(); + self + } + + pub fn build(self, handler: ReplHandler) -> Result { + Ok(Repl { + app_name: self.app_name, + description: self.description, + prompt: self.prompt, + editor: DefaultEditor::new()?, + handler, + }) + } +} + fn main() -> Result<()> { let args = Args::parse(); let lsm = MiniLsm::open( @@ -69,97 +344,13 @@ fn main() -> Result<()> { serializable: args.serializable, }, )?; - let mut epoch = 0; - loop { - let mut line = String::new(); - std::io::stdin().read_line(&mut line)?; - let line = line.trim().to_string(); - if line.starts_with("fill ") { - let Some((_, options)) = line.split_once(' ') else { - println!("invalid command"); - continue; - }; - let Some((begin, end)) = options.split_once(' ') else { - println!("invalid command"); - continue; - }; - let begin = begin.parse::()?; - let end = end.parse::()?; - - for i in begin..=end { - lsm.put( - format!("{}", i).as_bytes(), - format!("value{}@{}", i, epoch).as_bytes(), - )?; - } - println!("{} values filled with epoch {}", end - begin + 1, epoch); - } else if line.starts_with("del ") { - let Some((_, key)) = line.split_once(' ') else { - println!("invalid command"); - continue; - }; - lsm.delete(key.as_bytes())?; - } else if line.starts_with("get ") { - let Some((_, key)) = line.split_once(' ') else { - println!("invalid command"); - continue; - }; - if let Some(value) = lsm.get(key.as_bytes())? 
{ - println!("{}={:?}", key, value); - } else { - println!("{} not exist", key); - } - } else if line == "scan" { - let mut iter = lsm.scan(std::ops::Bound::Unbounded, std::ops::Bound::Unbounded)?; - let mut cnt = 0; - while iter.is_valid() { - println!( - "{:?}={:?}", - Bytes::copy_from_slice(iter.key()), - Bytes::copy_from_slice(iter.value()), - ); - iter.next()?; - cnt += 1; - } - println!("{} keys scanned", cnt); - } else if line.starts_with("scan ") { - let Some((_, rest)) = line.split_once(' ') else { - println!("invalid command"); - continue; - }; - let Some((begin_key, end_key)) = rest.split_once(' ') else { - println!("invalid command"); - continue; - }; - let mut iter = lsm.scan( - std::ops::Bound::Included(begin_key.as_bytes()), - std::ops::Bound::Included(end_key.as_bytes()), - )?; - let mut cnt = 0; - while iter.is_valid() { - println!( - "{:?}={:?}", - Bytes::copy_from_slice(iter.key()), - Bytes::copy_from_slice(iter.value()), - ); - iter.next()?; - cnt += 1; - } - println!("{} keys scanned", cnt); - } else if line == "dump" { - lsm.dump_structure(); - } else if line == "flush" { - lsm.force_flush()?; - } else if line == "full_compaction" { - lsm.force_full_compaction()?; - } else if line == "quit" || line == "close" { - lsm.close()?; - break; - } else { - println!("invalid command: {}", line); - } - epoch += 1; - } + let repl = ReplBuilder::new() + .app_name("mini-lsm-cli") + .description("A CLI for mini-lsm") + .prompt("mini-lsm-cli> ") + .build(ReplHandler { epoch: 0, lsm })?; + + repl.run()?; Ok(()) } diff --git a/mini-lsm-starter/src/block.rs b/mini-lsm-starter/src/block.rs index 41e3df2d4..0f907adc7 100644 --- a/mini-lsm-starter/src/block.rs +++ b/mini-lsm-starter/src/block.rs @@ -1,13 +1,16 @@ -#![allow(unused_variables)] // TODO(you): remove this lint after implementing this mod -#![allow(dead_code)] // TODO(you): remove this lint after implementing this mod - mod builder; mod iterator; +use std::u8; + pub use builder::BlockBuilder; -use 
bytes::Bytes; +use bytes::{Buf, BufMut, Bytes}; pub use iterator::BlockIterator; +use crate::key::KeyVec; + +pub(crate) const SIZEOF_U16: usize = std::mem::size_of::(); + /// A block is the smallest unit of read and caching in LSM tree. It is a collection of sorted key-value pairs. pub struct Block { pub(crate) data: Vec, @@ -18,11 +21,37 @@ impl Block { /// Encode the internal data to the data layout illustrated in the tutorial /// Note: You may want to recheck if any of the expected field is missing from your output pub fn encode(&self) -> Bytes { - unimplemented!() + let mut buf = self.data.clone(); + for offset in &self.offsets { + buf.put_u16(*offset); + } + // num of elements + buf.put_u16(self.offsets.len() as u16); + buf.into() } /// Decode from the data layout, transform the input `data` to a single `Block` pub fn decode(data: &[u8]) -> Self { - unimplemented!() + let num_of_elements = (&data[data.len() - SIZEOF_U16..]).get_u16(); + // transform offsets + let data_end = data.len() - SIZEOF_U16 - SIZEOF_U16 * (num_of_elements as usize); + let offsets = data[data_end..data.len() - SIZEOF_U16] + .chunks(SIZEOF_U16) + .map(|mut x| x.get_u16()) + .collect(); + // transform data + let data = data[..data_end].to_vec(); + Self { data, offsets } + } + + fn get_first_key(&self) -> KeyVec { + // The first key is the first entry in the data section + // redundant key is 0 + let mut entry = &self.data[0..]; + entry.get_u16(); + let key_len = entry.get_u16() as usize; + let key = &entry[..key_len]; + entry.advance(key_len); + KeyVec::from_vec_with_ts(key.to_vec(), entry.get_u64()) } } diff --git a/mini-lsm-starter/src/block/builder.rs b/mini-lsm-starter/src/block/builder.rs index 4c8395997..96698cb4d 100644 --- a/mini-lsm-starter/src/block/builder.rs +++ b/mini-lsm-starter/src/block/builder.rs @@ -1,11 +1,17 @@ -#![allow(unused_variables)] // TODO(you): remove this lint after implementing this mod -#![allow(dead_code)] // TODO(you): remove this lint after implementing 
this mod +use bytes::BufMut; use crate::key::{KeySlice, KeyVec}; -use super::Block; +use super::{Block, SIZEOF_U16}; /// Builds a block. +/* +---------------------------------------------------------------------------------------------------- +| Data Section | Offset Section | Extra | +---------------------------------------------------------------------------------------------------- +| Entry #1 | Entry #2 | ... | Entry #N | Offset #1 | Offset #2 | ... | Offset #N | num_of_elements | +---------------------------------------------------------------------------------------------------- +*/ pub struct BlockBuilder { /// Offsets of each key-value entries. offsets: Vec, @@ -17,25 +23,83 @@ pub struct BlockBuilder { first_key: KeyVec, } +fn compute_redundant_key(first_key: KeySlice, cur_key: KeySlice) -> usize { + let mut i = 0; + while i < first_key.key_len() && i < cur_key.key_len() { + if first_key.key_ref()[i] == cur_key.key_ref()[i] { + i += 1; + } else { + break; + } + } + i +} + impl BlockBuilder { /// Creates a new block builder. pub fn new(block_size: usize) -> Self { - unimplemented!() + Self { + offsets: Vec::new(), + data: Vec::new(), + block_size, + first_key: KeyVec::new(), + } } + fn estimated_size(&self) -> usize { + SIZEOF_U16 /* number of key-value pairs in the block */ + self.offsets.len() * SIZEOF_U16 /* offsets */ + self.data.len() + // key-value pairs + } + + /* Each entry is a key-value pair. + ----------------------------------------------------------------------- + | Entry #1 | ... | + ----------------------------------------------------------------------- + | redundant_key(first_key_index) | key_len (2B) | key (keylen) | timestamp (u64) | value_len (2B) | value (varlen) | ... | + ----------------------------------------------------------------------- + */ /// Adds a key-value pair to the block. Returns false when the block is full. 
#[must_use] pub fn add(&mut self, key: KeySlice, value: &[u8]) -> bool { - unimplemented!() + if self.estimated_size() + SIZEOF_U16 /* num of elements */ + + key.raw_len() + value.len() + SIZEOF_U16 /* key len */ + SIZEOF_U16 /* value len */ + > self.block_size + && !self.is_empty() + { + return false; + } + // Add the offset of the data into the offset array. + self.offsets.push(self.data.len() as u16); + // compute redundant key + // when is the first key, the redundant key is 0 + let redundant_index = compute_redundant_key(self.first_key.as_key_slice(), key); + self.data.put_u16(redundant_index as u16); + // rest of the key + self.data.put_u16((key.key_len() - redundant_index) as u16); + self.data.put(&key.key_ref()[redundant_index..]); + // timestamp + self.data.put_u64(key.ts()); + // value + self.data.put_u16(value.len() as u16); + self.data.put(value); + + if self.first_key.is_empty() { + self.first_key = key.to_key_vec(); + } + + true } /// Check if there is no key-value pair in the block. pub fn is_empty(&self) -> bool { - unimplemented!() + self.data.is_empty() } /// Finalize the block. pub fn build(self) -> Block { - unimplemented!() + Block { + data: self.data, + offsets: self.offsets, + } } } diff --git a/mini-lsm-starter/src/block/iterator.rs b/mini-lsm-starter/src/block/iterator.rs index f823b5ad2..20acd8fc1 100644 --- a/mini-lsm-starter/src/block/iterator.rs +++ b/mini-lsm-starter/src/block/iterator.rs @@ -1,11 +1,10 @@ -#![allow(unused_variables)] // TODO(you): remove this lint after implementing this mod -#![allow(dead_code)] // TODO(you): remove this lint after implementing this mod - use std::sync::Arc; +use bytes::Buf; + use crate::key::{KeySlice, KeyVec}; -use super::Block; +use super::{Block, SIZEOF_U16}; /// Iterates on a block. 
pub struct BlockIterator { @@ -24,54 +23,111 @@ pub struct BlockIterator { impl BlockIterator { fn new(block: Arc) -> Self { Self { + first_key: block.get_first_key(), block, key: KeyVec::new(), value_range: (0, 0), idx: 0, - first_key: KeyVec::new(), } } /// Creates a block iterator and seek to the first entry. pub fn create_and_seek_to_first(block: Arc) -> Self { - unimplemented!() + let mut iter = BlockIterator::new(block); + iter.seek_to_first(); + iter } /// Creates a block iterator and seek to the first key that >= `key`. pub fn create_and_seek_to_key(block: Arc, key: KeySlice) -> Self { - unimplemented!() + let mut iter = BlockIterator::new(block); + iter.seek_to_key(key); + iter } /// Returns the key of the current entry. pub fn key(&self) -> KeySlice { - unimplemented!() + debug_assert!(!self.key.is_empty(), "invalid iterator"); + self.key.as_key_slice() } /// Returns the value of the current entry. pub fn value(&self) -> &[u8] { - unimplemented!() + debug_assert!(!self.key.is_empty(), "invalid iterator"); + &self.block.data[self.value_range.0..self.value_range.1] } /// Returns true if the iterator is valid. - /// Note: You may want to make use of `key` pub fn is_valid(&self) -> bool { - unimplemented!() + !self.key.is_empty() } /// Seeks to the first key in the block. pub fn seek_to_first(&mut self) { - unimplemented!() + self.seek_to_idx(0); } /// Move to the next key in the block. 
pub fn next(&mut self) { - unimplemented!() + self.idx += 1; + self.seek_to_idx(self.idx); + } + + /// Seek to the specified position and update the current `key` and `value` + /// Index update will be handled by caller + pub fn seek_to_idx(&mut self, idx: usize) { + if idx >= self.block.offsets.len() { + self.key.clear(); + self.value_range = (0, 0); + return; + } + + let offset = self.block.offsets[idx] as usize; + let mut entry = &self.block.data[offset..]; + // since `get_u16()` will automatically move the ptr 2 bytes ahead here, + // we don't need to manually advance it + let redundant_len = entry.get_u16() as usize; + let key_len = entry.get_u16() as usize; + let key = &entry[..key_len]; + self.key.clear(); + self.key.append(&self.first_key.key_ref()[..redundant_len]); + self.key.append(key); + // move the entry ptr to the begin of the value + entry.advance(key_len); + // set timestamp + let ts = entry.get_u64(); + self.key.set_ts(ts); + let value_len = entry.get_u16() as usize; + let value_offset_begin = offset + + SIZEOF_U16*2 /* redundant + key_len(2B) */ + + std::mem::size_of::() /* timestamp(u64) */ + + key_len + + SIZEOF_U16 /* value_len(2B) */; + let value_offset_end = value_offset_begin + value_len; + self.value_range = (value_offset_begin, value_offset_end); + // move the entry ptr to the end of the value + entry.advance(value_len); + // set index + self.idx = idx; + // idx will be 0 if the iterator is invalid } /// Seek to the first key that >= `key`. - /// Note: You should assume the key-value pairs in the block are sorted when being added by + /// Note: we should assume the key-value pairs in the block are sorted when being added by /// callers. 
pub fn seek_to_key(&mut self, key: KeySlice) { - unimplemented!() + let mut low = 0; + let mut high = self.block.offsets.len(); + while low < high { + let mid = low + (high - low) / 2; + self.seek_to_idx(mid); + assert!(self.is_valid()); + match self.key().cmp(&key) { + std::cmp::Ordering::Less => low = mid + 1, + std::cmp::Ordering::Greater => high = mid, + std::cmp::Ordering::Equal => return, + } + } + self.seek_to_idx(low); } } diff --git a/mini-lsm-starter/src/compact.rs b/mini-lsm-starter/src/compact.rs index d52cc135f..354bad4d3 100644 --- a/mini-lsm-starter/src/compact.rs +++ b/mini-lsm-starter/src/compact.rs @@ -4,10 +4,11 @@ mod leveled; mod simple_leveled; mod tiered; +use std::collections::HashSet; use std::sync::Arc; use std::time::Duration; -use anyhow::Result; +use anyhow::{Ok, Result}; pub use leveled::{LeveledCompactionController, LeveledCompactionOptions, LeveledCompactionTask}; use serde::{Deserialize, Serialize}; pub use simple_leveled::{ @@ -15,8 +16,15 @@ pub use simple_leveled::{ }; pub use tiered::{TieredCompactionController, TieredCompactionOptions, TieredCompactionTask}; -use crate::lsm_storage::{LsmStorageInner, LsmStorageState}; -use crate::table::SsTable; +use crate::iterators::concat_iterator::SstConcatIterator; +use crate::iterators::merge_iterator::MergeIterator; +use crate::iterators::two_merge_iterator::TwoMergeIterator; +use crate::iterators::StorageIterator; +use crate::key::KeySlice; +use crate::lsm_iterator::FusedIterator; +use crate::lsm_storage::{CompactionFilter, LsmStorageInner, LsmStorageState}; +use crate::manifest::ManifestRecord; +use crate::table::{SsTable, SsTableBuilder, SsTableIterator}; #[derive(Debug, Serialize, Deserialize)] pub enum CompactionTask { @@ -67,17 +75,17 @@ impl CompactionController { &self, snapshot: &LsmStorageState, task: &CompactionTask, - output: &[usize], + new_ssts: &[usize], ) -> (LsmStorageState, Vec) { match (self, task) { (CompactionController::Leveled(ctrl), CompactionTask::Leveled(task)) 
=> { - ctrl.apply_compaction_result(snapshot, task, output) + ctrl.apply_compaction_result(snapshot, task, new_ssts) } (CompactionController::Simple(ctrl), CompactionTask::Simple(task)) => { - ctrl.apply_compaction_result(snapshot, task, output) + ctrl.apply_compaction_result(snapshot, task, new_ssts) } (CompactionController::Tiered(ctrl), CompactionTask::Tiered(task)) => { - ctrl.apply_compaction_result(snapshot, task, output) + ctrl.apply_compaction_result(snapshot, task, new_ssts) } _ => unreachable!(), } @@ -107,16 +115,315 @@ pub enum CompactionOptions { } impl LsmStorageInner { - fn compact(&self, _task: &CompactionTask) -> Result>> { - unimplemented!() + fn compact_generate_sst_from_iter( + &self, + mut iter: impl for<'a> StorageIterator = KeySlice<'a>>, + compact_to_bottom_level: bool, + ) -> Result>> { + let mut new_ssts = Vec::new(); + // compact the iterators + let mut builder = None; + let mut last_key = Vec::::new(); + // All ts (strictly) below this ts can be garbage collected. + let gc_ts = self.mvcc().watermark(); + let mut last_gc_key = Vec::::new(); + // add compaction filters, ref https://skyzh.github.io/mini-lsm/week3-07-compaction-filter.html + let compaction_filters = self.compaction_filters.lock().clone(); + 'outer: while iter.is_valid() { + if builder.is_none() { + builder = Some(SsTableBuilder::new(self.options.block_size)); + } + let builder_inner = builder.as_mut().unwrap(); + + // 1. for all versions of a key below or equal to the watermark, keep the latest version. 
+ if iter.key().ts() <= gc_ts { + let same_as_last_gc_key = iter.key().key_ref() == last_gc_key; + if !same_as_last_gc_key { + last_gc_key.clear(); + last_gc_key.extend(iter.key().key_ref()); + } + + if same_as_last_gc_key { + iter.next()?; + continue; + } + + if !compaction_filters.is_empty() { + for filter in &compaction_filters { + match filter { + CompactionFilter::Prefix(prefix) => { + if iter.key().key_ref().starts_with(prefix) { + iter.next()?; + continue 'outer; + } + } + } + } + } + } + + let same_as_last_key = iter.key().key_ref() == last_key; + // 2. gc the version of a key is empty and below the watermark + if compact_to_bottom_level && iter.key().ts() <= gc_ts && iter.value().is_empty() { + // if this key is not same as the last key, need to update last_key + if !same_as_last_key { + last_key.clear(); + last_key.extend(iter.key().key_ref()); + } + iter.next()?; + continue; + } + + // the same key with different timestamps are put in the same SST file, even if it exceeds the SST size limit + if builder_inner.estimated_size() >= self.options.target_sst_size && !same_as_last_key { + let sst_id = self.next_sst_id(); + let old_builder = builder.take().unwrap(); + let new_sst = Arc::new(old_builder.build( + sst_id, + Some(self.block_cache.clone()), + self.path_of_sst(sst_id), + )?); + new_ssts.push(new_sst); + builder = Some(SsTableBuilder::new(self.options.block_size)); + } + + // add the key-value pair to the builder + builder.as_mut().unwrap().add(iter.key(), iter.value()); + + if !same_as_last_key { + last_key.clear(); + last_key.extend(iter.key().key_ref()); + } + + iter.next()?; + } + + // put last sst if exists builder + if let Some(builder) = builder { + println!("compact_generate_sst_from_iter put last"); + let sst_id = self.next_sst_id(); // lock dropped here + let sst = Arc::new(builder.build( + sst_id, + Some(self.block_cache.clone()), + self.path_of_sst(sst_id), + )?); + new_ssts.push(sst); + } + Ok(new_ssts) + } + + fn compact(&self, task: 
&CompactionTask) -> Result>> { + let snapshot = { + let state = self.state.read(); + state.clone() + }; + + match task { + CompactionTask::ForceFullCompaction { + l0_sstables, + l1_sstables, + } => { + // create l0_sstables + let mut l0_iters = Vec::with_capacity(l0_sstables.len()); + for sst_id in l0_sstables.iter() { + let sst = snapshot.sstables.get(sst_id).unwrap(); + let iter = SsTableIterator::create_and_seek_to_first(sst.clone())?; + l0_iters.push(Box::new(iter)); + } + + // create l1_sstables + let mut l1_iters = Vec::with_capacity(l1_sstables.len()); + for sst_id in l1_sstables.iter() { + let sst = snapshot.sstables.get(sst_id).unwrap(); + l1_iters.push(sst.clone()); + } + + // merge l0_sstables and l1_sstables + let iter = FusedIterator::new(TwoMergeIterator::create( + MergeIterator::create(l0_iters), + SstConcatIterator::create_and_seek_to_first(l1_iters)?, + )?); + self.compact_generate_sst_from_iter(iter, task.compact_to_bottom_level()) + } + CompactionTask::Simple(SimpleLeveledCompactionTask { + upper_level, + upper_level_sst_ids, + lower_level_sst_ids, + .. + }) + | CompactionTask::Leveled(LeveledCompactionTask { + upper_level, + upper_level_sst_ids, + lower_level_sst_ids, + .. 
+ }) => { + match upper_level { + Some(_) => { + // create iterators for upper and lower level sstables + let mut upper_ssts = Vec::with_capacity(upper_level_sst_ids.len()); + for id in upper_level_sst_ids.iter() { + upper_ssts.push(snapshot.sstables.get(id).unwrap().clone()); + } + let upper_iter = SstConcatIterator::create_and_seek_to_first(upper_ssts)?; + let mut lower_ssts = Vec::with_capacity(upper_level_sst_ids.len()); + for id in lower_level_sst_ids.iter() { + lower_ssts.push(snapshot.sstables.get(id).unwrap().clone()); + } + let lower_iter = SstConcatIterator::create_and_seek_to_first(lower_ssts)?; + self.compact_generate_sst_from_iter( + TwoMergeIterator::create(upper_iter, lower_iter)?, + task.compact_to_bottom_level(), + ) + } + // because it is L0 compaction, we can not use concat iterator which is for ordered sstables + None => { + // create iterators for upper and lower level sstables + let mut upper_iters = Vec::with_capacity(upper_level_sst_ids.len()); + for id in upper_level_sst_ids.iter() { + let iter = SsTableIterator::create_and_seek_to_first( + snapshot.sstables.get(id).unwrap().clone(), + )?; + upper_iters.push(Box::new(iter)); + } + let upper_merge_iter = MergeIterator::create(upper_iters); + let mut lower_ssts = Vec::with_capacity(upper_level_sst_ids.len()); + for id in lower_level_sst_ids.iter() { + lower_ssts.push(snapshot.sstables.get(id).unwrap().clone()); + } + let lower_iter = SstConcatIterator::create_and_seek_to_first(lower_ssts)?; + self.compact_generate_sst_from_iter( + TwoMergeIterator::create(upper_merge_iter, lower_iter)?, + task.compact_to_bottom_level(), + ) + } + } + } + CompactionTask::Tiered(TieredCompactionTask { tiers, .. 
}) => { + let mut iters = Vec::with_capacity(tiers.len()); + for (_, tier_sst_ids) in tiers { + let mut ssts = Vec::with_capacity(tier_sst_ids.len()); + for id in tier_sst_ids.iter() { + ssts.push(snapshot.sstables[id].clone()); + } + let iter = SstConcatIterator::create_and_seek_to_first(ssts)?; + iters.push(Box::new(iter)); + } + self.compact_generate_sst_from_iter( + MergeIterator::create(iters), + task.compact_to_bottom_level(), + ) + } + } } pub fn force_full_compaction(&self) -> Result<()> { - unimplemented!() + let snapshot = { + let state = self.state.read(); + state.clone() + }; + + let l0_sstables = snapshot.l0_sstables.clone(); + let l1_sstables = snapshot.levels[0].1.clone(); + // compact the l0_sstables and l1_sstables to get compacted SSTs + println!( + "force full compaction with l0_sstables: {:?}, l1_sstables: {:?}", + l0_sstables, l1_sstables + ); + let new_ssts = self.compact(&CompactionTask::ForceFullCompaction { + l0_sstables: l0_sstables.clone(), + l1_sstables: l1_sstables.clone(), + })?; + + // update the state + let ids; + { + let _state_lock = self.state_lock.lock(); + let mut state = self.state.read().as_ref().clone(); + + // remove all participants of the compaction from the state + for sst in l0_sstables.iter().chain(l1_sstables.iter()) { + state.sstables.remove(sst); + } + + // remove old l0_sstables from the state + let mut l0_sstables_map = l0_sstables.iter().copied().collect::>(); + state.l0_sstables = state + .l0_sstables + .iter() + .filter(|x| !l0_sstables_map.remove(x)) + .copied() + .collect::>(); + assert!(l0_sstables_map.is_empty()); + + ids = new_ssts.iter().map(|x| x.sst_id()).collect::>(); + state.levels[0].1 = ids.clone(); + // insert new SSTs to sstables + for sst in new_ssts.iter() { + state.sstables.insert(sst.sst_id(), sst.clone()); + } + *self.state.write() = Arc::new(state); + }; + + for sst in l0_sstables.iter().chain(l1_sstables.iter()) { + std::fs::remove_file(self.path_of_sst(*sst))?; + } + println!("force full 
compaction done, new SSTs: {:?}", ids); + Ok(()) } fn trigger_compaction(&self) -> Result<()> { - unimplemented!() + let snapshot = { + let state = self.state.read(); + state.clone() + }; + + let task = self + .compaction_controller + .generate_compaction_task(&snapshot); + if let Some(task) = task { + let state_lock = self.state_lock.lock(); + self.dump_structure(); + println!("running compaction task: {:?}", task); + let new_ssts = self.compact(&task)?; + let new_ssts_to_add = new_ssts.iter().map(|x| x.sst_id()).collect::>(); + let mut snapshot = self.state.read().as_ref().clone(); + // insert new SSTs to sstables + for ssts_to_add in new_ssts { + let result = snapshot.sstables.insert(ssts_to_add.sst_id(), ssts_to_add); + assert!(result.is_none()); + } + let (mut new_snapshot, files_to_remove) = self + .compaction_controller + .apply_compaction_result(&snapshot, &task, &new_ssts_to_add); + // remove old SSTs from sstables + let mut ssts_to_remove = Vec::with_capacity(files_to_remove.len()); + for file_to_remove in &files_to_remove { + let result = new_snapshot.sstables.remove(file_to_remove); + assert!(result.is_some(), "cannot remove {}.sst", file_to_remove); + ssts_to_remove.push(result.unwrap()); + } + let mut state = self.state.write(); + *state = Arc::new(new_snapshot); + drop(state); + + println!( + "compaction finished: {} files removed, {} files added, ssts={:?}", + ssts_to_remove.len(), + new_ssts_to_add.len(), + new_ssts_to_add + ); + for sst in ssts_to_remove.iter() { + std::fs::remove_file(self.path_of_sst(sst.sst_id()))?; + } + + // update manifest + self.manifest.as_ref().unwrap().add_record( + &state_lock, + ManifestRecord::Compaction(task, new_ssts_to_add), + )?; + self.sync_dir()?; + } + Ok(()) } pub(crate) fn spawn_compaction_thread( @@ -145,6 +452,14 @@ impl LsmStorageInner { } fn trigger_flush(&self) -> Result<()> { + let res = { + let state = self.state.read(); + state.imm_memtables.len() >= self.options.num_memtable_limit + }; + if res { + 
self.force_flush_next_imm_memtable()?; + } + Ok(()) } diff --git a/mini-lsm-starter/src/compact/leveled.rs b/mini-lsm-starter/src/compact/leveled.rs index 50db7b024..2c49436fd 100644 --- a/mini-lsm-starter/src/compact/leveled.rs +++ b/mini-lsm-starter/src/compact/leveled.rs @@ -1,3 +1,5 @@ +use std::collections::HashSet; + use serde::{Deserialize, Serialize}; use crate::lsm_storage::LsmStorageState; @@ -29,28 +31,207 @@ impl LeveledCompactionController { Self { options } } + // contain exactly one SST in the upper level and overlapping SSTs in the lower level. fn find_overlapping_ssts( &self, - _snapshot: &LsmStorageState, - _sst_ids: &[usize], - _in_level: usize, + snapshot: &LsmStorageState, + sst_ids: &[usize], + in_level: usize, ) -> Vec { - unimplemented!() + let begin_key = sst_ids + .iter() + .map(|id| snapshot.sstables[id].first_key()) + .min() + .cloned() + .unwrap(); + let end_key = sst_ids + .iter() + .map(|id| snapshot.sstables[id].last_key()) + .max() + .cloned() + .unwrap(); + let mut overlap_ssts = Vec::new(); + for sst_id in &snapshot.levels[in_level - 1].1 { + let sst = &snapshot.sstables[sst_id]; + let first_key = sst.first_key(); + let last_key = sst.last_key(); + if !(last_key < &begin_key || first_key > &end_key) { + overlap_ssts.push(*sst_id); + } + } + overlap_ssts } + // rely on https://github.com/facebook/rocksdb/wiki/Leveled-Compaction pub fn generate_compaction_task( &self, - _snapshot: &LsmStorageState, + snapshot: &LsmStorageState, ) -> Option { - unimplemented!() + // compute target level size + // such as [0 0 30MB 300MB 3GB 30GB] when level_size_multiplier=10 and base_level_size_mb=200 + // because only keep at most one level below `base_level_size_mb` + let mut target_level_size = (0..self.options.max_levels).map(|_| 0).collect::>(); // exclude level 0 + let mut real_level_size = Vec::with_capacity(self.options.max_levels); + for i in 0..self.options.max_levels { + real_level_size.push( + snapshot.levels[i] + .1 + .iter() + .map(|x| 
snapshot.sstables.get(x).unwrap().table_size()) + .sum::() as usize, + ); + } + + // select base level and compute target level size + let mut base_level = self.options.max_levels; + let base_level_size_bytes = self.options.base_level_size_mb * 1024 * 1024; + target_level_size[self.options.max_levels - 1] = + real_level_size[self.options.max_levels - 1].max(base_level_size_bytes); + for i in (0..(self.options.max_levels - 1)).rev() { + let next_level_size = target_level_size[i + 1]; + let this_level_size = next_level_size / self.options.level_size_multiplier; + if next_level_size > base_level_size_bytes { + target_level_size[i] = this_level_size; + } + if target_level_size[i] > 0 { + base_level = i + 1; + } + } + + // directly place the SST from L0 to the lowest level possible + if snapshot.l0_sstables.len() >= self.options.level0_file_num_compaction_trigger { + println!("flush L0 SST to base level {}", base_level); + return Some(LeveledCompactionTask { + upper_level: None, + upper_level_sst_ids: snapshot.l0_sstables.clone(), + lower_level: base_level, + lower_level_sst_ids: self.find_overlapping_ssts( + snapshot, + &snapshot.l0_sstables, + base_level, + ), + is_lower_level_bottom_level: false, + }); + } + + // compute the compaction priorities of each level by `current_size / target_size` + let mut compaction_priorities = Vec::with_capacity(self.options.max_levels); + for level in 0..self.options.max_levels { + let prio = real_level_size[level] as f64 / target_level_size[level] as f64; + if prio > 1.0 { + compaction_priorities.push((prio, level + 1)); + } + } + + // select the compaction task with the highest priority + compaction_priorities.sort_by(|a, b| a.partial_cmp(b).unwrap().reverse()); + if let Some((_, level)) = compaction_priorities.first() { + println!( + "target level sizes: {:?}, real level sizes: {:?}, base_level: {}", + target_level_size + .iter() + .map(|x| format!("{}MB", x / 1024 / 1024)) + .collect::>(), + real_level_size + .iter() + .map(|x| 
format!("{}MB", x / 1024 / 1024)) + .collect::>(), + base_level, + ); + let level = *level; + // select the oldest sst to compact + let selected_sst = snapshot.levels[level - 1].1.iter().min().copied().unwrap(); + println!( + "compaction triggered by priority: {level} out of {:?}, select {selected_sst} for compaction", + compaction_priorities + ); + return Some(LeveledCompactionTask { + upper_level: Some(level), + upper_level_sst_ids: vec![selected_sst], + lower_level: level + 1, + lower_level_sst_ids: self.find_overlapping_ssts( + snapshot, + &[selected_sst], + level + 1, + ), + is_lower_level_bottom_level: level + 1 == self.options.max_levels, + }); + } + + None } pub fn apply_compaction_result( &self, - _snapshot: &LsmStorageState, - _task: &LeveledCompactionTask, - _output: &[usize], + snapshot: &LsmStorageState, + task: &LeveledCompactionTask, + new_ssts: &[usize], ) -> (LsmStorageState, Vec) { - unimplemented!() + let mut snapshot = snapshot.clone(); + let mut files_to_remove = Vec::new(); + let mut upper_level_sst_ids_set = task + .upper_level_sst_ids + .iter() + .copied() + .collect::>(); + let mut lower_level_sst_ids_set = task + .lower_level_sst_ids + .iter() + .copied() + .collect::>(); + // None is for l0 compaction + if let Some(upper_level) = task.upper_level { + // remove the compacted SSTs from the upper level + let new_upper_level_ssts = snapshot.levels[upper_level - 1] + .1 + .iter() + .filter_map(|x| { + if upper_level_sst_ids_set.remove(x) { + return None; + } + Some(*x) + }) + .collect::>(); + assert!(upper_level_sst_ids_set.is_empty()); + snapshot.levels[upper_level - 1].1 = new_upper_level_ssts; + } else { + let new_l0_ssts = snapshot + .l0_sstables + .iter() + .filter_map(|x| { + if upper_level_sst_ids_set.remove(x) { + return None; + } + Some(*x) + }) + .collect::>(); + assert!(upper_level_sst_ids_set.is_empty()); + snapshot.l0_sstables = new_l0_ssts; + } + files_to_remove.extend(&task.upper_level_sst_ids); + + let mut new_lower_level_ssts 
= snapshot.levels[task.lower_level - 1] + .1 + .iter() + .filter_map(|x| { + if lower_level_sst_ids_set.remove(x) { + return None; + } + Some(*x) + }) + .collect::>(); + assert!(lower_level_sst_ids_set.is_empty()); + files_to_remove.extend(&task.lower_level_sst_ids); + // add the new SSTs to the lower level + new_lower_level_ssts.extend(new_ssts); + // need to be sorted + new_lower_level_ssts.sort_by(|a, b| { + snapshot.sstables[a] + .first_key() + .cmp(snapshot.sstables[b].first_key()) + }); + snapshot.levels[task.lower_level - 1].1 = new_lower_level_ssts; + + (snapshot, files_to_remove) } } diff --git a/mini-lsm-starter/src/compact/simple_leveled.rs b/mini-lsm-starter/src/compact/simple_leveled.rs index 1c008ce16..88915fb82 100644 --- a/mini-lsm-starter/src/compact/simple_leveled.rs +++ b/mini-lsm-starter/src/compact/simple_leveled.rs @@ -1,3 +1,5 @@ +use std::collections::HashSet; + use serde::{Deserialize, Serialize}; use crate::lsm_storage::LsmStorageState; @@ -33,9 +35,48 @@ impl SimpleLeveledCompactionController { /// Returns `None` if no compaction needs to be scheduled. The order of SSTs in the compaction task id vector matters. 
pub fn generate_compaction_task( &self, - _snapshot: &LsmStorageState, + snapshot: &LsmStorageState, ) -> Option { - unimplemented!() + // check if the size ratio is satisfied + for i in 0..self.options.max_levels { + // trigger a compaction of L0 and L1 + if i == 0 + && snapshot.l0_sstables.len() < self.options.level0_file_num_compaction_trigger + { + continue; + } + let upper_level_size = if i == 0 { + snapshot.l0_sstables.len() + } else { + // levels start from 0 + snapshot.levels[i - 1].1.len() + }; + let lower_level_size = snapshot.levels[i].1.len(); + let size_ratio = lower_level_size as f64 / upper_level_size as f64; + if size_ratio < self.options.size_ratio_percent as f64 / 100.0 { + let message = if i == 0 { + "L0 to L1".to_string() + } else { + format!("L{} to L{}", i, i + 1) + }; + println!( + "compaction triggered at {} with size ratio {}", + message, size_ratio, + ); + return Some(SimpleLeveledCompactionTask { + upper_level: if i == 0 { None } else { Some(i - 1) }, + upper_level_sst_ids: if i == 0 { + snapshot.l0_sstables.clone() + } else { + snapshot.levels[i - 1].1.clone() + }, + lower_level: i, + lower_level_sst_ids: snapshot.levels[i].1.clone(), + is_lower_level_bottom_level: i == self.options.max_levels - 1, + }); + } + } + None } /// Apply the compaction result. @@ -47,10 +88,42 @@ impl SimpleLeveledCompactionController { /// in your implementation. 
pub fn apply_compaction_result( &self, - _snapshot: &LsmStorageState, - _task: &SimpleLeveledCompactionTask, - _output: &[usize], + snapshot: &LsmStorageState, + task: &SimpleLeveledCompactionTask, + output: &[usize], ) -> (LsmStorageState, Vec) { - unimplemented!() + let mut files_to_remove = Vec::new(); + let mut new_snapshot = snapshot.clone(); + if let Some(upper_level) = task.upper_level { + // L1+ compaction + assert_eq!( + task.upper_level_sst_ids, snapshot.levels[upper_level].1, + "sst mismatched" + ); + println!("L{} compaction output: {:?}", upper_level + 1, output); + files_to_remove.extend(&task.upper_level_sst_ids); + new_snapshot.levels[upper_level].1.clear(); + } else { + // L0 compaction + println!("L0 compaction output: {:?}", output); + files_to_remove.extend(&task.upper_level_sst_ids); + let mut l0_ssts_compacted = task + .upper_level_sst_ids + .iter() + .copied() + .collect::>(); + let new_l0_sstables = snapshot + .l0_sstables + .iter() + .copied() + .filter(|x| !l0_ssts_compacted.remove(x)) + .collect::>(); + assert!(l0_ssts_compacted.is_empty()); + new_snapshot.l0_sstables = new_l0_sstables; + } + + files_to_remove.extend(&new_snapshot.levels[task.lower_level].1); + new_snapshot.levels[task.lower_level].1 = output.to_vec(); + (new_snapshot, files_to_remove) } } diff --git a/mini-lsm-starter/src/compact/tiered.rs b/mini-lsm-starter/src/compact/tiered.rs index 25f300e15..2805a5af3 100644 --- a/mini-lsm-starter/src/compact/tiered.rs +++ b/mini-lsm-starter/src/compact/tiered.rs @@ -1,3 +1,5 @@ +use std::collections::HashMap; + use serde::{Deserialize, Serialize}; use crate::lsm_storage::LsmStorageState; @@ -25,19 +27,119 @@ impl TieredCompactionController { Self { options } } + // rely on https://github.com/facebook/rocksdb/wiki/Universal-Compaction pub fn generate_compaction_task( &self, - _snapshot: &LsmStorageState, + snapshot: &LsmStorageState, ) -> Option { - unimplemented!() + assert!( + snapshot.l0_sstables.is_empty(), + "should not add l0 
ssts in tiered compaction" + ); + if snapshot.levels.len() < self.options.num_tiers { + return None; + } + // compaction triggered by space amplification ratio + // all levels except last level size / last level size + let mut size = 0; + for id in 0..(snapshot.levels.len() - 1) { + size += snapshot.levels[id].1.len(); + } + let space_amp_ratio = + (size as f64) / (snapshot.levels.last().unwrap().1.len() as f64) * 100.0; + if space_amp_ratio >= self.options.max_size_amplification_percent as f64 { + println!( + "compaction triggered by space amplification ratio: {}", + space_amp_ratio + ); + return Some(TieredCompactionTask { + tiers: snapshot.levels.clone(), + bottom_tier_included: true, + }); + } + + // size of all previous tiers / this tier >= (1 + size_ratio) * 100% + // compaction triggered by size ratio(number of sorted runs) + let size_ratio_trigger = (100.0 + self.options.size_ratio as f64) / 100.0; + let mut size = 0; + for id in 0..(snapshot.levels.len() - 1) { + size += snapshot.levels[id].1.len(); + let cur_tier = snapshot.levels[id + 1].1.len(); + let cur_size_ratio = size as f64 / cur_tier as f64; + // compaction num need exceed `self.options.min_merge_width` + if cur_size_ratio >= size_ratio_trigger && id + 2 >= self.options.min_merge_width { + println!( + "compaction triggered by size ratio: {}", + cur_size_ratio * 100.0 + ); + return Some(TieredCompactionTask { + tiers: snapshot + .levels + .iter() + .take(id + 2) + .cloned() + .collect::>(), + bottom_tier_included: id + 2 >= snapshot.levels.len(), + }); + } + } + // trying to reduce sorted runs without respecting size ratio + // to make sure we have exactly `num_tiers` tiers + println!("compaction triggered by reducing sorted runs"); + let num_tiers_to_take = snapshot.levels.len() - self.options.num_tiers + 2; + Some(TieredCompactionTask { + tiers: snapshot + .levels + .iter() + .take(num_tiers_to_take) + .cloned() + .collect::>(), + bottom_tier_included: snapshot.levels.len() >= num_tiers_to_take, 
+ }) } pub fn apply_compaction_result( &self, - _snapshot: &LsmStorageState, - _task: &TieredCompactionTask, - _output: &[usize], + snapshot: &LsmStorageState, + task: &TieredCompactionTask, + output: &[usize], ) -> (LsmStorageState, Vec) { - unimplemented!() + assert!( + snapshot.l0_sstables.is_empty(), + "should not add l0 ssts in tiered compaction" + ); + let mut new_snapshot = snapshot.clone(); + let mut tier_to_remove = task + .tiers + .iter() + .map(|(x, y)| (*x, y)) + .collect::>(); + let mut files_to_remove = Vec::new(); + let mut levels = Vec::new(); + let mut new_tier_added = false; + for (tier_id, files) in &new_snapshot.levels { + if let Some(remove_files) = tier_to_remove.remove(tier_id) { + // the tier should be removed + assert_eq!( + remove_files, files, + "file changed after issuing compaction task" + ); + files_to_remove.extend(files.iter().cloned()); + } else { + // retain the tier + levels.push((*tier_id, files.clone())); + } + // ? + if tier_to_remove.is_empty() && !new_tier_added { + // add the compacted tier to the LSM tree + new_tier_added = true; + levels.push((output[0], output.to_vec())); + } + } + if !tier_to_remove.is_empty() { + unreachable!("some tiers not found??"); + } + new_snapshot.levels = levels; + (new_snapshot, files_to_remove) } } diff --git a/mini-lsm-starter/src/debug.rs b/mini-lsm-starter/src/debug.rs index c9eab3dd9..2b05d6c91 100644 --- a/mini-lsm-starter/src/debug.rs +++ b/mini-lsm-starter/src/debug.rs @@ -2,7 +2,15 @@ use crate::lsm_storage::{LsmStorageInner, MiniLsm}; impl LsmStorageInner { pub fn dump_structure(&self) { + println!("----------------- Dump Structure -----------------"); let snapshot = self.state.read(); + // print mem-table + if !snapshot.memtable.is_empty() { + println!("Mem-table: {:?}", snapshot.memtable); + } + if !snapshot.imm_memtables.is_empty() { + println!("Imm-mem-tables: {:?}", snapshot.imm_memtables); + } if !snapshot.l0_sstables.is_empty() { println!( "L0 ({}): {:?}", @@ -13,6 +21,7 @@ 
impl LsmStorageInner { for (level, files) in &snapshot.levels { println!("L{level} ({}): {:?}", files.len(), files); } + println!("----------------- Dump Structure finished -----------------"); } } diff --git a/mini-lsm-starter/src/iterators/concat_iterator.rs b/mini-lsm-starter/src/iterators/concat_iterator.rs index 8cef71528..c010dee55 100644 --- a/mini-lsm-starter/src/iterators/concat_iterator.rs +++ b/mini-lsm-starter/src/iterators/concat_iterator.rs @@ -3,7 +3,7 @@ use std::sync::Arc; -use anyhow::Result; +use anyhow::{Ok, Result}; use super::StorageIterator; use crate::{ @@ -19,13 +19,85 @@ pub struct SstConcatIterator { sstables: Vec>, } +// check ssts vaild +fn check_valid(sstables: &[Arc]) { + for sst in sstables { + assert!(sst.first_key() <= sst.last_key()); + } + if !sstables.is_empty() { + for i in 1..sstables.len() { + assert!(sstables[i - 1].last_key() < sstables[i].first_key()); + } + } +} + impl SstConcatIterator { pub fn create_and_seek_to_first(sstables: Vec>) -> Result { - unimplemented!() + if sstables.is_empty() { + return Ok(Self { + current: None, + next_sst_idx: 0, + sstables, + }); + } + check_valid(&sstables); + let mut iter = Self { + current: Some(SsTableIterator::create_and_seek_to_first( + sstables[0].clone(), + )?), + next_sst_idx: 1, + sstables, + }; + + iter.move_iter_until_valid()?; + Ok(iter) } pub fn create_and_seek_to_key(sstables: Vec>, key: KeySlice) -> Result { - unimplemented!() + if sstables.is_empty() { + return Ok(Self { + current: None, + next_sst_idx: 0, + sstables, + }); + } + check_valid(&sstables); + // get key from sstables + let idx: usize = sstables + .partition_point(|table| table.first_key().as_key_slice() <= key) + .saturating_sub(1); + + let mut iter = Self { + current: Some(SsTableIterator::create_and_seek_to_key( + sstables[idx].clone(), + key, + )?), + next_sst_idx: idx + 1, + sstables, + }; + iter.move_iter_until_valid()?; + Ok(iter) + } + + fn move_iter_until_valid(&mut self) -> Result<()> { + // check 
if the current iterator is valid + loop { + if let Some(current) = &self.current { + if current.is_valid() { + return Ok(()); + } + if self.next_sst_idx >= self.sstables.len() { + self.current = None; + return Ok(()); + } + self.current = Some(SsTableIterator::create_and_seek_to_first( + self.sstables[self.next_sst_idx].clone(), + )?); + self.next_sst_idx += 1; + } else { + return Ok(()); + } + } } } @@ -33,19 +105,29 @@ impl StorageIterator for SstConcatIterator { type KeyType<'a> = KeySlice<'a>; fn key(&self) -> KeySlice { - unimplemented!() + self.current.as_ref().unwrap().key() } fn value(&self) -> &[u8] { - unimplemented!() + self.current.as_ref().unwrap().value() } fn is_valid(&self) -> bool { - unimplemented!() + if let Some(current) = &self.current { + assert!(current.is_valid()); + true + } else { + false + } } fn next(&mut self) -> Result<()> { - unimplemented!() + if self.current.is_none() { + return Ok(()); + } + self.current.as_mut().unwrap().next()?; + self.move_iter_until_valid()?; + Ok(()) } fn num_active_iterators(&self) -> usize { diff --git a/mini-lsm-starter/src/iterators/merge_iterator.rs b/mini-lsm-starter/src/iterators/merge_iterator.rs index a3e911ddb..8496e0389 100644 --- a/mini-lsm-starter/src/iterators/merge_iterator.rs +++ b/mini-lsm-starter/src/iterators/merge_iterator.rs @@ -1,7 +1,5 @@ -#![allow(unused_variables)] // TODO(you): remove this lint after implementing this mod -#![allow(dead_code)] // TODO(you): remove this lint after implementing this mod - use std::cmp::{self}; +use std::collections::binary_heap::PeekMut; use std::collections::BinaryHeap; use anyhow::Result; @@ -39,15 +37,24 @@ impl Ord for HeapWrapper { } /// Merge multiple iterators of the same type. If the same key occurs multiple times in some -/// iterators, perfer the one with smaller index. +/// iterators, prefer the one with smaller index. 
pub struct MergeIterator { iters: BinaryHeap>, - current: HeapWrapper, + current: Option>, } impl MergeIterator { pub fn create(iters: Vec>) -> Self { - unimplemented!() + let mut iters = iters + .into_iter() + .enumerate() + .filter(|(_, iter)| iter.is_valid()) + .map(|(i, iter)| HeapWrapper(i, iter)) + .collect::>(); + Self { + current: iters.pop(), + iters, + } } } @@ -57,18 +64,72 @@ impl StorageIterator = KeySlice<'a>>> StorageIt type KeyType<'a> = KeySlice<'a>; fn key(&self) -> KeySlice { - unimplemented!() + self.current.as_ref().unwrap().1.key() } fn value(&self) -> &[u8] { - unimplemented!() + self.current.as_ref().unwrap().1.value() } fn is_valid(&self) -> bool { - unimplemented!() + // check if current iterator is valid including when current iterator is None + self.current + .as_ref() + .map(|x| x.1.is_valid()) + .unwrap_or(false) } fn next(&mut self) -> Result<()> { - unimplemented!() + if self.current.is_none() { + return Ok(()); + } + let current = self.current.as_mut().unwrap(); + // merge same key + while let Some(mut inner_iter) = self.iters.peek_mut() { + if current.1.key() == inner_iter.1.key() { + // check next error + if let Err(e) = inner_iter.1.next() { + PeekMut::pop(inner_iter); + return Err(e); + } + + // need to remove the iterator if it is invalid + if !inner_iter.1.is_valid() { + PeekMut::pop(inner_iter); + } + } else { + break; + } + } + + current.1.next()?; + // if current iterator is invalid, replace with next iterator + if !current.1.is_valid() { + if let Some(next_iter) = self.iters.pop() { + *current = next_iter; + } + return Ok(()); + } + + if let Some(mut inner_iter) = self.iters.peek_mut() { + // TODO: check value is reverse because heap is max heap + if *current < *inner_iter { + std::mem::swap(&mut *current, &mut *inner_iter); + } + } + + Ok(()) + } + + fn num_active_iterators(&self) -> usize { + self.iters + .iter() + .map(|x| x.1.num_active_iterators()) + .sum::() + + self + .current + .as_ref() + .map(|x| 
x.1.num_active_iterators()) + .unwrap_or(0) } } diff --git a/mini-lsm-starter/src/iterators/two_merge_iterator.rs b/mini-lsm-starter/src/iterators/two_merge_iterator.rs index bb7b4a8a8..77cb3b5be 100644 --- a/mini-lsm-starter/src/iterators/two_merge_iterator.rs +++ b/mini-lsm-starter/src/iterators/two_merge_iterator.rs @@ -1,7 +1,4 @@ -#![allow(unused_variables)] // TODO(you): remove this lint after implementing this mod -#![allow(dead_code)] // TODO(you): remove this lint after implementing this mod - -use anyhow::Result; +use anyhow::{Ok, Result}; use super::StorageIterator; @@ -10,7 +7,7 @@ use super::StorageIterator; pub struct TwoMergeIterator { a: A, b: B, - // Add fields as need + choose_a: bool, } impl< @@ -19,7 +16,31 @@ impl< > TwoMergeIterator { pub fn create(a: A, b: B) -> Result { - unimplemented!() + let mut iter = Self { + choose_a: false, + a, + b, + }; + iter.skip_b()?; + iter.choose_a = Self::choose_a(&iter.a, &iter.b); + Ok(iter) + } + + fn skip_b(&mut self) -> Result<()> { + if self.a.is_valid() && self.b.is_valid() && self.b.key() == self.a.key() { + self.b.next()?; + } + Ok(()) + } + + fn choose_a(a: &A, b: &B) -> bool { + if !a.is_valid() { + return false; + } + if !b.is_valid() { + return true; + } + a.key() < b.key() } } @@ -31,18 +52,48 @@ impl< type KeyType<'a> = A::KeyType<'a>; fn key(&self) -> Self::KeyType<'_> { - unimplemented!() + if self.choose_a { + self.a.key() + } else { + self.b.key() + } } fn value(&self) -> &[u8] { - unimplemented!() + if self.choose_a { + self.a.value() + } else { + self.b.value() + } } fn is_valid(&self) -> bool { - unimplemented!() + if !self.a.is_valid() && !self.b.is_valid() { + false + } else if self.choose_a { + self.a.is_valid() + } else { + self.b.is_valid() + } } fn next(&mut self) -> Result<()> { - unimplemented!() + // skip same key for b + if self.choose_a { + self.a.next()?; + } else { + self.b.next()?; + } + + if self.a.is_valid() && self.b.is_valid() && self.a.key() == self.b.key() { + 
self.b.next()?; + } + self.choose_a = Self::choose_a(&self.a, &self.b); + + Ok(()) + } + + fn num_active_iterators(&self) -> usize { + self.a.num_active_iterators() + self.b.num_active_iterators() } } diff --git a/mini-lsm-starter/src/key.rs b/mini-lsm-starter/src/key.rs index f459ee745..b383282ce 100644 --- a/mini-lsm-starter/src/key.rs +++ b/mini-lsm-starter/src/key.rs @@ -1,41 +1,54 @@ -use std::fmt::Debug; +use std::{cmp::Reverse, fmt::Debug}; use bytes::Bytes; -pub const TS_ENABLED: bool = false; - -pub struct Key>(T); +pub struct Key>(T, u64); pub type KeySlice<'a> = Key<&'a [u8]>; pub type KeyVec = Key>; pub type KeyBytes = Key; +/// For testing purpose, should not use anywhere in your implementation. +pub const TS_ENABLED: bool = true; + +/// Temporary, should remove after implementing full week 3 day 1 + 2. +pub const TS_DEFAULT: u64 = 0; + +pub const TS_MAX: u64 = std::u64::MAX; +pub const TS_MIN: u64 = std::u64::MIN; +pub const TS_RANGE_BEGIN: u64 = std::u64::MAX; +pub const TS_RANGE_END: u64 = std::u64::MIN; + impl> Key { pub fn into_inner(self) -> T { self.0 } - pub fn len(&self) -> usize { + pub fn key_len(&self) -> usize { self.0.as_ref().len() } + pub fn raw_len(&self) -> usize { + self.0.as_ref().len() + std::mem::size_of::() + } + pub fn is_empty(&self) -> bool { self.0.as_ref().is_empty() } pub fn for_testing_ts(self) -> u64 { - 0 + self.1 } } impl Key> { pub fn new() -> Self { - Self(Vec::new()) + Self(Vec::new(), TS_DEFAULT) } - /// Create a `KeyVec` from a `Vec`. Will be removed in week 3. - pub fn from_vec(key: Vec) -> Self { - Self(key) + /// Create a `KeyVec` from a `Vec` and a ts. Will be removed in week 3. + pub fn from_vec_with_ts(key: Vec, ts: u64) -> Self { + Self(key, ts) } /// Clears the key and set ts to 0. @@ -48,51 +61,66 @@ impl Key> { self.0.extend(data) } - /// Set the key from a slice without re-allocating. The signature will change in week 3. 
+ pub fn set_ts(&mut self, ts: u64) { + self.1 = ts; + } + + /// Set the key from a slice without re-allocating. pub fn set_from_slice(&mut self, key_slice: KeySlice) { self.0.clear(); self.0.extend(key_slice.0); + self.1 = key_slice.1; } pub fn as_key_slice(&self) -> KeySlice { - Key(self.0.as_slice()) + Key(self.0.as_slice(), self.1) } pub fn into_key_bytes(self) -> KeyBytes { - Key(self.0.into()) + Key(self.0.into(), self.1) } - /// Always use `raw_ref` to access the key in week 1 + 2. This function will be removed in week 3. - pub fn raw_ref(&self) -> &[u8] { + pub fn key_ref(&self) -> &[u8] { self.0.as_ref() } + pub fn ts(&self) -> u64 { + self.1 + } + pub fn for_testing_key_ref(&self) -> &[u8] { self.0.as_ref() } pub fn for_testing_from_vec_no_ts(key: Vec) -> Self { - Self(key) + Self(key, TS_DEFAULT) } } impl Key { + pub fn new() -> Self { + Self(Bytes::new(), TS_DEFAULT) + } + pub fn as_key_slice(&self) -> KeySlice { - Key(&self.0) + Key(&self.0, self.1) } - /// Create a `KeyBytes` from a `Bytes`. Will be removed in week 3. - pub fn from_bytes(bytes: Bytes) -> KeyBytes { - Key(bytes) + /// Create a `KeyBytes` from a `Bytes` and a ts. + pub fn from_bytes_with_ts(bytes: Bytes, ts: u64) -> KeyBytes { + Key(bytes, ts) } - /// Always use `raw_ref` to access the key in week 1 + 2. This function will be removed in week 3. - pub fn raw_ref(&self) -> &[u8] { + pub fn key_ref(&self) -> &[u8] { self.0.as_ref() } + pub fn ts(&self) -> u64 { + self.1 + } + pub fn for_testing_from_bytes_no_ts(bytes: Bytes) -> KeyBytes { - Key(bytes) + Key(bytes, TS_DEFAULT) } pub fn for_testing_key_ref(&self) -> &[u8] { @@ -102,29 +130,32 @@ impl Key { impl<'a> Key<&'a [u8]> { pub fn to_key_vec(self) -> KeyVec { - Key(self.0.to_vec()) + Key(self.0.to_vec(), self.1) } /// Create a key slice from a slice. Will be removed in week 3. 
- pub fn from_slice(slice: &'a [u8]) -> Self { - Self(slice) + pub fn from_slice(slice: &'a [u8], ts: u64) -> Self { + Self(slice, ts) } - /// Always use `raw_ref` to access the key in week 1 + 2. This function will be removed in week 3. - pub fn raw_ref(self) -> &'a [u8] { + pub fn key_ref(self) -> &'a [u8] { self.0 } + pub fn ts(&self) -> u64 { + self.1 + } + pub fn for_testing_key_ref(self) -> &'a [u8] { self.0 } pub fn for_testing_from_slice_no_ts(slice: &'a [u8]) -> Self { - Self(slice) + Self(slice, TS_DEFAULT) } - pub fn for_testing_from_slice_with_ts(slice: &'a [u8], _ts: u64) -> Self { - Self(slice) + pub fn for_testing_from_slice_with_ts(slice: &'a [u8], ts: u64) -> Self { + Self(slice, ts) } } @@ -136,13 +167,13 @@ impl + Debug> Debug for Key { impl + Default> Default for Key { fn default() -> Self { - Self(T::default()) + Self(T::default(), TS_DEFAULT) } } impl + PartialEq> PartialEq for Key { fn eq(&self, other: &Self) -> bool { - self.0.eq(&other.0) + (self.0.as_ref(), self.1).eq(&(other.0.as_ref(), other.1)) } } @@ -150,7 +181,7 @@ impl + Eq> Eq for Key {} impl + Clone> Clone for Key { fn clone(&self) -> Self { - Self(self.0.clone()) + Self(self.0.clone(), self.1) } } @@ -158,12 +189,12 @@ impl + Copy> Copy for Key {} impl + PartialOrd> PartialOrd for Key { fn partial_cmp(&self, other: &Self) -> Option { - self.0.partial_cmp(&other.0) + (self.0.as_ref(), Reverse(self.1)).partial_cmp(&(other.0.as_ref(), Reverse(other.1))) } } impl + Ord> Ord for Key { fn cmp(&self, other: &Self) -> std::cmp::Ordering { - self.0.cmp(&other.0) + (self.0.as_ref(), Reverse(self.1)).cmp(&(other.0.as_ref(), Reverse(other.1))) } } diff --git a/mini-lsm-starter/src/lsm_iterator.rs b/mini-lsm-starter/src/lsm_iterator.rs index 82842b2d9..843518148 100644 --- a/mini-lsm-starter/src/lsm_iterator.rs +++ b/mini-lsm-starter/src/lsm_iterator.rs @@ -1,23 +1,92 @@ -#![allow(unused_variables)] // TODO(you): remove this lint after implementing this mod -#![allow(dead_code)] // TODO(you): 
remove this lint after implementing this mod +use std::ops::Bound; -use anyhow::Result; +use anyhow::{bail, Ok, Result}; +use bytes::Bytes; use crate::{ - iterators::{merge_iterator::MergeIterator, StorageIterator}, + iterators::{ + concat_iterator::SstConcatIterator, merge_iterator::MergeIterator, + two_merge_iterator::TwoMergeIterator, StorageIterator, + }, mem_table::MemTableIterator, + table::SsTableIterator, }; /// Represents the internal type for an LSM iterator. This type will be changed across the tutorial for multiple times. -type LsmIteratorInner = MergeIterator; +/// choosing memtable firstly +type LsmIteratorInner = TwoMergeIterator< + TwoMergeIterator, MergeIterator>, + MergeIterator, +>; pub struct LsmIterator { inner: LsmIteratorInner, + end_bound: Bound, + is_valid: bool, + read_ts: u64, + prev_key: Vec, } impl LsmIterator { - pub(crate) fn new(iter: LsmIteratorInner) -> Result { - Ok(Self { inner: iter }) + pub(crate) fn new( + iter: LsmIteratorInner, + end_bound: Bound, + read_ts: u64, + ) -> Result { + let mut iter = Self { + is_valid: iter.is_valid(), + inner: iter, + end_bound, + read_ts, + prev_key: Vec::new(), + }; + iter.move_to_key()?; + Ok(iter) + } + + fn move_to_key(&mut self) -> Result<()> { + loop { + while self.is_valid() && self.key() == self.prev_key { + self.next_inner()?; + } + if !self.is_valid() { + return Ok(()); + } + self.prev_key.clear(); + self.prev_key.extend(self.inner.key().key_ref()); + // traverse all version of the same key + while self.is_valid() + && self.key() == self.prev_key + && self.inner.key().ts() > self.read_ts + { + self.next_inner()?; + } + if !self.is_valid() { + return Ok(()); + } + // if now the key is different(all ts above `read_ts``), + // need to move to the next key. 
+ if self.key() != self.prev_key { + continue; + } + if !self.value().is_empty() { + return Ok(()); + } + } + } + + fn next_inner(&mut self) -> Result<()> { + self.inner.next()?; + if !self.inner.is_valid() { + self.is_valid = false; + return Ok(()); + } + match self.end_bound.as_ref() { + Bound::Unbounded => {} + Bound::Included(key) => self.is_valid = self.inner.key().key_ref() <= key.as_ref(), + Bound::Excluded(key) => self.is_valid = self.inner.key().key_ref() < key.as_ref(), + } + Ok(()) } } @@ -25,19 +94,26 @@ impl StorageIterator for LsmIterator { type KeyType<'a> = &'a [u8]; fn is_valid(&self) -> bool { - unimplemented!() + self.is_valid } fn key(&self) -> &[u8] { - unimplemented!() + self.inner.key().key_ref() } fn value(&self) -> &[u8] { - unimplemented!() + self.inner.value() } fn next(&mut self) -> Result<()> { - unimplemented!() + self.next_inner()?; + // move to the next key + self.move_to_key()?; + Ok(()) + } + + fn num_active_iterators(&self) -> usize { + self.inner.num_active_iterators() } } @@ -46,11 +122,15 @@ impl StorageIterator for LsmIterator { /// `is_valid` should return false, and `next` should always return an error. 
pub struct FusedIterator { iter: I, + has_errored: bool, } impl FusedIterator { pub fn new(iter: I) -> Self { - Self { iter } + Self { + iter, + has_errored: false, + } } } @@ -58,18 +138,29 @@ impl StorageIterator for FusedIterator { type KeyType<'a> = I::KeyType<'a> where Self: 'a; fn is_valid(&self) -> bool { - unimplemented!() + !self.has_errored && self.iter.is_valid() } fn key(&self) -> Self::KeyType<'_> { - unimplemented!() + self.iter.key() } fn value(&self) -> &[u8] { - unimplemented!() + self.iter.value() } fn next(&mut self) -> Result<()> { - unimplemented!() + if self.has_errored { + bail!("Iterator has already returned an error") + } + if let Some(err) = self.iter.next().err() { + self.has_errored = true; + return Err(err); + } + Ok(()) + } + + fn num_active_iterators(&self) -> usize { + self.iter.num_active_iterators() } } diff --git a/mini-lsm-starter/src/lsm_storage.rs b/mini-lsm-starter/src/lsm_storage.rs index 428ef25bf..7a0a16f9b 100644 --- a/mini-lsm-starter/src/lsm_storage.rs +++ b/mini-lsm-starter/src/lsm_storage.rs @@ -1,12 +1,13 @@ #![allow(dead_code)] // REMOVE THIS LINE after fully implementing this functionality -use std::collections::HashMap; +use std::collections::{BTreeSet, HashMap}; +use std::fs::File; use std::ops::Bound; use std::path::{Path, PathBuf}; use std::sync::atomic::AtomicUsize; use std::sync::Arc; -use anyhow::Result; +use anyhow::{Context, Ok, Result}; use bytes::Bytes; use parking_lot::{Mutex, MutexGuard, RwLock}; @@ -15,11 +16,17 @@ use crate::compact::{ CompactionController, CompactionOptions, LeveledCompactionController, LeveledCompactionOptions, SimpleLeveledCompactionController, SimpleLeveledCompactionOptions, TieredCompactionController, }; +use crate::iterators::concat_iterator::SstConcatIterator; +use crate::iterators::merge_iterator::MergeIterator; +use crate::iterators::two_merge_iterator::TwoMergeIterator; +use crate::iterators::StorageIterator; +use crate::key::{KeySlice, TS_RANGE_BEGIN, TS_RANGE_END}; use 
crate::lsm_iterator::{FusedIterator, LsmIterator}; -use crate::manifest::Manifest; -use crate::mem_table::MemTable; +use crate::manifest::{Manifest, ManifestRecord}; +use crate::mem_table::{map_bound, map_key_bound_plus_ts, MemTable}; +use crate::mvcc::txn::{Transaction, TxnIterator}; use crate::mvcc::LsmMvccInner; -use crate::table::SsTable; +use crate::table::{FileObject, SsTable, SsTableBuilder, SsTableIterator}; pub type BlockCache = moka::sync::Cache<(usize, usize), Arc>; @@ -113,6 +120,11 @@ impl LsmStorageOptions { } } +#[derive(Clone, Debug)] +pub enum CompactionFilter { + Prefix(Bytes), +} + /// The storage interface of the LSM tree. pub(crate) struct LsmStorageInner { pub(crate) state: Arc>>, @@ -124,6 +136,7 @@ pub(crate) struct LsmStorageInner { pub(crate) compaction_controller: CompactionController, pub(crate) manifest: Option, pub(crate) mvcc: Option, + pub(crate) compaction_filters: Arc>>, } /// A thin wrapper for `LsmStorageInner` and the user interface for MiniLSM. @@ -148,7 +161,39 @@ impl Drop for MiniLsm { impl MiniLsm { pub fn close(&self) -> Result<()> { - unimplemented!() + self.inner.sync_dir()?; + // flush and compaction threads should be stopped + self.flush_notifier.send(()).ok(); + self.compaction_notifier.send(()).ok(); + + // close the compaction thread and flush thread + if let Some(handle) = self.flush_thread.lock().take() { + handle.join().ok(); + } + if let Some(handle) = self.compaction_thread.lock().take() { + handle.join().ok(); + } + + // sync wal + if self.inner.options.enable_wal { + self.inner.sync()?; + self.inner.sync_dir()?; + return Ok(()); + } + + // flush memtable and imm_memtables + if !self.inner.state.read().memtable.is_empty() { + self.inner + .force_freeze_memtable(&self.inner.state_lock.lock())?; + } + + while !self.inner.state.read().imm_memtables.is_empty() { + self.inner.force_flush_next_imm_memtable()?; + } + + self.inner.sync_dir()?; + + Ok(()) } /// Start the storage engine by either loading an existing 
directory or creating a new one if the directory does @@ -168,7 +213,7 @@ impl MiniLsm { })) } - pub fn new_txn(&self) -> Result<()> { + pub fn new_txn(&self) -> Result> { self.inner.new_txn() } @@ -176,6 +221,10 @@ impl MiniLsm { self.inner.write_batch(batch) } + pub fn add_compaction_filter(&self, compaction_filter: CompactionFilter) { + self.inner.add_compaction_filter(compaction_filter) + } + pub fn get(&self, key: &[u8]) -> Result> { self.inner.get(key) } @@ -192,11 +241,7 @@ impl MiniLsm { self.inner.sync() } - pub fn scan( - &self, - lower: Bound<&[u8]>, - upper: Bound<&[u8]>, - ) -> Result> { + pub fn scan(&self, lower: Bound<&[u8]>, upper: Bound<&[u8]>) -> Result { self.inner.scan(lower, upper) } @@ -223,11 +268,20 @@ impl LsmStorageInner { .fetch_add(1, std::sync::atomic::Ordering::SeqCst) } + pub(crate) fn mvcc(&self) -> &LsmMvccInner { + self.mvcc.as_ref().unwrap() + } + /// Start the storage engine by either loading an existing directory or creating a new one if the directory does /// not exist. 
pub(crate) fn open(path: impl AsRef, options: LsmStorageOptions) -> Result { let path = path.as_ref(); - let state = LsmStorageState::create(&options); + if !path.exists() { + std::fs::create_dir_all(path) + .context("[LsmStorageInner.open] failed to create DB directory")?; + } + + let block_cache = Arc::new(BlockCache::new(1 << 20)); // 4GB block cache, let compaction_controller = match &options.compaction_options { CompactionOptions::Leveled(options) => { @@ -242,43 +296,145 @@ impl LsmStorageInner { CompactionOptions::NoCompaction => CompactionController::NoCompaction, }; + let mut state = LsmStorageState::create(&options); + let manifest; + let mut next_sst_id = 1; + // recover from MANIFEST, `/MANIFEST` + let manifest_path = path.join("MANIFEST"); + let mut last_commit_ts = 0; + if !manifest_path.exists() { + if options.enable_wal { + let memtable = Arc::new(MemTable::create_with_wal( + state.memtable.id(), + Self::path_of_wal_static(path, state.memtable.id()), + )?); + state.memtable = memtable; + } + manifest = Manifest::create(&manifest_path) + .context("[LsmStorageInner.open] failed to create manifest")?; + manifest.add_record_when_init(ManifestRecord::NewMemtable(state.memtable.id()))?; + } else { + let (m, records) = Manifest::recover(manifest_path)?; + let mut memtables = BTreeSet::new(); + for record in records { + match record { + ManifestRecord::Flush(sst_id) => { + let res = memtables.remove(&sst_id); + assert!(res, "memtable not exist?"); + if compaction_controller.flush_to_l0() { + state.l0_sstables.insert(0, sst_id); + } else { + state.levels.insert(0, (sst_id, vec![sst_id])); + } + next_sst_id = next_sst_id.max(sst_id); + } + ManifestRecord::NewMemtable(x) => { + next_sst_id = next_sst_id.max(x); + memtables.insert(x); + } + ManifestRecord::Compaction(task, new_ssts) => { + let (new_state, _) = + compaction_controller.apply_compaction_result(&state, &task, &new_ssts); + state = new_state; + next_sst_id = 
next_sst_id.max(new_ssts.iter().max().copied().unwrap()); + } + } + } + + // recover SSTs + let mut sst_cnt = 0; + for &sst_id in state + .l0_sstables + .iter() + .chain(state.levels.iter().flat_map(|(_, files)| files)) + { + let sst = SsTable::open( + sst_id, + Some(block_cache.clone()), + FileObject::open(&Self::path_of_sst_static(path, sst_id)) + .context("failed to open SST")?, + )?; + // update last_commit_ts + last_commit_ts = last_commit_ts.max(sst.max_ts()); + state.sstables.insert(sst_id, Arc::new(sst)); + next_sst_id = next_sst_id.max(sst_id); + sst_cnt += 1; + } + println!("recovered {} SSTs", sst_cnt); + next_sst_id += 1; + + // recover memtables + if options.enable_wal { + let mut wal_cnt = 0; + for id in memtables { + let memtable = + MemTable::recover_from_wal(id, Self::path_of_wal_static(path, id))?; + // update last_commit_ts + let max_ts = memtable + .map + .iter() + .map(|entry| entry.key().ts()) + .max() + .unwrap_or(0); + last_commit_ts = last_commit_ts.max(max_ts); + if !memtable.is_empty() { + state.imm_memtables.insert(0, Arc::new(memtable)); + wal_cnt += 1; + } + } + println!("recovered {} memtables from WAL", wal_cnt); + state.memtable = Arc::new(MemTable::create_with_wal( + next_sst_id, + Self::path_of_wal_static(path, next_sst_id), + )?); + } else { + state.memtable = Arc::new(MemTable::create(next_sst_id)); + } + m.add_record_when_init(ManifestRecord::NewMemtable(next_sst_id))?; + + next_sst_id += 1; + manifest = m; + } + let storage = Self { state: Arc::new(RwLock::new(Arc::new(state))), state_lock: Mutex::new(()), path: path.to_path_buf(), - block_cache: Arc::new(BlockCache::new(1024)), - next_sst_id: AtomicUsize::new(1), + block_cache, + next_sst_id: AtomicUsize::new(next_sst_id), compaction_controller, - manifest: None, + manifest: Some(manifest), options: options.into(), - mvcc: None, + mvcc: Some(LsmMvccInner::new(last_commit_ts)), + compaction_filters: Arc::new(Mutex::new(Vec::new())), }; + storage.sync_dir()?; + Ok(storage) } 
pub fn sync(&self) -> Result<()> { - unimplemented!() - } - - /// Get a key from the storage. In day 7, this can be further optimized by using a bloom filter. - pub fn get(&self, _key: &[u8]) -> Result> { - unimplemented!() + self.state.read().memtable.sync_wal() } - /// Write a batch of data into the storage. Implement in week 2 day 7. - pub fn write_batch>(&self, _batch: &[WriteBatchRecord]) -> Result<()> { - unimplemented!() - } + pub fn try_freeze(&self, size: usize) -> Result<()> { + // using double check for concurrency + if size >= self.options.target_sst_size { + let state_lock = self.state_lock.lock(); + let snapshot = self.state.read(); - /// Put a key-value pair into the storage by writing into the current memtable. - pub fn put(&self, _key: &[u8], _value: &[u8]) -> Result<()> { - unimplemented!() + if snapshot.memtable.approximate_size() >= self.options.target_sst_size { + drop(snapshot); + self.force_freeze_memtable(&state_lock)?; + } + } + Ok(()) } - /// Remove a key from the storage by writing an empty value. 
- pub fn delete(&self, _key: &[u8]) -> Result<()> { - unimplemented!() + pub fn add_compaction_filter(&self, compaction_filter: CompactionFilter) { + let mut compaction_filters = self.compaction_filters.lock(); + compaction_filters.push(compaction_filter); } pub(crate) fn path_of_sst_static(path: impl AsRef, id: usize) -> PathBuf { @@ -289,6 +445,10 @@ impl LsmStorageInner { Self::path_of_sst_static(&self.path, id) } + pub(crate) fn path(&self) -> &Path { + &self.path + } + pub(crate) fn path_of_wal_static(path: impl AsRef, id: usize) -> PathBuf { path.as_ref().join(format!("{:05}.wal", id)) } @@ -298,30 +458,431 @@ impl LsmStorageInner { } pub(super) fn sync_dir(&self) -> Result<()> { - unimplemented!() + File::open(&self.path)?.sync_all()?; + Ok(()) } /// Force freeze the current memtable to an immutable memtable - pub fn force_freeze_memtable(&self, _state_lock_observer: &MutexGuard<'_, ()>) -> Result<()> { - unimplemented!() + pub fn force_freeze_memtable(&self, state_lock_observer: &MutexGuard<'_, ()>) -> Result<()> { + let new_memtable_id = self.next_sst_id(); + let new_memtable = if self.options.enable_wal { + Arc::new(MemTable::create_with_wal( + new_memtable_id, + self.path_of_wal(new_memtable_id), + )?) 
+ } else { + Arc::new(MemTable::create(new_memtable_id)) + }; + + { + let mut guard = self.state.write(); + let mut snapshot = guard.as_ref().clone(); + let old_memtable = std::mem::replace(&mut snapshot.memtable, new_memtable); + // imm_memtables.first() should be the last frozen memtable + snapshot.imm_memtables.insert(0, old_memtable.clone()); + *guard = Arc::new(snapshot); + drop(guard); + old_memtable.sync_wal()?; + } + + // add NewMemtable record to manifest + self.manifest.as_ref().unwrap().add_record( + state_lock_observer, + ManifestRecord::NewMemtable(new_memtable_id), + )?; + Ok(()) } /// Force flush the earliest-created immutable memtable to disk pub fn force_flush_next_imm_memtable(&self) -> Result<()> { - unimplemented!() - } + let state_lock = self.state_lock.lock(); + + let last_imm_memtable; + { + let snapshot = self.state.read(); + last_imm_memtable = snapshot + .imm_memtables + .last() + .expect("no imm memtables!") + .clone(); + } + + // build new sstable + let mut builder = SsTableBuilder::new(self.options.block_size); + last_imm_memtable.flush(&mut builder)?; + let sst_id = last_imm_memtable.id(); + let new_sst = Arc::new(builder.build( + sst_id, + Some(self.block_cache.clone()), + self.path_of_sst(sst_id), + )?); + + // Add the new SST to the storage + { + let mut guard = self.state.write(); + let mut snapshot = guard.as_ref().clone(); + let sst_id = snapshot.imm_memtables.pop().unwrap().id(); + println!("flushed {}.sst with size={}", sst_id, new_sst.table_size()); + snapshot.sstables.insert(sst_id, new_sst); + if self.compaction_controller.flush_to_l0() { + // In leveled compaction or no compaction, simply flush to L0 + // L0 SSTs are sorted by creation time, from latest to earliest. 
+ snapshot.l0_sstables.insert(0, sst_id); + } else { + // In tiered compaction, create a new tier + snapshot.levels.insert(0, (sst_id, vec![sst_id])); + } + *guard = Arc::new(snapshot); + } + + if self.options.enable_wal { + std::fs::remove_file(self.path_of_wal(sst_id))?; + } + + // update manifest + self.manifest + .as_ref() + .unwrap() + .add_record(&state_lock, ManifestRecord::Flush(sst_id))?; - pub fn new_txn(&self) -> Result<()> { - // no-op Ok(()) } + pub fn new_txn(self: &Arc) -> Result> { + Ok(self.mvcc().new_txn(self.clone(), self.options.serializable)) + } + + /// Get a key from the storage. In week1 day7, this can be further optimized by using a bloom filter. + pub fn get(self: &Arc, key: &[u8]) -> Result> { + let txn = self.mvcc().new_txn(self.clone(), self.options.serializable); + txn.get(key) + } + + pub fn get_with_txn(&self, key: &[u8], read_ts: u64) -> Result> { + let snapshot = self.state.read(); + // search memtable firstly + let mut memtable_iters = Vec::with_capacity(snapshot.imm_memtables.len() + 1); + memtable_iters.push(Box::new(snapshot.memtable.scan( + Bound::Included(KeySlice::from_slice(key, TS_RANGE_BEGIN)), + Bound::Included(KeySlice::from_slice(key, TS_RANGE_END)), + ))); // memtable scan range is [key, key] + + // traverse imm-memtable + for imm_memtable in snapshot.imm_memtables.iter() { + memtable_iters.push(Box::new(imm_memtable.scan( + Bound::Included(KeySlice::from_slice(key, TS_RANGE_BEGIN)), + Bound::Included(KeySlice::from_slice(key, TS_RANGE_END)), + ))); + } + // using merge iterator to merge all memtables and imm_memtables iters + let memtable_iter = MergeIterator::create(memtable_iters); + + let check_sst = |key: &[u8], sst: &SsTable| { + // check if the key is within the SST's key range + if key_within(key, sst.first_key().key_ref(), sst.last_key().key_ref()) { + // bloom filter check + if let Some(bloom) = &sst.bloom { + if bloom.may_contain(farmhash::fingerprint32(key)) { + return true; + } + } else { + return true; + 
} + } + false + }; + + // create merge iterator for l0_sstables + let mut l0_iters = Vec::with_capacity(snapshot.l0_sstables.len()); + for sst_id in snapshot.l0_sstables.iter() { + let sst = snapshot.sstables[sst_id].clone(); + if check_sst(key, &sst) { + let iter = SsTableIterator::create_and_seek_to_key( + sst.clone(), + KeySlice::from_slice(key, TS_RANGE_BEGIN), + )?; + l0_iters.push(Box::new(iter)); + } + } + let memtable_and_l0_iter = + TwoMergeIterator::create(memtable_iter, MergeIterator::create(l0_iters))?; + + // create merge iterator for multi-level sstables + let mut level_iters = Vec::with_capacity(snapshot.levels.len()); + for (_, level_sst_ids) in &snapshot.levels { + let mut ssts = Vec::with_capacity(level_sst_ids.len()); + for sst_id in level_sst_ids { + let sst = snapshot.sstables[sst_id].clone(); + if check_sst(key, &sst) { + ssts.push(sst.clone()); + } + } + let level_iter = SstConcatIterator::create_and_seek_to_key( + ssts, + KeySlice::from_slice(key, TS_RANGE_BEGIN), + )?; + level_iters.push(Box::new(level_iter)); + } + let merge_level_iter = MergeIterator::create(level_iters); + + let iter = LsmIterator::new( + TwoMergeIterator::create(memtable_and_l0_iter, merge_level_iter)?, + Bound::Unbounded, + read_ts, + )?; + if iter.is_valid() && iter.key() == key && !iter.value().is_empty() { + return Ok(Some(Bytes::copy_from_slice(iter.value()))); + } + + Ok(None) + } + /// Create an iterator over a range of keys. 
- pub fn scan( + pub fn scan<'a>( + self: &'a Arc, + lower: Bound<&[u8]>, + upper: Bound<&[u8]>, + ) -> Result { + let txn = self.mvcc().new_txn(self.clone(), self.options.serializable); + txn.scan(lower, upper) + } + + pub fn scan_with_txn( &self, - _lower: Bound<&[u8]>, - _upper: Bound<&[u8]>, + lower: Bound<&[u8]>, + upper: Bound<&[u8]>, + read_ts: u64, ) -> Result> { - unimplemented!() + let snapshot = { + let guard = self.state.read(); + Arc::clone(&guard) + }; // drop global lock here + + // need to get all memtables and imm_memtables + let mut memtable_iters = Vec::with_capacity(snapshot.imm_memtables.len() + 1); + memtable_iters.push(Box::new(snapshot.memtable.scan( + map_key_bound_plus_ts(lower, TS_RANGE_BEGIN), + map_key_bound_plus_ts(upper, TS_RANGE_END), + ))); + + for imm_memtable in snapshot.imm_memtables.iter() { + memtable_iters.push(Box::new(imm_memtable.scan( + map_key_bound_plus_ts(lower, TS_RANGE_BEGIN), + map_key_bound_plus_ts(upper, TS_RANGE_END), + ))); + } + // using merge iterator to merge all memtables and imm_memtables iters + let memtable_iter = MergeIterator::create(memtable_iters); + + // using merge iterator to merge all sstables iters + let mut l0_iters = Vec::with_capacity(snapshot.l0_sstables.len()); + for sst_id in snapshot.l0_sstables.iter() { + let sst = snapshot.sstables[sst_id].clone(); + println!("sst_id: {}, scan range: {:?} {:?}", sst_id, lower, upper); + // filter out some SSTs that do not contain the key range + if check_intersect_of_range( + lower, + upper, + sst.first_key().as_key_slice(), + sst.last_key().as_key_slice(), + ) { + // SST iterator does not support passing an end bound to it. 
+ // Therefore, need to handle the end_bound manually in LsmIterator + let iter = match lower { + Bound::Included(key) => SsTableIterator::create_and_seek_to_key( + sst, + KeySlice::from_slice(key, TS_RANGE_BEGIN), + )?, + Bound::Excluded(key) => { + let mut iter = SsTableIterator::create_and_seek_to_key( + sst, + KeySlice::from_slice(key, TS_RANGE_BEGIN), + )?; + while iter.is_valid() && iter.key().key_ref() == key { + iter.next()?; + } + iter + } + Bound::Unbounded => SsTableIterator::create_and_seek_to_first(sst)?, + }; + + l0_iters.push(Box::new(iter)); + } + } + // memtables and imm_memtables are merged first, then the result is merged with L0 SSTs + let memtable_and_l0_iter = + TwoMergeIterator::create(memtable_iter, MergeIterator::create(l0_iters))?; + + // concat multi-level sstables + let mut sst_concat_iters = Vec::with_capacity(snapshot.levels.len()); + for (level, level_sst_ids) in &snapshot.levels { + println!("level: {}", level); + let mut ssts = Vec::with_capacity(level_sst_ids.len()); + for sst_id in level_sst_ids.iter() { + let sst = snapshot.sstables[sst_id].clone(); + if check_intersect_of_range( + lower, + upper, + sst.first_key().as_key_slice(), + sst.last_key().as_key_slice(), + ) { + ssts.push(sst); + } + } + let concat_iter = match lower { + Bound::Included(key) => SstConcatIterator::create_and_seek_to_key( + ssts, + KeySlice::from_slice(key, TS_RANGE_BEGIN), + )?, + Bound::Excluded(key) => { + let mut iter = SstConcatIterator::create_and_seek_to_key( + ssts, + KeySlice::from_slice(key, TS_RANGE_BEGIN), + )?; + while iter.is_valid() && iter.key().key_ref() == key { + iter.next()?; + } + iter + } + Bound::Unbounded => SstConcatIterator::create_and_seek_to_first(ssts)?, + }; + sst_concat_iters.push(Box::new(concat_iter)); + } + let merge_sst_iter = MergeIterator::create(sst_concat_iters); + + // finally, the result is merged with L1 SSTs + let two_merge_iter = TwoMergeIterator::create(memtable_and_l0_iter, merge_sst_iter)?; + + 
Ok(FusedIterator::new(LsmIterator::new( + two_merge_iter, + map_bound(upper), + read_ts, + )?)) } + + /// Write a batch of data into the storage. Implement in week 2 day 7. + pub fn write_batch>( + self: &Arc, + batch: &[WriteBatchRecord], + ) -> Result<()> { + if self.options.serializable { + let txn = self.mvcc().new_txn(self.clone(), true); + for record in batch { + match record { + WriteBatchRecord::Put(key, value) => { + txn.put(key.as_ref(), value.as_ref()); + } + WriteBatchRecord::Del(key) => { + txn.delete(key.as_ref()); + } + } + } + txn.commit()?; + } else { + self.write_batch_inner(batch)?; + } + Ok(()) + } + + pub fn write_batch_inner>(&self, batch: &[WriteBatchRecord]) -> Result { + let _lck = self.mvcc().write_lock.lock(); + let ts = self.mvcc().latest_commit_ts() + 1; + for record in batch { + match record { + WriteBatchRecord::Put(key, value) => { + let size; + { + let key = key.as_ref(); + let value = value.as_ref(); + assert!(!key.is_empty(), "key cannot be empty"); + assert!(!value.is_empty(), "value cannot be empty"); + let snapshot = self.state.read(); + snapshot + .memtable + .put(KeySlice::from_slice(key, ts), value)?; + size = snapshot.memtable.approximate_size(); + } + + self.try_freeze(size)?; + } + WriteBatchRecord::Del(key) => { + let size; + { + let key = key.as_ref(); + let snapshot = self.state.read(); + snapshot.memtable.put(KeySlice::from_slice(key, ts), &[])?; + size = snapshot.memtable.approximate_size(); + } + + self.try_freeze(size)?; + } + } + } + self.mvcc().update_commit_ts(ts); + Ok(ts) + } + + /// Put a key-value pair into the storage by writing into the current memtable. + pub fn put(self: &Arc, key: &[u8], value: &[u8]) -> Result<()> { + if self.options.serializable { + let txn = self.mvcc().new_txn(self.clone(), true); + txn.put(key, value); + txn.commit()?; + } else { + self.write_batch_inner(&[WriteBatchRecord::Put(key, value)])?; + } + Ok(()) + } + + /// Remove a key from the storage by writing an empty value. 
+ pub fn delete(self: &Arc, key: &[u8]) -> Result<()> { + if self.options.serializable { + let txn = self.mvcc().new_txn(self.clone(), true); + txn.delete(key); + txn.commit()?; + } else { + self.write_batch_inner(&[WriteBatchRecord::Del(key)])?; + } + Ok(()) + } +} + +// utils + +// key_within checks if the user's key is within the SST's key range. +fn key_within(key: &[u8], sst_begin: &[u8], sst_end: &[u8]) -> bool { + sst_begin <= key && key <= sst_end +} + +// Check if user's `key_range` intersects with the SST's key range. +fn check_intersect_of_range( + begin: Bound<&[u8]>, + end: Bound<&[u8]>, + sst_begin: KeySlice, + sst_end: KeySlice, +) -> bool { + println!( + "intersected: {:?} {:?}, sst: {:?} {:?}", + begin, end, sst_begin, sst_end + ); + match end { + Bound::Excluded(key) if key <= sst_begin.key_ref() => { + return false; + } + Bound::Included(key) if key < sst_begin.key_ref() => { + return false; + } + _ => {} + } + match begin { + Bound::Excluded(key) if sst_end.key_ref() <= key => { + return false; + } + Bound::Included(key) if sst_end.key_ref() < key => { + return false; + } + _ => {} + } + + true } diff --git a/mini-lsm-starter/src/manifest.rs b/mini-lsm-starter/src/manifest.rs index e9b005926..891fae727 100644 --- a/mini-lsm-starter/src/manifest.rs +++ b/mini-lsm-starter/src/manifest.rs @@ -1,15 +1,17 @@ -#![allow(dead_code)] // REMOVE THIS LINE after fully implementing this functionality - -use std::fs::File; +use std::fs::OpenOptions; +use std::io::Read; use std::path::Path; use std::sync::Arc; +use std::{fs::File, io::Write}; -use anyhow::Result; +use anyhow::{Context, Result}; +use bytes::{Buf, BufMut}; use parking_lot::{Mutex, MutexGuard}; use serde::{Deserialize, Serialize}; use crate::compact::CompactionTask; +// | len | JSON record | checksum | len | JSON record | checksum | len | JSON record | checksum | pub struct Manifest { file: Arc>, } @@ -22,12 +24,45 @@ pub enum ManifestRecord { } impl Manifest { - pub fn create(_path: impl AsRef) 
-> Result { - unimplemented!() + pub fn create(path: impl AsRef) -> Result { + Ok(Self { + file: Arc::new(Mutex::new( + OpenOptions::new() + .read(true) + .create_new(true) + .write(true) + .open(path) + .context("[manifest.create] failed to create manifest")?, + )), + }) } - pub fn recover(_path: impl AsRef) -> Result<(Self, Vec)> { - unimplemented!() + pub fn recover(path: impl AsRef) -> Result<(Self, Vec)> { + let mut file = OpenOptions::new() + .read(true) + .append(true) + .open(path) + .context("failed to recover manifest")?; + let mut buf = Vec::new(); + file.read_to_end(&mut buf)?; + // need to check all records checksum + let mut stream = buf.as_slice(); + let mut records = Vec::new(); + while stream.has_remaining() { + let len = stream.get_u64() as usize; + let record = stream.copy_to_bytes(len); + let checksum = stream.get_u32(); + if crc32fast::hash(&record) != checksum { + return Err(anyhow::anyhow!("Manifest record checksum mismatch")); + } + records.push(serde_json::from_slice(&record)?); + } + Ok(( + Self { + file: Arc::new(Mutex::new(file)), + }, + records, + )) } pub fn add_record( @@ -38,7 +73,15 @@ impl Manifest { self.add_record_when_init(record) } - pub fn add_record_when_init(&self, _record: ManifestRecord) -> Result<()> { - unimplemented!() + pub fn add_record_when_init(&self, record: ManifestRecord) -> Result<()> { + let mut file = self.file.lock(); + let mut buf = serde_json::to_vec(&record)?; + file.write_all(&(buf.len() as u64).to_be_bytes())?; + // add checksum + let checksum = crc32fast::hash(&buf); + buf.put_u32(checksum); + file.write_all(&buf)?; + file.sync_all()?; + Ok(()) } } diff --git a/mini-lsm-starter/src/mem_table.rs b/mini-lsm-starter/src/mem_table.rs index 57c961482..6ba0f2d8b 100644 --- a/mini-lsm-starter/src/mem_table.rs +++ b/mini-lsm-starter/src/mem_table.rs @@ -7,11 +7,11 @@ use std::sync::Arc; use anyhow::Result; use bytes::Bytes; -use crossbeam_skiplist::SkipMap; +use crossbeam_skiplist::{map::Entry, SkipMap}; use 
ouroboros::self_referencing; use crate::iterators::StorageIterator; -use crate::key::KeySlice; +use crate::key::{KeyBytes, KeySlice, TS_DEFAULT}; use crate::table::SsTableBuilder; use crate::wal::Wal; @@ -19,8 +19,9 @@ use crate::wal::Wal; /// /// An initial implementation of memtable is part of week 1, day 1. It will be incrementally implemented in other /// chapters of week 1 and week 2. +#[derive(Debug)] pub struct MemTable { - map: Arc>, + pub(crate) map: Arc>, wal: Option, id: usize, approximate_size: Arc, @@ -35,28 +36,68 @@ pub(crate) fn map_bound(bound: Bound<&[u8]>) -> Bound { } } +/// Create a bound of `Bytes` from a bound of `KeySlice`. +pub(crate) fn map_key_bound(bound: Bound) -> Bound { + match bound { + Bound::Included(x) => Bound::Included(KeyBytes::from_bytes_with_ts( + Bytes::copy_from_slice(x.key_ref()), + x.ts(), + )), + Bound::Excluded(x) => Bound::Excluded(KeyBytes::from_bytes_with_ts( + Bytes::copy_from_slice(x.key_ref()), + x.ts(), + )), + Bound::Unbounded => Bound::Unbounded, + } +} + +/// Create a bound of `Bytes` from a bound of `KeySlice`. +pub(crate) fn map_key_bound_plus_ts(bound: Bound<&[u8]>, ts: u64) -> Bound { + match bound { + Bound::Included(x) => Bound::Included(KeySlice::from_slice(x, ts)), + Bound::Excluded(x) => Bound::Excluded(KeySlice::from_slice(x, ts)), + Bound::Unbounded => Bound::Unbounded, + } +} + impl MemTable { /// Create a new mem-table. 
- pub fn create(_id: usize) -> Self { - unimplemented!() + pub fn create(id: usize) -> Self { + Self { + map: Arc::new(SkipMap::new()), + wal: None, + id, + approximate_size: Arc::new(AtomicUsize::new(0)), + } } /// Create a new mem-table with WAL - pub fn create_with_wal(_id: usize, _path: impl AsRef) -> Result { - unimplemented!() + pub fn create_with_wal(id: usize, path: impl AsRef) -> Result { + Ok(Self { + map: Arc::new(SkipMap::new()), + wal: Some(Wal::create(path)?), + id, + approximate_size: Arc::new(AtomicUsize::new(0)), + }) } /// Create a memtable from WAL - pub fn recover_from_wal(_id: usize, _path: impl AsRef) -> Result { - unimplemented!() + pub fn recover_from_wal(id: usize, path: impl AsRef) -> Result { + let map = Arc::new(SkipMap::new()); + Ok(Self { + wal: Some(Wal::recover(path, &map)?), + map, + id, + approximate_size: Arc::new(AtomicUsize::new(0)), + }) } pub fn for_testing_put_slice(&self, key: &[u8], value: &[u8]) -> Result<()> { - self.put(key, value) + self.put(KeySlice::from_slice(key, TS_DEFAULT), value) } pub fn for_testing_get_slice(&self, key: &[u8]) -> Option { - self.get(key) + self.get(KeySlice::from_slice(key, TS_DEFAULT)) } pub fn for_testing_scan_slice( @@ -64,20 +105,40 @@ impl MemTable { lower: Bound<&[u8]>, upper: Bound<&[u8]>, ) -> MemTableIterator { - self.scan(lower, upper) + self.scan( + map_key_bound_plus_ts(lower, TS_DEFAULT), + map_key_bound_plus_ts(upper, TS_DEFAULT), + ) } + /// should not be used after finishing week 3 /// Get a value by key. - pub fn get(&self, _key: &[u8]) -> Option { - unimplemented!() + pub fn get(&self, key: KeySlice) -> Option { + let key_bytes = KeyBytes::from_bytes_with_ts( + Bytes::from_static(unsafe { std::mem::transmute(key.key_ref()) }), + key.ts(), + ); + + self.map.get(&key_bytes).map(|x| x.value().clone()) } /// Put a key-value pair into the mem-table. /// /// In week 1, day 1, simply put the key-value pair into the skipmap. /// In week 2, day 6, also flush the data to WAL. 
- pub fn put(&self, _key: &[u8], _value: &[u8]) -> Result<()> { - unimplemented!() + pub fn put(&self, key: KeySlice, value: &[u8]) -> Result<()> { + self.approximate_size.fetch_add( + key.raw_len() + value.len(), + std::sync::atomic::Ordering::Relaxed, + ); + self.map.insert( + key.to_key_vec().into_key_bytes(), + Bytes::copy_from_slice(value), + ); + if let Some(ref wal) = self.wal { + wal.put(key, value)?; + } + Ok(()) } pub fn sync_wal(&self) -> Result<()> { @@ -88,13 +149,25 @@ impl MemTable { } /// Get an iterator over a range of keys. - pub fn scan(&self, _lower: Bound<&[u8]>, _upper: Bound<&[u8]>) -> MemTableIterator { - unimplemented!() + pub fn scan(&self, lower: Bound, upper: Bound) -> MemTableIterator { + let map = self.map.clone(); + let mut iter = MemTableIteratorBuilder { + map, + iter_builder: |map| map.range((map_key_bound(lower), map_key_bound(upper))), + item: (KeyBytes::new(), Bytes::new()), + } + .build(); + let next = iter.with_iter_mut(|iter| MemTableIterator::entry_to_item(iter.next())); + iter.with_item_mut(|item| *item = next); + iter } /// Flush the mem-table to SSTable. Implement in week 1 day 6. - pub fn flush(&self, _builder: &mut SsTableBuilder) -> Result<()> { - unimplemented!() + pub fn flush(&self, builder: &mut SsTableBuilder) -> Result<()> { + self.map + .iter() + .for_each(|entry| builder.add(entry.key().as_key_slice(), entry.value())); + Ok(()) } pub fn id(&self) -> usize { @@ -112,41 +185,59 @@ impl MemTable { } } -type SkipMapRangeIter<'a> = - crossbeam_skiplist::map::Range<'a, Bytes, (Bound, Bound), Bytes, Bytes>; +type SkipMapRangeIter<'a> = crossbeam_skiplist::map::Range< + 'a, + KeyBytes, + (Bound, Bound), + KeyBytes, + Bytes, +>; /// An iterator over a range of `SkipMap`. This is a self-referential structure and please refer to week 1, day 2 /// chapter for more information. /// /// This is part of week 1, day 2. +/// changed in week 3, day 2. 
#[self_referencing] pub struct MemTableIterator { /// Stores a reference to the skipmap. - map: Arc>, + map: Arc>, /// Stores a skipmap iterator that refers to the lifetime of `MemTableIterator` itself. #[borrows(map)] #[not_covariant] iter: SkipMapRangeIter<'this>, /// Stores the current key-value pair. - item: (Bytes, Bytes), + item: (KeyBytes, Bytes), +} + +impl MemTableIterator { + // This function is used to convert a `SkipMap` entry to a key-value pair. + fn entry_to_item(entry: Option>) -> (KeyBytes, Bytes) { + entry + .map(|x| (x.key().clone(), x.value().clone())) + .unwrap_or_else(|| (KeyBytes::new(), Bytes::new())) + } } impl StorageIterator for MemTableIterator { type KeyType<'a> = KeySlice<'a>; fn value(&self) -> &[u8] { - unimplemented!() + &self.borrow_item().1 } fn key(&self) -> KeySlice { - unimplemented!() + self.borrow_item().0.as_key_slice() } + // is_valid returns if the iterator has reached the end or errored. fn is_valid(&self) -> bool { - unimplemented!() + !self.borrow_item().0.is_empty() } fn next(&mut self) -> Result<()> { - unimplemented!() + let next = self.with_iter_mut(|iter| MemTableIterator::entry_to_item(iter.next())); + self.with_item_mut(|item| *item = next); + Ok(()) } } diff --git a/mini-lsm-starter/src/mvcc.rs b/mini-lsm-starter/src/mvcc.rs index 28481f4ec..1e4f94d51 100644 --- a/mini-lsm-starter/src/mvcc.rs +++ b/mini-lsm-starter/src/mvcc.rs @@ -1,8 +1,5 @@ -#![allow(unused_variables)] // TODO(you): remove this lint after implementing this mod -#![allow(dead_code)] // TODO(you): remove this lint after implementing this mod - pub mod txn; -mod watermark; +pub mod watermark; use std::{ collections::{BTreeMap, HashSet}, @@ -55,6 +52,19 @@ impl LsmMvccInner { } pub fn new_txn(&self, inner: Arc, serializable: bool) -> Arc { - unimplemented!() + let mut ts = self.ts.lock(); + let read_ts = ts.0; + ts.1.add_reader(read_ts); + Arc::new(Transaction { + read_ts, + inner, + local_storage: Arc::new(Default::default()), + committed: 
Arc::new(Default::default()), + key_hashes: if serializable { + Some(Mutex::new((HashSet::new(), HashSet::new()))) + } else { + None + }, + }) } } diff --git a/mini-lsm-starter/src/mvcc/txn.rs b/mini-lsm-starter/src/mvcc/txn.rs index 33ec6a0df..12c9d0ecc 100644 --- a/mini-lsm-starter/src/mvcc/txn.rs +++ b/mini-lsm-starter/src/mvcc/txn.rs @@ -1,24 +1,27 @@ -#![allow(unused_variables)] // TODO(you): remove this lint after implementing this mod -#![allow(dead_code)] // TODO(you): remove this lint after implementing this mod - use std::{ collections::HashSet, ops::Bound, - sync::{atomic::AtomicBool, Arc}, + sync::{ + atomic::{AtomicBool, Ordering}, + Arc, + }, }; use anyhow::Result; use bytes::Bytes; -use crossbeam_skiplist::SkipMap; +use crossbeam_skiplist::{map::Entry, SkipMap}; use ouroboros::self_referencing; use parking_lot::Mutex; use crate::{ iterators::{two_merge_iterator::TwoMergeIterator, StorageIterator}, lsm_iterator::{FusedIterator, LsmIterator}, - lsm_storage::LsmStorageInner, + lsm_storage::{LsmStorageInner, WriteBatchRecord}, + mem_table::map_bound, }; +use super::CommittedTxnData; + pub struct Transaction { pub(crate) read_ts: u64, pub(crate) inner: Arc, @@ -30,28 +33,152 @@ pub struct Transaction { impl Transaction { pub fn get(&self, key: &[u8]) -> Result> { - unimplemented!() + if self.committed.load(Ordering::SeqCst) { + panic!("cannot operate on committed txn!"); + } + + if let Some(guard) = &self.key_hashes { + let mut guard = guard.lock(); + let (_, read_set) = &mut *guard; + read_set.insert(farmhash::hash32(key)); + } + + if let Some(entry) = self.local_storage.get(key) { + if entry.value().is_empty() { + return Ok(None); + } + return Ok(Some(entry.value().clone())); + } + self.inner.get_with_txn(key, self.read_ts) } pub fn scan(self: &Arc, lower: Bound<&[u8]>, upper: Bound<&[u8]>) -> Result { - unimplemented!() + if self.committed.load(Ordering::SeqCst) { + panic!("cannot operate on committed txn!"); + } + let mut local_iter = 
TxnLocalIteratorBuilder { + map: self.local_storage.clone(), + iter_builder: |map| map.range((map_bound(lower), map_bound(upper))), + item: (Bytes::new(), Bytes::new()), + } + .build(); + + // handle deletions + let next = local_iter.with_iter_mut(|iter| TxnLocalIterator::entry_to_item(iter.next())); + local_iter.with_mut(|x| *x.item = next); + + TxnIterator::create( + self.clone(), + TwoMergeIterator::create( + local_iter, + self.inner.scan_with_txn(lower, upper, self.read_ts)?, + )?, + ) } pub fn put(&self, key: &[u8], value: &[u8]) { - unimplemented!() + if self.committed.load(Ordering::SeqCst) { + panic!("cannot operate on committed txn!"); + } + if let Some(key_hashes) = &self.key_hashes { + let mut key_hashes = key_hashes.lock(); + let (write_hashes, _) = &mut *key_hashes; + write_hashes.insert(farmhash::hash32(key)); + } + self.local_storage + .insert(Bytes::copy_from_slice(key), Bytes::copy_from_slice(value)); } pub fn delete(&self, key: &[u8]) { - unimplemented!() + if self.committed.load(Ordering::SeqCst) { + panic!("cannot operate on committed txn!"); + } + if let Some(key_hashes) = &self.key_hashes { + let mut key_hashes = key_hashes.lock(); + let (write_hashes, _) = &mut *key_hashes; + write_hashes.insert(farmhash::hash32(key)); + } + self.local_storage + .insert(Bytes::copy_from_slice(key), Bytes::new()); } pub fn commit(&self) -> Result<()> { - unimplemented!() + self.committed + .compare_exchange(false, true, Ordering::SeqCst, Ordering::SeqCst) + .expect("cannot operate on committed txn!"); + // ensures only one transaction goes into the transaction verification and commit phase. 
+ let _commit_lock = self.inner.mvcc().commit_lock.lock(); + if let Some(guard) = &self.key_hashes { + let guard = guard.lock(); + let (write_set, read_set) = &*guard; + println!( + "commit txn: write_set: {:?}, read_set: {:?}", + write_set, read_set + ); + if !write_set.is_empty() { + let committed_txns = self.inner.mvcc().committed_txns.lock(); + // go through all transactions with commit timestamp within range (read_ts, expected_commit_ts) (both excluded bounds) + for (_, txn_data) in committed_txns.range(self.read_ts + 1..) { + for key_hash in read_set { + // if the read set of the current transaction overlaps with the write set of any transaction. + if txn_data.key_hashes.contains(key_hash) { + println!( + "txn conflict detected: {:?} {:?}", + write_set, txn_data.key_hashes + ); + return Err(anyhow::anyhow!("txn conflict detected")); + } + } + } + } + } + + let batch = self + .local_storage + .iter() + .map(|entry| { + if entry.value().is_empty() { + WriteBatchRecord::Del(entry.key().clone()) + } else { + WriteBatchRecord::Put(entry.key().clone(), entry.value().clone()) + } + }) + .collect::>(); + let commit_ts = self.inner.write_batch_inner(&batch)?; + + // insert the write set into the committed_txns + if let Some(_) = &self.key_hashes { + let mut committed_txns = self.inner.mvcc().committed_txns.lock(); + let mut key_hashes = self.key_hashes.as_ref().unwrap().lock(); + let (write_set, _) = &mut *key_hashes; + committed_txns.insert( + commit_ts, + CommittedTxnData { + key_hashes: std::mem::take(write_set), + read_ts: self.read_ts, + commit_ts, + }, + ); + + // remove all transactions below the watermark + let watermark = self.inner.mvcc().watermark(); + while let Some(entry) = committed_txns.first_entry() { + if *entry.key() < watermark { + entry.remove(); + } else { + break; + } + } + } + + Ok(()) } } impl Drop for Transaction { - fn drop(&mut self) {} + fn drop(&mut self) { + self.inner.mvcc().ts.lock().1.remove_reader(self.read_ts); + } } type 
SkipMapRangeIter<'a> = @@ -69,28 +196,39 @@ pub struct TxnLocalIterator { item: (Bytes, Bytes), } +impl TxnLocalIterator { + // This function is used to convert a `SkipMap` entry to a key-value pair. + fn entry_to_item(entry: Option>) -> (Bytes, Bytes) { + entry + .map(|x| (x.key().clone(), x.value().clone())) + .unwrap_or_else(|| (Bytes::new(), Bytes::new())) + } +} + impl StorageIterator for TxnLocalIterator { type KeyType<'a> = &'a [u8]; fn value(&self) -> &[u8] { - unimplemented!() + &self.borrow_item().1 } fn key(&self) -> &[u8] { - unimplemented!() + &self.borrow_item().0[..] } fn is_valid(&self) -> bool { - unimplemented!() + !self.borrow_item().0.is_empty() } fn next(&mut self) -> Result<()> { - unimplemented!() + let next = self.with_iter_mut(|iter| TxnLocalIterator::entry_to_item(iter.next())); + self.with_item_mut(|item| *item = next); + Ok(()) } } pub struct TxnIterator { - _txn: Arc, + txn: Arc, iter: TwoMergeIterator>, } @@ -99,7 +237,29 @@ impl TxnIterator { txn: Arc, iter: TwoMergeIterator>, ) -> Result { - unimplemented!() + let mut iter = Self { txn, iter }; + iter.skip_deletes()?; + if iter.is_valid() { + iter.add_to_read_set(iter.key()); + } + Ok(iter) + } + + // TwoMergeIterator will retain the deletion markers in the child iterators, + // we need to modify your TxnIterator implementation to correctly handle deletions. 
+ fn skip_deletes(&mut self) -> Result<()> { + while self.iter.is_valid() && self.iter.value().is_empty() { + self.iter.next()?; + } + Ok(()) + } + + fn add_to_read_set(&self, key: &[u8]) { + if let Some(guard) = &self.txn.key_hashes { + let mut guard = guard.lock(); + let (_, read_set) = &mut *guard; + read_set.insert(farmhash::hash32(key)); + } } } @@ -119,7 +279,8 @@ impl StorageIterator for TxnIterator { } fn next(&mut self) -> Result<()> { - unimplemented!() + self.iter.next()?; + self.skip_deletes() } fn num_active_iterators(&self) -> usize { diff --git a/mini-lsm-starter/src/mvcc/watermark.rs b/mini-lsm-starter/src/mvcc/watermark.rs index 4bbb4fa01..05fc56f44 100644 --- a/mini-lsm-starter/src/mvcc/watermark.rs +++ b/mini-lsm-starter/src/mvcc/watermark.rs @@ -1,12 +1,16 @@ -#![allow(unused_variables)] // TODO(you): remove this lint after implementing this mod -#![allow(dead_code)] // TODO(you): remove this lint after implementing this mod - use std::collections::BTreeMap; +// Watermark is the lowest read timestamp among all in-progress transactions. 
/// Tracks the read timestamps of in-progress readers.
///
/// `readers` maps a read timestamp to the number of readers registered at that
/// timestamp. The watermark is the lowest timestamp that still has a reader.
pub struct Watermark {
    readers: BTreeMap<u64, usize>,
}

impl Default for Watermark {
    fn default() -> Self {
        Self::new()
    }
}

impl Watermark {
    /// Create a tracker with no registered readers.
    pub fn new() -> Self {
        Self {
            readers: BTreeMap::new(),
        }
    }

    /// Register one more reader at timestamp `ts`.
    pub fn add_reader(&mut self, ts: u64) {
        *self.readers.entry(ts).or_default() += 1;
    }

    /// Unregister one reader at timestamp `ts`.
    ///
    /// The timestamp is forgotten once its last reader is removed. Removing a
    /// timestamp that has no registered reader is a no-op, so callers do not
    /// panic on an unknown timestamp.
    pub fn remove_reader(&mut self, ts: u64) {
        if let std::collections::btree_map::Entry::Occupied(mut slot) = self.readers.entry(ts) {
            if *slot.get() <= 1 {
                slot.remove();
            } else {
                *slot.get_mut() -= 1;
            }
        }
    }

    /// Number of distinct timestamps that still have at least one reader.
    pub fn num_retained_snapshots(&self) -> usize {
        self.readers.len()
    }

    /// Lowest timestamp with a registered reader, or `None` when there is none.
    pub fn watermark(&self) -> Option<u64> {
        self.readers.keys().next().copied()
    }
}
- pub fn encode_block_meta( - block_meta: &[BlockMeta], - #[allow(clippy::ptr_arg)] // remove this allow after you finish - buf: &mut Vec, - ) { - unimplemented!() + pub fn encode_block_metas(block_meta: &[BlockMeta], buf: &mut Vec, max_ts: u64) { + let original_len = buf.len(); + let meta_len = block_meta.len(); + buf.put_u32(block_meta.len() as u32); + for meta in block_meta { + buf.put_u32(meta.offset as u32); + buf.put_u16(meta.first_key.key_len() as u16); + buf.put_slice(meta.first_key.key_ref()); + buf.put_u64(meta.first_key.ts()); + buf.put_u16(meta.last_key.key_len() as u16); + buf.put_slice(meta.last_key.key_ref()); + buf.put_u64(meta.last_key.ts()); + } + // put max ts + buf.put_u64(max_ts); + // add checksum + let checksum = crc32fast::hash(&buf[original_len + 4..]); + buf.put_u32(checksum); } /// Decode block meta from a buffer. - pub fn decode_block_meta(buf: impl Buf) -> Vec { - unimplemented!() + pub fn decode_block_metas(mut buf: &[u8]) -> Result<(Vec, u64)> { + // get meta data len + let meta_len = buf.get_u32() as usize; + // cal checksum + let checksum = crc32fast::hash(&buf[..buf.remaining() - 4]); + let mut block_meta = vec![]; + for _ in 0..meta_len { + let offset = buf.get_u32() as usize; + let first_key_len = buf.get_u16() as usize; + let first_key = + KeyBytes::from_bytes_with_ts(buf.copy_to_bytes(first_key_len), buf.get_u64()); + let last_key_len = buf.get_u16() as usize; + let last_key = + KeyBytes::from_bytes_with_ts(buf.copy_to_bytes(last_key_len), buf.get_u64()); + block_meta.push(BlockMeta { + offset, + first_key, + last_key, + }); + } + let max_ts = buf.get_u64(); + // checksum + if checksum != buf.get_u32() { + return Err(anyhow::anyhow!("BlockMeta checksum mismatch")); + } + + Ok((block_meta, max_ts)) } } @@ -85,16 +129,16 @@ impl FileObject { /// An SSTable. pub struct SsTable { + id: usize, /// The actual storage unit of SsTable, the format is as above. 
pub(crate) file: FileObject, /// The meta blocks that hold info for data blocks. - pub(crate) block_meta: Vec, + pub(crate) block_metas: Vec, /// The offset that indicates the start point of meta blocks in `file`. pub(crate) block_meta_offset: usize, - id: usize, - block_cache: Option>, first_key: KeyBytes, last_key: KeyBytes, + block_cache: Option>, pub(crate) bloom: Option, /// The maximum timestamp stored in this SST, implemented in week 3. max_ts: u64, @@ -108,7 +152,37 @@ impl SsTable { /// Open SSTable from a file. pub fn open(id: usize, block_cache: Option>, file: FileObject) -> Result { - unimplemented!() + let len = file.size(); + // decode bloom filter + // u32 for extra info + let raw_bloom_offset = file.read(len - 4, 4)?; + let bloom_offset = (&raw_bloom_offset[..]).get_u32() as u64; + let raw_bloom = file.read(bloom_offset, len - bloom_offset - 4 /* extra size */)?; + let bloom = Bloom::decode(raw_bloom.as_slice())?; + println!("decode bloom size: {}, k={}", bloom.filter.len(), bloom.k); + + // decode block meta + // u32 for extra info + let raw_block_meta_offset = file.read(bloom_offset - 4, 4)?; + let block_meta_offset = (&raw_block_meta_offset[..]).get_u32() as u64; + let raw_block_meta = file.read(block_meta_offset, bloom_offset - 4 - block_meta_offset)?; + let (block_metas, max_ts) = BlockMeta::decode_block_metas(raw_block_meta.as_slice())?; + + // decode data blocks + let raw_data = file.read(0, block_meta_offset)?; + + let sst_table = SsTable { + id, + file, + block_meta_offset: block_meta_offset as usize, + first_key: block_metas.first().unwrap().first_key.clone(), + last_key: block_metas.last().unwrap().last_key.clone(), + block_metas, + block_cache, + bloom: Some(bloom), + max_ts, + }; + Ok(sst_table) } /// Create a mock SST with only first key + last key metadata @@ -120,7 +194,7 @@ impl SsTable { ) -> Self { Self { file: FileObject(None, file_size), - block_meta: vec![], + block_metas: vec![], block_meta_offset: 0, id, block_cache: None, @@ 
-133,24 +207,56 @@ impl SsTable { /// Read a block from the disk. pub fn read_block(&self, block_idx: usize) -> Result> { - unimplemented!() + let offset = self.block_metas[block_idx].offset as u64; + let block_end = self + .block_metas + .get(block_idx + 1) + .map_or(self.block_meta_offset, |x| x.offset) as u64; + // get all data including checksum + let block_len = block_end - offset - 4; + let data_including_checksum = self.file.read(offset, block_end - offset)?; + let checksum = (&data_including_checksum[block_len as usize..]).get_u32(); + let block_data = &data_including_checksum[..block_len as usize]; + // check checksum + if checksum != crc32fast::hash(block_data) { + return Err(anyhow::anyhow!("checksum mismatch")); + } + + Ok(Arc::new(Block::decode(block_data))) } /// Read a block from disk, with block cache. (Day 4) pub fn read_block_cached(&self, block_idx: usize) -> Result> { - unimplemented!() + if let Some(ref block_cache) = self.block_cache { + let block = block_cache + .try_get_with((self.id, block_idx), || self.read_block(block_idx)) + .map_err(|e| anyhow::anyhow!("block cache error: {:?}", e))?; + Ok(block) + } else { + self.read_block(block_idx) + } } + /* + -------------------------------------- + | block 1 | block 2 | block meta | + -------------------------------------- + | a, b, c | e, f, g | 1: a/c, 2: e/g | + -------------------------------------- + */ /// Find the block that may contain `key`. - /// Note: You may want to make use of the `first_key` stored in `BlockMeta`. - /// You may also assume the key-value pairs stored in each consecutive block are sorted. + /// make use of the `first_key` stored in `BlockMeta`. + /// because for example, if we want to get `b`, + /// we can directly we can know block 1 contains keys a <= keys < e. 
/// Get bloom filter bits per key from entries count and FPR.
///
/// For `n` entries and false-positive rate `p`, the filter needs
/// `m = -n * ln(p) / ln(2)^2` bits, so the result is `ceil(m / n)`.
/// The value must not be scaled by `ln(2)`: that would be the optimal number
/// of hash functions, not the number of bits per key, and would undersize the
/// filter.
pub fn bloom_bits_per_key(hashs_count: usize, false_positive_rate: f64) -> usize {
    let size =
        -1.0 * (hashs_count as f64) * false_positive_rate.ln() / std::f64::consts::LN_2.powi(2);
    let locs = (size / (hashs_count as f64)).ceil();
    locs as usize
}
assert!(!bloom.may_contain(check_hash[3])); + } +} diff --git a/mini-lsm-starter/src/table/builder.rs b/mini-lsm-starter/src/table/builder.rs index cea3d08f8..28be9774a 100644 --- a/mini-lsm-starter/src/table/builder.rs +++ b/mini-lsm-starter/src/table/builder.rs @@ -1,36 +1,94 @@ -#![allow(unused_variables)] // TODO(you): remove this lint after implementing this mod -#![allow(dead_code)] // TODO(you): remove this lint after implementing this mod - use std::path::Path; use std::sync::Arc; use anyhow::Result; +use bytes::BufMut; -use super::{BlockMeta, SsTable}; -use crate::{block::BlockBuilder, key::KeySlice, lsm_storage::BlockCache}; +use super::{bloom::Bloom, BlockMeta, FileObject, SsTable}; +use crate::{ + block::BlockBuilder, + key::{KeySlice, KeyVec}, + lsm_storage::BlockCache, +}; /// Builds an SSTable from key-value pairs. pub struct SsTableBuilder { - builder: BlockBuilder, - first_key: Vec, - last_key: Vec, + block_builder: BlockBuilder, + first_key: KeyVec, + last_key: KeyVec, data: Vec, - pub(crate) meta: Vec, + pub(crate) metas: Vec, block_size: usize, + key_hashes: Vec, + max_ts: u64, } +/* +--------------------------------------------------------------------------------------------------------------------------------------------------------------- +| Block Section | Meta Section | +--------------------------------------------------------------------------------------------------------------------------------------------------------------- +| data block | checksum | ... | data block | checksum | no. of block | metadata | checksum | meta block offset | bloom filter | checksum | bloom filter offset | +| varlen | u32 | | varlen | u32 | u32 | varlen | u32 | u32 | varlen | u32 | u32 | +--------------------------------------------------------------------------------------------------------------------------------------------------------------- +*/ impl SsTableBuilder { /// Create a builder based on target block size. 
pub fn new(block_size: usize) -> Self { - unimplemented!() + Self { + block_builder: BlockBuilder::new(block_size), + first_key: KeyVec::new(), + last_key: KeyVec::new(), + data: Vec::new(), + metas: Vec::new(), + block_size, + key_hashes: Vec::new(), + max_ts: 0, + } } /// Adds a key-value pair to SSTable. /// - /// Note: You should split a new block when the current block is full.(`std::mem::replace` may - /// be helpful here) + /// we should split a new block when the current block is full. pub fn add(&mut self, key: KeySlice, value: &[u8]) { - unimplemented!() + // if the first time add to this block, set the first key + if self.first_key.is_empty() { + self.first_key.set_from_slice(key); + } + + // add the key hash to the bloom filter + self.key_hashes.push(farmhash::fingerprint32(key.key_ref())); + if key.ts() > self.max_ts { + self.max_ts = key.ts(); + } + // block builder returns false when the block is full. + if self.block_builder.add(key, value) { + self.last_key.set_from_slice(key); + return; + } + + // if the block is full, build the block and add the block to the data. + self.finalize_block_to_sst(); + + // add (key, value) to next block + assert!(self.block_builder.add(key, value)); + self.first_key.set_from_slice(key); + self.last_key.set_from_slice(key); + } + + // finalize the block and add it to the data. + fn finalize_block_to_sst(&mut self) { + let block = std::mem::replace(&mut self.block_builder, BlockBuilder::new(self.block_size)); + self.metas.push(BlockMeta { + offset: self.data.len(), /* previous data len */ + first_key: std::mem::take(&mut self.first_key).into_key_bytes(), + last_key: std::mem::take(&mut self.last_key).into_key_bytes(), + }); + + let encode_data = block.build().encode(); + self.data.extend_from_slice(&encode_data); + // add checksum + let checksum = crc32fast::hash(&encode_data); + self.data.put_u32(checksum); } /// Get the estimated size of the SSTable. 
@@ -38,17 +96,45 @@ impl SsTableBuilder { /// Since the data blocks contain much more data than meta blocks, just return the size of data /// blocks here. pub fn estimated_size(&self) -> usize { - unimplemented!() + self.data.len() } /// Builds the SSTable and writes it to the given path. Use the `FileObject` structure to manipulate the disk objects. pub fn build( - self, + mut self, id: usize, block_cache: Option>, path: impl AsRef, ) -> Result { - unimplemented!() + // finalize the last block whatever it is full or not. + self.finalize_block_to_sst(); + // encode the block meta and write it to the disk. + let meta_len = self.metas.len(); + let block_meta_offset = self.data.len(); + let mut buf = self.data; + BlockMeta::encode_block_metas(&self.metas, &mut buf, self.max_ts); + // extra info for the meta block offset + buf.put_u32(block_meta_offset as u32); + // create bloom filter and encode it + let bits_per_key = Bloom::bloom_bits_per_key(self.key_hashes.len(), 0.01); + let bloom = Bloom::build_from_key_hashes(&self.key_hashes, bits_per_key); + let bloom_offset = buf.len(); + bloom.encode(&mut buf); + buf.put_u32(bloom_offset as u32); + + let file = FileObject::create(path.as_ref(), buf)?; + let sst_table = SsTable { + id, + file, + first_key: self.metas.first().unwrap().first_key.clone(), + last_key: self.metas.last().unwrap().last_key.clone(), + block_metas: self.metas, + block_meta_offset, + block_cache, + bloom: Some(bloom), + max_ts: self.max_ts, + }; + Ok(sst_table) } #[cfg(test)] diff --git a/mini-lsm-starter/src/table/iterator.rs b/mini-lsm-starter/src/table/iterator.rs index 32b06a2a3..ad8eff595 100644 --- a/mini-lsm-starter/src/table/iterator.rs +++ b/mini-lsm-starter/src/table/iterator.rs @@ -3,7 +3,7 @@ use std::sync::Arc; -use anyhow::Result; +use anyhow::{Ok, Result}; use super::SsTable; use crate::{block::BlockIterator, iterators::StorageIterator, key::KeySlice}; @@ -16,26 +16,62 @@ pub struct SsTableIterator { } impl SsTableIterator { + fn 
seek_to_first_inner(table: &Arc) -> Result<(usize, BlockIterator)> { + Ok(( + 0, + BlockIterator::create_and_seek_to_first(table.read_block_cached(0)?), + )) + } + /// Create a new iterator and seek to the first key-value pair in the first data block. pub fn create_and_seek_to_first(table: Arc) -> Result { - unimplemented!() + let (idx, iter) = SsTableIterator::seek_to_first_inner(&table)?; + Ok(Self { + table, + blk_iter: iter, + blk_idx: idx, + }) } /// Seek to the first key-value pair in the first data block. pub fn seek_to_first(&mut self) -> Result<()> { - unimplemented!() + let (blk_idx, blk_iter) = Self::seek_to_first_inner(&self.table)?; + self.blk_idx = blk_idx; + self.blk_iter = blk_iter; + Ok(()) + } + + // using the first key of each block to do the binary search so as to reduce the complexity. + fn seek_to_key_inner(table: &Arc, key: KeySlice) -> Result<(usize, BlockIterator)> { + let mut blk_idx = table.find_block_idx(key); + let mut blk_iter = + BlockIterator::create_and_seek_to_key(table.read_block_cached(blk_idx)?, key); + if !blk_iter.is_valid() { + blk_idx += 1; + if blk_idx < table.num_of_blocks() { + blk_iter = + BlockIterator::create_and_seek_to_first(table.read_block_cached(blk_idx)?); + } + } + Ok((blk_idx, blk_iter)) } /// Create a new iterator and seek to the first key-value pair which >= `key`. pub fn create_and_seek_to_key(table: Arc, key: KeySlice) -> Result { - unimplemented!() + let (idx, iter) = SsTableIterator::seek_to_key_inner(&table, key)?; + Ok(Self { + table, + blk_iter: iter, + blk_idx: idx, + }) } /// Seek to the first key-value pair which >= `key`. - /// Note: You probably want to review the handout for detailed explanation when implementing - /// this function. 
pub fn seek_to_key(&mut self, key: KeySlice) -> Result<()> { - unimplemented!() + let (blk_idx, blk_iter) = Self::seek_to_key_inner(&self.table, key)?; + self.blk_idx = blk_idx; + self.blk_iter = blk_iter; + Ok(()) } } @@ -44,22 +80,32 @@ impl StorageIterator for SsTableIterator { /// Return the `key` that's held by the underlying block iterator. fn key(&self) -> KeySlice { - unimplemented!() + debug_assert!(!self.blk_iter.key().is_empty(), "invalid iterator"); + self.blk_iter.key() } /// Return the `value` that's held by the underlying block iterator. fn value(&self) -> &[u8] { - unimplemented!() + self.blk_iter.value() } /// Return whether the current block iterator is valid or not. fn is_valid(&self) -> bool { - unimplemented!() + self.blk_iter.is_valid() } /// Move to the next `key` in the block. /// Note: You may want to check if the current block iterator is valid after the move. fn next(&mut self) -> Result<()> { - unimplemented!() + self.blk_iter.next(); + if !self.blk_iter.is_valid() { + self.blk_idx += 1; + if self.blk_idx < self.table.num_of_blocks() { + self.blk_iter = BlockIterator::create_and_seek_to_first( + self.table.read_block_cached(self.blk_idx)?, + ); + } + } + Ok(()) } } diff --git a/mini-lsm-starter/src/tests.rs b/mini-lsm-starter/src/tests.rs index 688adfa83..466d6402c 100644 --- a/mini-lsm-starter/src/tests.rs +++ b/mini-lsm-starter/src/tests.rs @@ -1,2 +1,24 @@ //! DO NOT MODIFY -- Mini-LSM tests modules //! This file will be automatically rewritten by the copy-test command. 
+ +mod harness; +mod week1_day1; +mod week1_day2; +mod week1_day3; +mod week1_day4; +mod week1_day5; +mod week1_day6; +mod week1_day7; +mod week2_day1; +mod week2_day2; +mod week2_day3; +mod week2_day4; +mod week2_day5; +mod week2_day6; +mod week3_day1; +mod week3_day2; +mod week3_day3; +mod week3_day4; +mod week3_day5; +mod week3_day6; +mod week3_day7; diff --git a/mini-lsm-starter/src/tests/harness.rs b/mini-lsm-starter/src/tests/harness.rs new file mode 100644 index 000000000..b41745b2f --- /dev/null +++ b/mini-lsm-starter/src/tests/harness.rs @@ -0,0 +1,442 @@ +use std::{ + collections::BTreeMap, ops::Bound, os::unix::fs::MetadataExt, path::Path, sync::Arc, + time::Duration, +}; + +use anyhow::{bail, Result}; +use bytes::Bytes; + +use crate::{ + compact::{ + CompactionOptions, LeveledCompactionOptions, SimpleLeveledCompactionOptions, + TieredCompactionOptions, + }, + iterators::{merge_iterator::MergeIterator, StorageIterator}, + key::{KeySlice, TS_ENABLED}, + lsm_storage::{BlockCache, LsmStorageInner, LsmStorageState, MiniLsm}, + table::{SsTable, SsTableBuilder, SsTableIterator}, +}; + +#[derive(Clone)] +pub struct MockIterator { + pub data: Vec<(Bytes, Bytes)>, + pub error_when: Option, + pub index: usize, +} + +impl MockIterator { + pub fn new(data: Vec<(Bytes, Bytes)>) -> Self { + Self { + data, + index: 0, + error_when: None, + } + } + + pub fn new_with_error(data: Vec<(Bytes, Bytes)>, error_when: usize) -> Self { + Self { + data, + index: 0, + error_when: Some(error_when), + } + } +} + +impl StorageIterator for MockIterator { + type KeyType<'a> = KeySlice<'a>; + + fn next(&mut self) -> Result<()> { + if self.index < self.data.len() { + self.index += 1; + } + if let Some(error_when) = self.error_when { + if self.index == error_when { + bail!("fake error!"); + } + } + Ok(()) + } + + fn key(&self) -> KeySlice { + if let Some(error_when) = self.error_when { + if self.index >= error_when { + panic!("invalid access after next returns an error!"); + } + } + 
KeySlice::for_testing_from_slice_no_ts(self.data[self.index].0.as_ref()) + } + + fn value(&self) -> &[u8] { + if let Some(error_when) = self.error_when { + if self.index >= error_when { + panic!("invalid access after next returns an error!"); + } + } + self.data[self.index].1.as_ref() + } + + fn is_valid(&self) -> bool { + if let Some(error_when) = self.error_when { + if self.index >= error_when { + panic!("invalid access after next returns an error!"); + } + } + self.index < self.data.len() + } +} + +pub fn as_bytes(x: &[u8]) -> Bytes { + Bytes::copy_from_slice(x) +} + +pub fn check_iter_result_by_key(iter: &mut I, expected: Vec<(Bytes, Bytes)>) +where + I: for<'a> StorageIterator = KeySlice<'a>>, +{ + for (k, v) in expected { + assert!(iter.is_valid()); + assert_eq!( + k, + iter.key().for_testing_key_ref(), + "expected key: {:?}, actual key: {:?}", + k, + as_bytes(iter.key().for_testing_key_ref()), + ); + assert_eq!( + v, + iter.value(), + "expected value: {:?}, actual value: {:?}", + v, + as_bytes(iter.value()), + ); + iter.next().unwrap(); + } + assert!(!iter.is_valid()); +} + +#[allow(dead_code)] +pub fn check_iter_result_by_key_and_ts(iter: &mut I, expected: Vec<((Bytes, u64), Bytes)>) +where + I: for<'a> StorageIterator = KeySlice<'a>>, +{ + for ((k, ts), v) in expected { + assert!(iter.is_valid()); + assert_eq!( + (&k[..], ts), + ( + iter.key().for_testing_key_ref(), + iter.key().for_testing_ts() + ), + "expected key: {:?}@{}, actual key: {:?}@{}", + k, + ts, + as_bytes(iter.key().for_testing_key_ref()), + iter.key().for_testing_ts(), + ); + assert_eq!( + v, + iter.value(), + "expected value: {:?}, actual value: {:?}", + v, + as_bytes(iter.value()), + ); + iter.next().unwrap(); + } + assert!(!iter.is_valid()); +} + +pub fn check_lsm_iter_result_by_key(iter: &mut I, expected: Vec<(Bytes, Bytes)>) +where + I: for<'a> StorageIterator = &'a [u8]>, +{ + for (k, v) in expected { + assert!(iter.is_valid()); + assert_eq!( + k, + iter.key(), + "expected key: {:?}, 
actual key: {:?}", + k, + as_bytes(iter.key()), + ); + assert_eq!( + v, + iter.value(), + "expected value: {:?}, actual value: {:?}", + v, + as_bytes(iter.value()), + ); + iter.next().unwrap(); + } + assert!(!iter.is_valid()); +} + +pub fn expect_iter_error(mut iter: impl StorageIterator) { + loop { + match iter.next() { + Ok(_) if iter.is_valid() => continue, + Ok(_) => panic!("expect an error"), + Err(_) => break, + } + } +} + +pub fn generate_sst( + id: usize, + path: impl AsRef, + data: Vec<(Bytes, Bytes)>, + block_cache: Option>, +) -> SsTable { + let mut builder = SsTableBuilder::new(128); + for (key, value) in data { + builder.add(KeySlice::for_testing_from_slice_no_ts(&key[..]), &value[..]); + } + builder.build(id, block_cache, path.as_ref()).unwrap() +} + +#[allow(dead_code)] +pub fn generate_sst_with_ts( + id: usize, + path: impl AsRef, + data: Vec<((Bytes, u64), Bytes)>, + block_cache: Option>, +) -> SsTable { + let mut builder = SsTableBuilder::new(128); + for ((key, ts), value) in data { + builder.add( + KeySlice::for_testing_from_slice_with_ts(&key[..], ts), + &value[..], + ); + } + builder.build(id, block_cache, path.as_ref()).unwrap() +} + +pub fn sync(storage: &LsmStorageInner) { + storage + .force_freeze_memtable(&storage.state_lock.lock()) + .unwrap(); + storage.force_flush_next_imm_memtable().unwrap(); +} + +pub fn compaction_bench(storage: Arc) { + let mut key_map = BTreeMap::::new(); + let gen_key = |i| format!("{:010}", i); // 10B + let gen_value = |i| format!("{:0110}", i); // 110B + let mut max_key = 0; + let overlaps = if TS_ENABLED { 10000 } else { 20000 }; + for iter in 0..10 { + let range_begin = iter * 5000; + for i in range_begin..(range_begin + overlaps) { + // 120B per key, 4MB data populated + let key: String = gen_key(i); + let version = key_map.get(&i).copied().unwrap_or_default() + 1; + let value = gen_value(version); + key_map.insert(i, version); + storage.put(key.as_bytes(), value.as_bytes()).unwrap(); + max_key = 
max_key.max(i); + } + } + + std::thread::sleep(Duration::from_secs(1)); // wait until all memtables flush + while { + let snapshot = storage.inner.state.read(); + !snapshot.imm_memtables.is_empty() + } { + storage.inner.force_flush_next_imm_memtable().unwrap(); + } + + let mut prev_snapshot = storage.inner.state.read().clone(); + while { + std::thread::sleep(Duration::from_secs(1)); + let snapshot = storage.inner.state.read().clone(); + let to_cont = prev_snapshot.levels != snapshot.levels + || prev_snapshot.l0_sstables != snapshot.l0_sstables; + prev_snapshot = snapshot; + to_cont + } { + println!("waiting for compaction to converge"); + } + + let mut expected_key_value_pairs = Vec::new(); + for i in 0..(max_key + 40000) { + let key = gen_key(i); + let value = storage.get(key.as_bytes()).unwrap(); + if let Some(val) = key_map.get(&i) { + let expected_value = gen_value(*val); + assert_eq!(value, Some(Bytes::from(expected_value.clone()))); + expected_key_value_pairs.push((Bytes::from(key), Bytes::from(expected_value))); + } else { + assert!(value.is_none()); + } + } + + check_lsm_iter_result_by_key( + &mut storage.scan(Bound::Unbounded, Bound::Unbounded).unwrap(), + expected_key_value_pairs, + ); + + storage.dump_structure(); + + println!("This test case does not guarantee your compaction algorithm produces a LSM state as expected. It only does minimal checks on the size of the levels. 
Please use the compaction simulator to check if the compaction is correctly going on."); +} + +pub fn check_compaction_ratio(storage: Arc) { + let state = storage.inner.state.read().clone(); + let compaction_options = storage.inner.options.compaction_options.clone(); + let mut level_size = Vec::new(); + let l0_sst_num = state.l0_sstables.len(); + for (_, files) in &state.levels { + let size = match &compaction_options { + CompactionOptions::Leveled(_) => files + .iter() + .map(|x| state.sstables.get(x).as_ref().unwrap().table_size()) + .sum::(), + CompactionOptions::Simple(_) | CompactionOptions::Tiered(_) => files.len() as u64, + _ => unreachable!(), + }; + level_size.push(size); + } + let extra_iterators = if TS_ENABLED { + 1 /* txn local iterator for OCC */ + } else { + 0 + }; + let num_iters = storage + .scan(Bound::Unbounded, Bound::Unbounded) + .unwrap() + .num_active_iterators(); + let num_memtables = storage.inner.state.read().imm_memtables.len() + 1; + match compaction_options { + CompactionOptions::NoCompaction => unreachable!(), + CompactionOptions::Simple(SimpleLeveledCompactionOptions { + size_ratio_percent, + level0_file_num_compaction_trigger, + max_levels, + }) => { + assert!(l0_sst_num < level0_file_num_compaction_trigger); + assert!(level_size.len() <= max_levels); + for idx in 1..level_size.len() { + let prev_size = level_size[idx - 1]; + let this_size = level_size[idx]; + if prev_size == 0 && this_size == 0 { + continue; + } + assert!( + this_size as f64 / prev_size as f64 >= size_ratio_percent as f64 / 100.0, + "L{}/L{}, {}/{}<{}%", + state.levels[idx - 1].0, + state.levels[idx].0, + this_size, + prev_size, + size_ratio_percent + ); + } + assert!( + num_iters <= l0_sst_num + num_memtables + max_levels + extra_iterators, + "we found {num_iters} iterators in your implementation, (l0_sst_num={l0_sst_num}, num_memtables={num_memtables}, max_levels={max_levels}) did you use concat iterators?" 
+ ); + } + CompactionOptions::Leveled(LeveledCompactionOptions { + level_size_multiplier, + level0_file_num_compaction_trigger, + max_levels, + .. + }) => { + assert!(l0_sst_num < level0_file_num_compaction_trigger); + assert!(level_size.len() <= max_levels); + let last_level_size = *level_size.last().unwrap(); + let mut multiplier = 1.0; + for idx in (1..level_size.len()).rev() { + multiplier *= level_size_multiplier as f64; + let this_size = level_size[idx - 1]; + assert!( + // do not add hard requirement on level size multiplier considering bloom filters... + this_size as f64 / last_level_size as f64 <= 1.0 / multiplier + 0.5, + "L{}/L_max, {}/{}>>1.0/{}", + state.levels[idx - 1].0, + this_size, + last_level_size, + multiplier + ); + } + assert!( + num_iters <= l0_sst_num + num_memtables + max_levels + extra_iterators, + "we found {num_iters} iterators in your implementation, (l0_sst_num={l0_sst_num}, num_memtables={num_memtables}, max_levels={max_levels}) did you use concat iterators?" 
+ ); + } + CompactionOptions::Tiered(TieredCompactionOptions { + num_tiers, + max_size_amplification_percent, + size_ratio, + min_merge_width, + }) => { + let size_ratio_trigger = (100.0 + size_ratio as f64) / 100.0; + assert_eq!(l0_sst_num, 0); + assert!(level_size.len() <= num_tiers); + let mut sum_size = level_size[0]; + for idx in 1..level_size.len() { + let this_size = level_size[idx]; + if level_size.len() > min_merge_width { + assert!( + sum_size as f64 / this_size as f64 <= size_ratio_trigger, + "violation of size ratio: sum(⬆️L{})/L{}, {}/{}>{}", + state.levels[idx - 1].0, + state.levels[idx].0, + sum_size, + this_size, + size_ratio_trigger + ); + } + if idx + 1 == level_size.len() { + assert!( + sum_size as f64 / this_size as f64 + <= max_size_amplification_percent as f64 / 100.0, + "violation of space amp: sum(⬆️L{})/L{}, {}/{}>{}%", + state.levels[idx - 1].0, + state.levels[idx].0, + sum_size, + this_size, + max_size_amplification_percent + ); + } + sum_size += this_size; + } + assert!( + num_iters <= num_memtables + num_tiers + extra_iterators, + "we found {num_iters} iterators in your implementation, (num_memtables={num_memtables}, num_tiers={num_tiers}) did you use concat iterators?" 
+ ); + } + } +} + +pub fn dump_files_in_dir(path: impl AsRef) { + println!("--- DIR DUMP ---"); + for f in path.as_ref().read_dir().unwrap() { + let f = f.unwrap(); + print!("{}", f.path().display()); + println!( + ", size={:.3}KB", + f.metadata().unwrap().size() as f64 / 1024.0 + ); + } +} + +pub fn construct_merge_iterator_over_storage( + state: &LsmStorageState, +) -> MergeIterator { + let mut iters = Vec::new(); + for t in &state.l0_sstables { + iters.push(Box::new( + SsTableIterator::create_and_seek_to_first(state.sstables.get(t).cloned().unwrap()) + .unwrap(), + )); + } + for (_, files) in &state.levels { + for f in files { + iters.push(Box::new( + SsTableIterator::create_and_seek_to_first(state.sstables.get(f).cloned().unwrap()) + .unwrap(), + )); + } + } + MergeIterator::create(iters) +} diff --git a/mini-lsm-starter/src/tests/week1_day1.rs b/mini-lsm-starter/src/tests/week1_day1.rs new file mode 100644 index 000000000..25ff8ff7c --- /dev/null +++ b/mini-lsm-starter/src/tests/week1_day1.rs @@ -0,0 +1,149 @@ +use std::sync::Arc; + +use tempfile::tempdir; + +use crate::{ + lsm_storage::{LsmStorageInner, LsmStorageOptions}, + mem_table::MemTable, +}; + +#[test] +fn test_task1_memtable_get() { + let memtable = MemTable::create(0); + memtable.for_testing_put_slice(b"key1", b"value1").unwrap(); + memtable.for_testing_put_slice(b"key2", b"value2").unwrap(); + memtable.for_testing_put_slice(b"key3", b"value3").unwrap(); + assert_eq!( + &memtable.for_testing_get_slice(b"key1").unwrap()[..], + b"value1" + ); + assert_eq!( + &memtable.for_testing_get_slice(b"key2").unwrap()[..], + b"value2" + ); + assert_eq!( + &memtable.for_testing_get_slice(b"key3").unwrap()[..], + b"value3" + ); +} + +#[test] +fn test_task1_memtable_overwrite() { + let memtable = MemTable::create(0); + memtable.for_testing_put_slice(b"key1", b"value1").unwrap(); + memtable.for_testing_put_slice(b"key2", b"value2").unwrap(); + memtable.for_testing_put_slice(b"key3", b"value3").unwrap(); + 
memtable.for_testing_put_slice(b"key1", b"value11").unwrap(); + memtable.for_testing_put_slice(b"key2", b"value22").unwrap(); + memtable.for_testing_put_slice(b"key3", b"value33").unwrap(); + assert_eq!( + &memtable.for_testing_get_slice(b"key1").unwrap()[..], + b"value11" + ); + assert_eq!( + &memtable.for_testing_get_slice(b"key2").unwrap()[..], + b"value22" + ); + assert_eq!( + &memtable.for_testing_get_slice(b"key3").unwrap()[..], + b"value33" + ); +} + +#[test] +fn test_task2_storage_integration() { + let dir = tempdir().unwrap(); + let storage = Arc::new( + LsmStorageInner::open(dir.path(), LsmStorageOptions::default_for_week1_test()).unwrap(), + ); + assert_eq!(&storage.get(b"0").unwrap(), &None); + storage.put(b"1", b"233").unwrap(); + storage.put(b"2", b"2333").unwrap(); + storage.put(b"3", b"23333").unwrap(); + assert_eq!(&storage.get(b"1").unwrap().unwrap()[..], b"233"); + assert_eq!(&storage.get(b"2").unwrap().unwrap()[..], b"2333"); + assert_eq!(&storage.get(b"3").unwrap().unwrap()[..], b"23333"); + storage.delete(b"2").unwrap(); + assert!(storage.get(b"2").unwrap().is_none()); + storage.delete(b"0").unwrap(); // should NOT report any error +} + +#[test] +fn test_task3_storage_integration() { + let dir = tempdir().unwrap(); + let storage = Arc::new( + LsmStorageInner::open(dir.path(), LsmStorageOptions::default_for_week1_test()).unwrap(), + ); + storage.put(b"1", b"233").unwrap(); + storage.put(b"2", b"2333").unwrap(); + storage.put(b"3", b"23333").unwrap(); + storage + .force_freeze_memtable(&storage.state_lock.lock()) + .unwrap(); + assert_eq!(storage.state.read().imm_memtables.len(), 1); + let previous_approximate_size = storage.state.read().imm_memtables[0].approximate_size(); + assert!(previous_approximate_size >= 15); + storage.put(b"1", b"2333").unwrap(); + storage.put(b"2", b"23333").unwrap(); + storage.put(b"3", b"233333").unwrap(); + storage + .force_freeze_memtable(&storage.state_lock.lock()) + .unwrap(); + 
assert_eq!(storage.state.read().imm_memtables.len(), 2); + assert!( + storage.state.read().imm_memtables[1].approximate_size() == previous_approximate_size, + "wrong order of memtables?" + ); + assert!(storage.state.read().imm_memtables[0].approximate_size() > previous_approximate_size); +} + +#[test] +fn test_task3_freeze_on_capacity() { + let dir = tempdir().unwrap(); + let mut options = LsmStorageOptions::default_for_week1_test(); + options.target_sst_size = 1024; + options.num_memtable_limit = 1000; + let storage = Arc::new(LsmStorageInner::open(dir.path(), options).unwrap()); + for _ in 0..1000 { + storage.put(b"1", b"2333").unwrap(); + } + let num_imm_memtables = storage.state.read().imm_memtables.len(); + assert!(num_imm_memtables >= 1, "no memtable frozen?"); + for _ in 0..1000 { + storage.delete(b"1").unwrap(); + } + + assert!( + storage.state.read().imm_memtables.len() > num_imm_memtables, + "no more memtable frozen?" + ); +} + +#[test] +fn test_task4_storage_integration() { + let dir = tempdir().unwrap(); + let storage = Arc::new( + LsmStorageInner::open(dir.path(), LsmStorageOptions::default_for_week1_test()).unwrap(), + ); + assert_eq!(&storage.get(b"0").unwrap(), &None); + storage.put(b"1", b"233").unwrap(); + storage.put(b"2", b"2333").unwrap(); + storage.put(b"3", b"23333").unwrap(); + storage + .force_freeze_memtable(&storage.state_lock.lock()) + .unwrap(); + storage.delete(b"1").unwrap(); + storage.delete(b"2").unwrap(); + storage.put(b"3", b"2333").unwrap(); + storage.put(b"4", b"23333").unwrap(); + storage + .force_freeze_memtable(&storage.state_lock.lock()) + .unwrap(); + storage.put(b"1", b"233333").unwrap(); + storage.put(b"3", b"233333").unwrap(); + assert_eq!(storage.state.read().imm_memtables.len(), 2); + assert_eq!(&storage.get(b"1").unwrap().unwrap()[..], b"233333"); + assert_eq!(&storage.get(b"2").unwrap(), &None); + assert_eq!(&storage.get(b"3").unwrap().unwrap()[..], b"233333"); + assert_eq!(&storage.get(b"4").unwrap().unwrap()[..], 
b"23333"); +} diff --git a/mini-lsm-starter/src/tests/week1_day2.rs b/mini-lsm-starter/src/tests/week1_day2.rs new file mode 100644 index 000000000..6c1b002e1 --- /dev/null +++ b/mini-lsm-starter/src/tests/week1_day2.rs @@ -0,0 +1,313 @@ +use std::{ops::Bound, sync::Arc}; + +use bytes::Bytes; +use tempfile::tempdir; + +use crate::{ + iterators::{merge_iterator::MergeIterator, StorageIterator}, + lsm_iterator::FusedIterator, + lsm_storage::{LsmStorageInner, LsmStorageOptions}, + mem_table::MemTable, + tests::harness::check_lsm_iter_result_by_key, +}; + +use super::harness::{check_iter_result_by_key, expect_iter_error, MockIterator}; + +#[test] +fn test_task1_memtable_iter() { + use std::ops::Bound; + let memtable = MemTable::create(0); + memtable.for_testing_put_slice(b"key1", b"value1").unwrap(); + memtable.for_testing_put_slice(b"key2", b"value2").unwrap(); + memtable.for_testing_put_slice(b"key3", b"value3").unwrap(); + + { + let mut iter = memtable.for_testing_scan_slice(Bound::Unbounded, Bound::Unbounded); + assert_eq!(iter.key().for_testing_key_ref(), b"key1"); + assert_eq!(iter.value(), b"value1"); + assert!(iter.is_valid()); + iter.next().unwrap(); + assert_eq!(iter.key().for_testing_key_ref(), b"key2"); + assert_eq!(iter.value(), b"value2"); + assert!(iter.is_valid()); + iter.next().unwrap(); + assert_eq!(iter.key().for_testing_key_ref(), b"key3"); + assert_eq!(iter.value(), b"value3"); + assert!(iter.is_valid()); + iter.next().unwrap(); + assert!(!iter.is_valid()); + } + + { + let mut iter = + memtable.for_testing_scan_slice(Bound::Included(b"key1"), Bound::Included(b"key2")); + assert_eq!(iter.key().for_testing_key_ref(), b"key1"); + assert_eq!(iter.value(), b"value1"); + assert!(iter.is_valid()); + iter.next().unwrap(); + assert_eq!(iter.key().for_testing_key_ref(), b"key2"); + assert_eq!(iter.value(), b"value2"); + assert!(iter.is_valid()); + iter.next().unwrap(); + assert!(!iter.is_valid()); + } + + { + let mut iter = + 
memtable.for_testing_scan_slice(Bound::Excluded(b"key1"), Bound::Excluded(b"key3")); + assert_eq!(iter.key().for_testing_key_ref(), b"key2"); + assert_eq!(iter.value(), b"value2"); + assert!(iter.is_valid()); + iter.next().unwrap(); + assert!(!iter.is_valid()); + } +} + +#[test] +fn test_task1_empty_memtable_iter() { + use std::ops::Bound; + let memtable = MemTable::create(0); + { + let iter = + memtable.for_testing_scan_slice(Bound::Excluded(b"key1"), Bound::Excluded(b"key3")); + assert!(!iter.is_valid()); + } + { + let iter = + memtable.for_testing_scan_slice(Bound::Included(b"key1"), Bound::Included(b"key2")); + assert!(!iter.is_valid()); + } + { + let iter = memtable.for_testing_scan_slice(Bound::Unbounded, Bound::Unbounded); + assert!(!iter.is_valid()); + } +} + +#[test] +fn test_task2_merge_1() { + let i1 = MockIterator::new(vec![ + (Bytes::from("a"), Bytes::from("1.1")), + (Bytes::from("b"), Bytes::from("2.1")), + (Bytes::from("c"), Bytes::from("3.1")), + (Bytes::from("e"), Bytes::new()), + ]); + let i2 = MockIterator::new(vec![ + (Bytes::from("a"), Bytes::from("1.2")), + (Bytes::from("b"), Bytes::from("2.2")), + (Bytes::from("c"), Bytes::from("3.2")), + (Bytes::from("d"), Bytes::from("4.2")), + ]); + let i3 = MockIterator::new(vec![ + (Bytes::from("b"), Bytes::from("2.3")), + (Bytes::from("c"), Bytes::from("3.3")), + (Bytes::from("d"), Bytes::from("4.3")), + ]); + + let mut iter = MergeIterator::create(vec![ + Box::new(i1.clone()), + Box::new(i2.clone()), + Box::new(i3.clone()), + ]); + + check_iter_result_by_key( + &mut iter, + vec![ + (Bytes::from("a"), Bytes::from("1.1")), + (Bytes::from("b"), Bytes::from("2.1")), + (Bytes::from("c"), Bytes::from("3.1")), + (Bytes::from("d"), Bytes::from("4.2")), + (Bytes::from("e"), Bytes::new()), + ], + ); + + let mut iter = MergeIterator::create(vec![Box::new(i3), Box::new(i1), Box::new(i2)]); + + check_iter_result_by_key( + &mut iter, + vec![ + (Bytes::from("a"), Bytes::from("1.1")), + (Bytes::from("b"), 
Bytes::from("2.3")), + (Bytes::from("c"), Bytes::from("3.3")), + (Bytes::from("d"), Bytes::from("4.3")), + (Bytes::from("e"), Bytes::new()), + ], + ); +} + +#[test] +fn test_task2_merge_2() { + let i1 = MockIterator::new(vec![ + (Bytes::from("a"), Bytes::from("1.1")), + (Bytes::from("b"), Bytes::from("2.1")), + (Bytes::from("c"), Bytes::from("3.1")), + ]); + let i2 = MockIterator::new(vec![ + (Bytes::from("d"), Bytes::from("1.2")), + (Bytes::from("e"), Bytes::from("2.2")), + (Bytes::from("f"), Bytes::from("3.2")), + (Bytes::from("g"), Bytes::from("4.2")), + ]); + let i3 = MockIterator::new(vec![ + (Bytes::from("h"), Bytes::from("1.3")), + (Bytes::from("i"), Bytes::from("2.3")), + (Bytes::from("j"), Bytes::from("3.3")), + (Bytes::from("k"), Bytes::from("4.3")), + ]); + let i4 = MockIterator::new(vec![]); + let result = vec![ + (Bytes::from("a"), Bytes::from("1.1")), + (Bytes::from("b"), Bytes::from("2.1")), + (Bytes::from("c"), Bytes::from("3.1")), + (Bytes::from("d"), Bytes::from("1.2")), + (Bytes::from("e"), Bytes::from("2.2")), + (Bytes::from("f"), Bytes::from("3.2")), + (Bytes::from("g"), Bytes::from("4.2")), + (Bytes::from("h"), Bytes::from("1.3")), + (Bytes::from("i"), Bytes::from("2.3")), + (Bytes::from("j"), Bytes::from("3.3")), + (Bytes::from("k"), Bytes::from("4.3")), + ]; + + let mut iter = MergeIterator::create(vec![ + Box::new(i1.clone()), + Box::new(i2.clone()), + Box::new(i3.clone()), + Box::new(i4.clone()), + ]); + check_iter_result_by_key(&mut iter, result.clone()); + + let mut iter = MergeIterator::create(vec![ + Box::new(i2.clone()), + Box::new(i4.clone()), + Box::new(i3.clone()), + Box::new(i1.clone()), + ]); + check_iter_result_by_key(&mut iter, result.clone()); + + let mut iter = + MergeIterator::create(vec![Box::new(i4), Box::new(i3), Box::new(i2), Box::new(i1)]); + check_iter_result_by_key(&mut iter, result); +} + +#[test] +fn test_task2_merge_empty() { + let mut iter = MergeIterator::::create(vec![]); + check_iter_result_by_key(&mut iter, 
vec![]); + + let i1 = MockIterator::new(vec![ + (Bytes::from("a"), Bytes::from("1.1")), + (Bytes::from("b"), Bytes::from("2.1")), + (Bytes::from("c"), Bytes::from("3.1")), + ]); + let i2 = MockIterator::new(vec![]); + let mut iter = MergeIterator::::create(vec![Box::new(i1), Box::new(i2)]); + check_iter_result_by_key( + &mut iter, + vec![ + (Bytes::from("a"), Bytes::from("1.1")), + (Bytes::from("b"), Bytes::from("2.1")), + (Bytes::from("c"), Bytes::from("3.1")), + ], + ); +} + +#[test] +fn test_task2_merge_error() { + let mut iter = MergeIterator::::create(vec![]); + check_iter_result_by_key(&mut iter, vec![]); + + let i1 = MockIterator::new(vec![ + (Bytes::from("a"), Bytes::from("1.1")), + (Bytes::from("b"), Bytes::from("2.1")), + (Bytes::from("c"), Bytes::from("3.1")), + ]); + let i2 = MockIterator::new_with_error( + vec![ + (Bytes::from("a"), Bytes::from("1.1")), + (Bytes::from("b"), Bytes::from("2.1")), + (Bytes::from("c"), Bytes::from("3.1")), + ], + 1, + ); + let iter = MergeIterator::::create(vec![Box::new(i1), Box::new(i2)]); + // your implementation should correctly throw an error instead of panic + expect_iter_error(iter); +} + +#[test] +fn test_task3_fused_iterator() { + let iter = MockIterator::new(vec![]); + let mut fused_iter = FusedIterator::new(iter); + assert!(!fused_iter.is_valid()); + fused_iter.next().unwrap(); + fused_iter.next().unwrap(); + fused_iter.next().unwrap(); + assert!(!fused_iter.is_valid()); + + let iter = MockIterator::new_with_error( + vec![ + (Bytes::from("a"), Bytes::from("1.1")), + (Bytes::from("a"), Bytes::from("1.1")), + ], + 1, + ); + let mut fused_iter = FusedIterator::new(iter); + assert!(fused_iter.is_valid()); + assert!(fused_iter.next().is_err()); + assert!(!fused_iter.is_valid()); + assert!(fused_iter.next().is_err()); + assert!(fused_iter.next().is_err()); +} + +#[test] +fn test_task4_integration() { + let dir = tempdir().unwrap(); + let storage = Arc::new( + LsmStorageInner::open(dir.path(), 
LsmStorageOptions::default_for_week1_test()).unwrap(), + ); + storage.put(b"1", b"233").unwrap(); + storage.put(b"2", b"2333").unwrap(); + storage.put(b"3", b"23333").unwrap(); + storage + .force_freeze_memtable(&storage.state_lock.lock()) + .unwrap(); + storage.delete(b"1").unwrap(); + storage.delete(b"2").unwrap(); + storage.put(b"3", b"2333").unwrap(); + storage.put(b"4", b"23333").unwrap(); + storage + .force_freeze_memtable(&storage.state_lock.lock()) + .unwrap(); + storage.put(b"1", b"233333").unwrap(); + storage.put(b"3", b"233333").unwrap(); + { + let mut iter = storage.scan(Bound::Unbounded, Bound::Unbounded).unwrap(); + check_lsm_iter_result_by_key( + &mut iter, + vec![ + (Bytes::from_static(b"1"), Bytes::from_static(b"233333")), + (Bytes::from_static(b"3"), Bytes::from_static(b"233333")), + (Bytes::from_static(b"4"), Bytes::from_static(b"23333")), + ], + ); + assert!(!iter.is_valid()); + iter.next().unwrap(); + iter.next().unwrap(); + iter.next().unwrap(); + assert!(!iter.is_valid()); + } + { + let mut iter = storage + .scan(Bound::Included(b"2"), Bound::Included(b"3")) + .unwrap(); + check_lsm_iter_result_by_key( + &mut iter, + vec![(Bytes::from_static(b"3"), Bytes::from_static(b"233333"))], + ); + assert!(!iter.is_valid()); + iter.next().unwrap(); + iter.next().unwrap(); + iter.next().unwrap(); + assert!(!iter.is_valid()); + } +} diff --git a/mini-lsm-starter/src/tests/week1_day3.rs b/mini-lsm-starter/src/tests/week1_day3.rs new file mode 100644 index 000000000..91deea788 --- /dev/null +++ b/mini-lsm-starter/src/tests/week1_day3.rs @@ -0,0 +1,147 @@ +use std::sync::Arc; + +use bytes::Bytes; + +use crate::{ + block::{Block, BlockBuilder, BlockIterator}, + key::{KeySlice, KeyVec}, +}; + +#[test] +fn test_block_build_single_key() { + let mut builder = BlockBuilder::new(16); + assert!(builder.add(KeySlice::for_testing_from_slice_no_ts(b"233"), b"233333")); + builder.build(); +} + +#[test] +fn test_block_build_full() { + let mut builder = 
BlockBuilder::new(16); + assert!(builder.add(KeySlice::for_testing_from_slice_no_ts(b"11"), b"11")); + assert!(!builder.add(KeySlice::for_testing_from_slice_no_ts(b"22"), b"22")); + builder.build(); +} + +#[test] +fn test_block_build_large_1() { + let mut builder = BlockBuilder::new(16); + assert!(builder.add( + KeySlice::for_testing_from_slice_no_ts(b"11"), + &b"1".repeat(100) + )); + builder.build(); +} + +#[test] +fn test_block_build_large_2() { + let mut builder = BlockBuilder::new(16); + assert!(builder.add(KeySlice::for_testing_from_slice_no_ts(b"11"), b"1")); + assert!(!builder.add( + KeySlice::for_testing_from_slice_no_ts(b"11"), + &b"1".repeat(100) + )); +} + +fn key_of(idx: usize) -> KeyVec { + KeyVec::for_testing_from_vec_no_ts(format!("key_{:03}", idx * 5).into_bytes()) +} + +fn value_of(idx: usize) -> Vec { + format!("value_{:010}", idx).into_bytes() +} + +fn num_of_keys() -> usize { + 100 +} + +fn generate_block() -> Block { + let mut builder = BlockBuilder::new(10000); + for idx in 0..num_of_keys() { + let key = key_of(idx); + let value = value_of(idx); + assert!(builder.add(key.as_key_slice(), &value[..])); + } + builder.build() +} + +#[test] +fn test_block_build_all() { + generate_block(); +} + +#[test] +fn test_block_encode() { + let block = generate_block(); + block.encode(); +} + +#[test] +fn test_block_decode() { + let block = generate_block(); + let encoded = block.encode(); + let decoded_block = Block::decode(&encoded); + assert_eq!(block.offsets, decoded_block.offsets); + assert_eq!(block.data, decoded_block.data); +} + +fn as_bytes(x: &[u8]) -> Bytes { + Bytes::copy_from_slice(x) +} + +#[test] +fn test_block_iterator() { + let block = Arc::new(generate_block()); + let mut iter = BlockIterator::create_and_seek_to_first(block); + for _ in 0..5 { + for i in 0..num_of_keys() { + let key = iter.key(); + let value = iter.value(); + assert_eq!( + key.for_testing_key_ref(), + key_of(i).for_testing_key_ref(), + "expected key: {:?}, actual key: 
{:?}", + as_bytes(key_of(i).for_testing_key_ref()), + as_bytes(key.for_testing_key_ref()) + ); + assert_eq!( + value, + value_of(i), + "expected value: {:?}, actual value: {:?}", + as_bytes(&value_of(i)), + as_bytes(value) + ); + iter.next(); + } + iter.seek_to_first(); + } +} + +#[test] +fn test_block_seek_key() { + let block = Arc::new(generate_block()); + let mut iter = BlockIterator::create_and_seek_to_key(block, key_of(0).as_key_slice()); + for offset in 1..=5 { + for i in 0..num_of_keys() { + let key = iter.key(); + let value = iter.value(); + assert_eq!( + key.for_testing_key_ref(), + key_of(i).for_testing_key_ref(), + "expected key: {:?}, actual key: {:?}", + as_bytes(key_of(i).for_testing_key_ref()), + as_bytes(key.for_testing_key_ref()) + ); + assert_eq!( + value, + value_of(i), + "expected value: {:?}, actual value: {:?}", + as_bytes(&value_of(i)), + as_bytes(value) + ); + iter.seek_to_key(KeySlice::for_testing_from_slice_no_ts( + &format!("key_{:03}", i * 5 + offset).into_bytes(), + )); + } + iter.seek_to_key(KeySlice::for_testing_from_slice_no_ts(b"k")); + } +} diff --git a/mini-lsm-starter/src/tests/week1_day4.rs b/mini-lsm-starter/src/tests/week1_day4.rs new file mode 100644 index 000000000..690080694 --- /dev/null +++ b/mini-lsm-starter/src/tests/week1_day4.rs @@ -0,0 +1,141 @@ +use std::sync::Arc; + +use bytes::Bytes; +use tempfile::{tempdir, TempDir}; + +use crate::iterators::StorageIterator; +use crate::key::{KeySlice, KeyVec}; +use crate::table::{SsTable, SsTableBuilder, SsTableIterator}; + +#[test] +fn test_sst_build_single_key() { + let mut builder = SsTableBuilder::new(16); + builder.add(KeySlice::for_testing_from_slice_no_ts(b"233"), b"233333"); + let dir = tempdir().unwrap(); + builder.build_for_test(dir.path().join("1.sst")).unwrap(); +} + +#[test] +fn test_sst_build_two_blocks() { + let mut builder = SsTableBuilder::new(16); + builder.add(KeySlice::for_testing_from_slice_no_ts(b"11"), b"11"); + 
builder.add(KeySlice::for_testing_from_slice_no_ts(b"22"), b"22"); + builder.add(KeySlice::for_testing_from_slice_no_ts(b"33"), b"11"); + builder.add(KeySlice::for_testing_from_slice_no_ts(b"44"), b"22"); + builder.add(KeySlice::for_testing_from_slice_no_ts(b"55"), b"11"); + builder.add(KeySlice::for_testing_from_slice_no_ts(b"66"), b"22"); + assert!(builder.metas.len() >= 2); + let dir = tempdir().unwrap(); + builder.build_for_test(dir.path().join("1.sst")).unwrap(); +} + +fn key_of(idx: usize) -> KeyVec { + KeyVec::for_testing_from_vec_no_ts(format!("key_{:03}", idx * 5).into_bytes()) +} + +fn value_of(idx: usize) -> Vec { + format!("value_{:010}", idx).into_bytes() +} + +fn num_of_keys() -> usize { + 100 +} + +fn generate_sst() -> (TempDir, SsTable) { + let mut builder = SsTableBuilder::new(128); + for idx in 0..num_of_keys() { + let key = key_of(idx); + let value = value_of(idx); + builder.add(key.as_key_slice(), &value[..]); + } + let dir = tempdir().unwrap(); + let path = dir.path().join("1.sst"); + (dir, builder.build_for_test(path).unwrap()) +} + +#[test] +fn test_sst_build_all() { + generate_sst(); +} + +#[test] +fn test_sst_decode() { + let (_dir, sst) = generate_sst(); + let meta = sst.block_metas.clone(); + let new_sst = SsTable::open_for_test(sst.file).unwrap(); + assert_eq!(new_sst.block_metas, meta); + assert_eq!( + new_sst.first_key().for_testing_key_ref(), + key_of(0).for_testing_key_ref() + ); + assert_eq!( + new_sst.last_key().for_testing_key_ref(), + key_of(num_of_keys() - 1).for_testing_key_ref() + ); +} + +fn as_bytes(x: &[u8]) -> Bytes { + Bytes::copy_from_slice(x) +} + +#[test] +fn test_sst_iterator() { + let (_dir, sst) = generate_sst(); + let sst = Arc::new(sst); + let mut iter = SsTableIterator::create_and_seek_to_first(sst).unwrap(); + for _ in 0..5 { + for i in 0..num_of_keys() { + let key = iter.key(); + let value = iter.value(); + assert_eq!( + key.for_testing_key_ref(), + key_of(i).for_testing_key_ref(), + "expected key: {:?}, actual 
key: {:?}", + as_bytes(key_of(i).for_testing_key_ref()), + as_bytes(key.for_testing_key_ref()) + ); + assert_eq!( + value, + value_of(i), + "expected value: {:?}, actual value: {:?}", + as_bytes(&value_of(i)), + as_bytes(value) + ); + iter.next().unwrap(); + } + iter.seek_to_first().unwrap(); + } +} + +#[test] +fn test_sst_seek_key() { + let (_dir, sst) = generate_sst(); + let sst = Arc::new(sst); + let mut iter = SsTableIterator::create_and_seek_to_key(sst, key_of(0).as_key_slice()).unwrap(); + for offset in 1..=5 { + for i in 0..num_of_keys() { + let key = iter.key(); + let value = iter.value(); + assert_eq!( + key.for_testing_key_ref(), + key_of(i).for_testing_key_ref(), + "expected key: {:?}, actual key: {:?}", + as_bytes(key_of(i).for_testing_key_ref()), + as_bytes(key.for_testing_key_ref()) + ); + assert_eq!( + value, + value_of(i), + "expected value: {:?}, actual value: {:?}", + as_bytes(&value_of(i)), + as_bytes(value) + ); + iter.seek_to_key(KeySlice::for_testing_from_slice_no_ts( + &format!("key_{:03}", i * 5 + offset).into_bytes(), + )) + .unwrap(); + } + iter.seek_to_key(KeySlice::for_testing_from_slice_no_ts(b"k")) + .unwrap(); + } +} diff --git a/mini-lsm-starter/src/tests/week1_day5.rs b/mini-lsm-starter/src/tests/week1_day5.rs new file mode 100644 index 000000000..1bf2fc5a6 --- /dev/null +++ b/mini-lsm-starter/src/tests/week1_day5.rs @@ -0,0 +1,254 @@ +use std::ops::Bound; +use std::sync::Arc; + +use self::harness::{check_iter_result_by_key, MockIterator}; +use self::harness::{check_lsm_iter_result_by_key, generate_sst}; +use bytes::Bytes; +use tempfile::tempdir; + +use super::*; +use crate::{ + iterators::two_merge_iterator::TwoMergeIterator, + lsm_storage::{LsmStorageInner, LsmStorageOptions}, +}; + +#[test] +fn test_task1_merge_1() { + let i1 = MockIterator::new(vec![ + (Bytes::from("a"), Bytes::from("1.1")), + (Bytes::from("b"), Bytes::from("2.1")), + (Bytes::from("c"), Bytes::from("3.1")), + ]); + let i2 = MockIterator::new(vec![ + 
(Bytes::from("a"), Bytes::from("1.2")), + (Bytes::from("b"), Bytes::from("2.2")), + (Bytes::from("c"), Bytes::from("3.2")), + (Bytes::from("d"), Bytes::from("4.2")), + ]); + let mut iter = TwoMergeIterator::create(i1, i2).unwrap(); + check_iter_result_by_key( + &mut iter, + vec![ + (Bytes::from("a"), Bytes::from("1.1")), + (Bytes::from("b"), Bytes::from("2.1")), + (Bytes::from("c"), Bytes::from("3.1")), + (Bytes::from("d"), Bytes::from("4.2")), + ], + ) +} + +#[test] +fn test_task1_merge_2() { + let i2 = MockIterator::new(vec![ + (Bytes::from("a"), Bytes::from("1.1")), + (Bytes::from("b"), Bytes::from("2.1")), + (Bytes::from("c"), Bytes::from("3.1")), + ]); + let i1 = MockIterator::new(vec![ + (Bytes::from("a"), Bytes::from("1.2")), + (Bytes::from("b"), Bytes::from("2.2")), + (Bytes::from("c"), Bytes::from("3.2")), + (Bytes::from("d"), Bytes::from("4.2")), + ]); + let mut iter = TwoMergeIterator::create(i1, i2).unwrap(); + check_iter_result_by_key( + &mut iter, + vec![ + (Bytes::from("a"), Bytes::from("1.2")), + (Bytes::from("b"), Bytes::from("2.2")), + (Bytes::from("c"), Bytes::from("3.2")), + (Bytes::from("d"), Bytes::from("4.2")), + ], + ) +} + +#[test] +fn test_task1_merge_3() { + let i2 = MockIterator::new(vec![ + (Bytes::from("a"), Bytes::from("1.1")), + (Bytes::from("b"), Bytes::from("2.1")), + (Bytes::from("c"), Bytes::from("3.1")), + ]); + let i1 = MockIterator::new(vec![ + (Bytes::from("b"), Bytes::from("2.2")), + (Bytes::from("c"), Bytes::from("3.2")), + (Bytes::from("d"), Bytes::from("4.2")), + ]); + let mut iter = TwoMergeIterator::create(i1, i2).unwrap(); + check_iter_result_by_key( + &mut iter, + vec![ + (Bytes::from("a"), Bytes::from("1.1")), + (Bytes::from("b"), Bytes::from("2.2")), + (Bytes::from("c"), Bytes::from("3.2")), + (Bytes::from("d"), Bytes::from("4.2")), + ], + ) +} + +#[test] +fn test_task1_merge_4() { + let i2 = MockIterator::new(vec![]); + let i1 = MockIterator::new(vec![ + (Bytes::from("b"), Bytes::from("2.2")), + (Bytes::from("c"), 
Bytes::from("3.2")), + (Bytes::from("d"), Bytes::from("4.2")), + ]); + let mut iter = TwoMergeIterator::create(i1, i2).unwrap(); + check_iter_result_by_key( + &mut iter, + vec![ + (Bytes::from("b"), Bytes::from("2.2")), + (Bytes::from("c"), Bytes::from("3.2")), + (Bytes::from("d"), Bytes::from("4.2")), + ], + ); + let i1 = MockIterator::new(vec![]); + let i2 = MockIterator::new(vec![ + (Bytes::from("b"), Bytes::from("2.2")), + (Bytes::from("c"), Bytes::from("3.2")), + (Bytes::from("d"), Bytes::from("4.2")), + ]); + let mut iter = TwoMergeIterator::create(i1, i2).unwrap(); + check_iter_result_by_key( + &mut iter, + vec![ + (Bytes::from("b"), Bytes::from("2.2")), + (Bytes::from("c"), Bytes::from("3.2")), + (Bytes::from("d"), Bytes::from("4.2")), + ], + ); +} + +#[test] +fn test_task1_merge_5() { + let i2 = MockIterator::new(vec![]); + let i1 = MockIterator::new(vec![]); + let mut iter = TwoMergeIterator::create(i1, i2).unwrap(); + check_iter_result_by_key(&mut iter, vec![]) +} + +#[test] +fn test_task2_storage_scan() { + let dir = tempdir().unwrap(); + let storage = + Arc::new(LsmStorageInner::open(&dir, LsmStorageOptions::default_for_week1_test()).unwrap()); + storage.put(b"1", b"233").unwrap(); + storage.put(b"2", b"2333").unwrap(); + storage.put(b"00", b"2333").unwrap(); + storage + .force_freeze_memtable(&storage.state_lock.lock()) + .unwrap(); + storage.put(b"3", b"23333").unwrap(); + storage.delete(b"1").unwrap(); + let sst1 = generate_sst( + 10, + dir.path().join("10.sst"), + vec![ + (Bytes::from_static(b"0"), Bytes::from_static(b"2333333")), + (Bytes::from_static(b"00"), Bytes::from_static(b"2333333")), + (Bytes::from_static(b"4"), Bytes::from_static(b"23")), + ], + Some(storage.block_cache.clone()), + ); + let sst2 = generate_sst( + 11, + dir.path().join("11.sst"), + vec![(Bytes::from_static(b"4"), Bytes::from_static(b""))], + Some(storage.block_cache.clone()), + ); + { + let mut state = storage.state.write(); + let mut snapshot = state.as_ref().clone(); + 
snapshot.l0_sstables.push(sst2.sst_id()); // this is the latest SST + snapshot.l0_sstables.push(sst1.sst_id()); + snapshot.sstables.insert(sst2.sst_id(), sst2.into()); + snapshot.sstables.insert(sst1.sst_id(), sst1.into()); + *state = snapshot.into(); + } + + println!("dump structure"); + storage.dump_structure(); + + check_lsm_iter_result_by_key( + &mut storage.scan(Bound::Unbounded, Bound::Unbounded).unwrap(), + vec![ + (Bytes::from("0"), Bytes::from("2333333")), + (Bytes::from("00"), Bytes::from("2333")), + (Bytes::from("2"), Bytes::from("2333")), + (Bytes::from("3"), Bytes::from("23333")), + ], + ); + check_lsm_iter_result_by_key( + &mut storage + .scan(Bound::Included(b"1"), Bound::Included(b"2")) + .unwrap(), + vec![(Bytes::from("2"), Bytes::from("2333"))], + ); + check_lsm_iter_result_by_key( + &mut storage + .scan(Bound::Excluded(b"1"), Bound::Excluded(b"3")) + .unwrap(), + vec![(Bytes::from("2"), Bytes::from("2333"))], + ); +} + +#[test] +fn test_task3_storage_get() { + let dir = tempdir().unwrap(); + let storage = + Arc::new(LsmStorageInner::open(&dir, LsmStorageOptions::default_for_week1_test()).unwrap()); + storage.put(b"1", b"233").unwrap(); + storage.put(b"2", b"2333").unwrap(); + storage.put(b"00", b"2333").unwrap(); + storage + .force_freeze_memtable(&storage.state_lock.lock()) + .unwrap(); + storage.put(b"3", b"23333").unwrap(); + storage.delete(b"1").unwrap(); + let sst1 = generate_sst( + 10, + dir.path().join("10.sst"), + vec![ + (Bytes::from_static(b"0"), Bytes::from_static(b"2333333")), + (Bytes::from_static(b"00"), Bytes::from_static(b"2333333")), + (Bytes::from_static(b"4"), Bytes::from_static(b"23")), + ], + Some(storage.block_cache.clone()), + ); + let sst2 = generate_sst( + 11, + dir.path().join("11.sst"), + vec![(Bytes::from_static(b"4"), Bytes::from_static(b""))], + Some(storage.block_cache.clone()), + ); + { + let mut state = storage.state.write(); + let mut snapshot = state.as_ref().clone(); + snapshot.l0_sstables.push(sst2.sst_id()); 
// this is the latest SST + snapshot.l0_sstables.push(sst1.sst_id()); + snapshot.sstables.insert(sst2.sst_id(), sst2.into()); + snapshot.sstables.insert(sst1.sst_id(), sst1.into()); + *state = snapshot.into(); + } + + assert_eq!( + storage.get(b"0").unwrap(), + Some(Bytes::from_static(b"2333333")) + ); + assert_eq!( + storage.get(b"00").unwrap(), + Some(Bytes::from_static(b"2333")) + ); + assert_eq!( + storage.get(b"2").unwrap(), + Some(Bytes::from_static(b"2333")) + ); + assert_eq!( + storage.get(b"3").unwrap(), + Some(Bytes::from_static(b"23333")) + ); + assert_eq!(storage.get(b"4").unwrap(), None); + assert_eq!(storage.get(b"--").unwrap(), None); + assert_eq!(storage.get(b"555").unwrap(), None); +} diff --git a/mini-lsm-starter/src/tests/week1_day6.rs b/mini-lsm-starter/src/tests/week1_day6.rs new file mode 100644 index 000000000..12bda859f --- /dev/null +++ b/mini-lsm-starter/src/tests/week1_day6.rs @@ -0,0 +1,200 @@ +use std::{ops::Bound, sync::Arc, time::Duration}; + +use bytes::Bytes; +use tempfile::tempdir; + +use self::harness::{check_lsm_iter_result_by_key, sync}; + +use super::*; +use crate::{ + iterators::StorageIterator, + lsm_storage::{LsmStorageInner, LsmStorageOptions, MiniLsm}, +}; + +#[test] +fn test_task1_storage_scan() { + let dir = tempdir().unwrap(); + let storage = + Arc::new(LsmStorageInner::open(&dir, LsmStorageOptions::default_for_week1_test()).unwrap()); + storage.put(b"0", b"2333333").unwrap(); + storage.put(b"00", b"2333333").unwrap(); + storage.put(b"4", b"23").unwrap(); + sync(&storage); + + storage.delete(b"4").unwrap(); + sync(&storage); + + storage.put(b"1", b"233").unwrap(); + storage.put(b"2", b"2333").unwrap(); + storage + .force_freeze_memtable(&storage.state_lock.lock()) + .unwrap(); + storage.put(b"00", b"2333").unwrap(); + storage + .force_freeze_memtable(&storage.state_lock.lock()) + .unwrap(); + storage.put(b"3", b"23333").unwrap(); + storage.delete(b"1").unwrap(); + + { + let state = storage.state.read(); + 
assert_eq!(state.l0_sstables.len(), 2); + assert_eq!(state.imm_memtables.len(), 2); + } + + storage.dump_structure(); + check_lsm_iter_result_by_key( + &mut storage.scan(Bound::Unbounded, Bound::Unbounded).unwrap(), + vec![ + (Bytes::from("0"), Bytes::from("2333333")), + (Bytes::from("00"), Bytes::from("2333")), + (Bytes::from("2"), Bytes::from("2333")), + (Bytes::from("3"), Bytes::from("23333")), + ], + ); + check_lsm_iter_result_by_key( + &mut storage + .scan(Bound::Included(b"1"), Bound::Included(b"2")) + .unwrap(), + vec![(Bytes::from("2"), Bytes::from("2333"))], + ); + check_lsm_iter_result_by_key( + &mut storage + .scan(Bound::Excluded(b"1"), Bound::Excluded(b"3")) + .unwrap(), + vec![(Bytes::from("2"), Bytes::from("2333"))], + ); +} + +#[test] +fn test_task1_storage_get() { + let dir = tempdir().unwrap(); + let storage = + Arc::new(LsmStorageInner::open(&dir, LsmStorageOptions::default_for_week1_test()).unwrap()); + storage.put(b"0", b"2333333").unwrap(); + storage.put(b"00", b"2333333").unwrap(); + storage.put(b"4", b"23").unwrap(); + sync(&storage); + + storage.delete(b"4").unwrap(); + sync(&storage); + + storage.put(b"1", b"233").unwrap(); + storage.put(b"2", b"2333").unwrap(); + storage + .force_freeze_memtable(&storage.state_lock.lock()) + .unwrap(); + storage.put(b"00", b"2333").unwrap(); + storage + .force_freeze_memtable(&storage.state_lock.lock()) + .unwrap(); + storage.put(b"3", b"23333").unwrap(); + storage.delete(b"1").unwrap(); + + storage.dump_structure(); + + { + let state = storage.state.read(); + assert_eq!(state.l0_sstables.len(), 2); + assert_eq!(state.imm_memtables.len(), 2); + } + + assert_eq!( + storage.get(b"0").unwrap(), + Some(Bytes::from_static(b"2333333")) + ); + assert_eq!( + storage.get(b"00").unwrap(), + Some(Bytes::from_static(b"2333")) + ); + assert_eq!( + storage.get(b"2").unwrap(), + Some(Bytes::from_static(b"2333")) + ); + assert_eq!( + storage.get(b"3").unwrap(), + Some(Bytes::from_static(b"23333")) + ); + 
assert_eq!(storage.get(b"4").unwrap(), None); + assert_eq!(storage.get(b"--").unwrap(), None); + assert_eq!(storage.get(b"555").unwrap(), None); +} + +#[test] +fn test_task2_auto_flush() { + let dir = tempdir().unwrap(); + let storage = MiniLsm::open(&dir, LsmStorageOptions::default_for_week1_day6_test()).unwrap(); + + let value = "1".repeat(1024); // 1KB + + // approximately 6MB + for i in 0..6000 { + storage + .put(format!("{i}").as_bytes(), value.as_bytes()) + .unwrap(); + } + + std::thread::sleep(Duration::from_millis(500)); + + assert!(!storage.inner.state.read().l0_sstables.is_empty()); +} + +#[test] +fn test_task3_sst_filter() { + let dir = tempdir().unwrap(); + let storage = + Arc::new(LsmStorageInner::open(&dir, LsmStorageOptions::default_for_week1_test()).unwrap()); + + for i in 1..=10000 { + if i % 1000 == 0 { + sync(&storage); + } + storage + .put(format!("{:05}", i).as_bytes(), b"2333333") + .unwrap(); + } + + let iter = storage.scan(Bound::Unbounded, Bound::Unbounded).unwrap(); + assert!( + iter.num_active_iterators() >= 10, + "did you implement num_active_iterators? 
current active iterators = {}", + iter.num_active_iterators() + ); + let max_num = iter.num_active_iterators(); + let iter = storage + .scan( + Bound::Excluded(format!("{:05}", 10000).as_bytes()), + Bound::Unbounded, + ) + .unwrap(); + assert!(iter.num_active_iterators() < max_num); + let min_num = iter.num_active_iterators(); + let iter = storage + .scan( + Bound::Unbounded, + Bound::Excluded(format!("{:05}", 1).as_bytes()), + ) + .unwrap(); + assert_eq!(iter.num_active_iterators(), min_num); + let iter = storage + .scan( + Bound::Unbounded, + Bound::Included(format!("{:05}", 0).as_bytes()), + ) + .unwrap(); + assert_eq!(iter.num_active_iterators(), min_num); + let iter = storage + .scan( + Bound::Included(format!("{:05}", 10001).as_bytes()), + Bound::Unbounded, + ) + .unwrap(); + assert_eq!(iter.num_active_iterators(), min_num); + let iter = storage + .scan( + Bound::Included(format!("{:05}", 5000).as_bytes()), + Bound::Excluded(format!("{:05}", 6000).as_bytes()), + ) + .unwrap(); + assert!(min_num < iter.num_active_iterators() && iter.num_active_iterators() < max_num); +} diff --git a/mini-lsm-starter/src/tests/week1_day7.rs b/mini-lsm-starter/src/tests/week1_day7.rs new file mode 100644 index 000000000..ce51a66d8 --- /dev/null +++ b/mini-lsm-starter/src/tests/week1_day7.rs @@ -0,0 +1,91 @@ +use tempfile::tempdir; + +use crate::{ + key::{KeySlice, TS_ENABLED}, + table::{bloom::Bloom, FileObject, SsTable, SsTableBuilder}, +}; + +fn key_of(idx: usize) -> Vec { + format!("key_{:010}", idx * 5).into_bytes() +} + +fn value_of(idx: usize) -> Vec { + format!("value_{:010}", idx).into_bytes() +} + +fn num_of_keys() -> usize { + 100 +} + +#[test] +fn test_task1_bloom_filter() { + let mut key_hashes = Vec::new(); + for idx in 0..num_of_keys() { + let key = key_of(idx); + key_hashes.push(farmhash::fingerprint32(&key)); + } + let bits_per_key = Bloom::bloom_bits_per_key(key_hashes.len(), 0.01); + println!("bits per key: {}", bits_per_key); + let bloom = 
Bloom::build_from_key_hashes(&key_hashes, bits_per_key); + println!("bloom size: {}, k={}", bloom.filter.len(), bloom.k); + assert!(bloom.k < 30); + for idx in 0..num_of_keys() { + let key = key_of(idx); + assert!(bloom.may_contain(farmhash::fingerprint32(&key))); + } + let mut x = 0; + let mut cnt = 0; + for idx in num_of_keys()..(num_of_keys() * 10) { + let key = key_of(idx); + if bloom.may_contain(farmhash::fingerprint32(&key)) { + x += 1; + } + cnt += 1; + } + assert_ne!(x, cnt, "bloom filter not taking effect?"); + assert_ne!(x, 0, "bloom filter not taking effect?"); +} + +#[test] +fn test_task2_sst_decode() { + let mut builder = SsTableBuilder::new(128); + for idx in 0..num_of_keys() { + let key = key_of(idx); + let value = value_of(idx); + builder.add(KeySlice::for_testing_from_slice_no_ts(&key[..]), &value[..]); + } + let dir = tempdir().unwrap(); + let path = dir.path().join("1.sst"); + let sst = builder.build_for_test(&path).unwrap(); + let sst2 = SsTable::open(0, None, FileObject::open(&path).unwrap()).unwrap(); + let bloom_1 = sst.bloom.as_ref().unwrap(); + let bloom_2 = sst2.bloom.as_ref().unwrap(); + assert_eq!(bloom_1.k, bloom_2.k); + assert_eq!(bloom_1.filter, bloom_2.filter); +} + +#[test] +fn test_task3_block_key_compression() { + let mut builder = SsTableBuilder::new(128); + for idx in 0..num_of_keys() { + let key = key_of(idx); + let value = value_of(idx); + builder.add(KeySlice::for_testing_from_slice_no_ts(&key[..]), &value[..]); + } + let dir = tempdir().unwrap(); + let path = dir.path().join("1.sst"); + let sst = builder.build_for_test(path).unwrap(); + if TS_ENABLED { + assert!( + sst.block_metas.len() <= 34, + "you have {} blocks, expect 34", + sst.block_metas.len() + ); + } else { + assert!( + sst.block_metas.len() <= 25, + "you have {} blocks, expect 25", + sst.block_metas.len() + ); + } +} diff --git a/mini-lsm-starter/src/tests/week2_day1.rs b/mini-lsm-starter/src/tests/week2_day1.rs new file mode 100644 index 000000000..b681c1b41 --- 
/dev/null +++ b/mini-lsm-starter/src/tests/week2_day1.rs @@ -0,0 +1,252 @@ +use std::{ops::Bound, path::Path, sync::Arc}; + +use self::harness::{check_iter_result_by_key, check_lsm_iter_result_by_key, sync}; +use bytes::Bytes; +use tempfile::tempdir; +use week2_day1::harness::construct_merge_iterator_over_storage; + +use super::*; +use crate::{ + iterators::{concat_iterator::SstConcatIterator, StorageIterator}, + key::{KeySlice, TS_ENABLED}, + lsm_storage::{LsmStorageInner, LsmStorageOptions}, + table::{SsTable, SsTableBuilder}, +}; + +#[test] +fn test_task1_full_compaction() { + // We do not use LSM iterator in this test because it's implemented as part of task 3concat.rs + let dir = tempdir().unwrap(); + let storage = + Arc::new(LsmStorageInner::open(&dir, LsmStorageOptions::default_for_week1_test()).unwrap()); + #[allow(clippy::let_unit_value)] + let _txn = storage.new_txn().unwrap(); + storage.put(b"0", b"v1").unwrap(); + sync(&storage); + storage.put(b"0", b"v2").unwrap(); + storage.put(b"1", b"v2").unwrap(); + storage.put(b"2", b"v2").unwrap(); + sync(&storage); + storage.delete(b"0").unwrap(); + storage.delete(b"2").unwrap(); + sync(&storage); + assert_eq!(storage.state.read().l0_sstables.len(), 3); + let mut iter = construct_merge_iterator_over_storage(&storage.state.read()); + if TS_ENABLED { + check_iter_result_by_key( + &mut iter, + vec![ + (Bytes::from_static(b"0"), Bytes::from_static(b"")), + (Bytes::from_static(b"0"), Bytes::from_static(b"v2")), + (Bytes::from_static(b"0"), Bytes::from_static(b"v1")), + (Bytes::from_static(b"1"), Bytes::from_static(b"v2")), + (Bytes::from_static(b"2"), Bytes::from_static(b"")), + (Bytes::from_static(b"2"), Bytes::from_static(b"v2")), + ], + ); + } else { + check_iter_result_by_key( + &mut iter, + vec![ + (Bytes::from_static(b"0"), Bytes::from_static(b"")), + (Bytes::from_static(b"1"), Bytes::from_static(b"v2")), + (Bytes::from_static(b"2"), Bytes::from_static(b"")), + ], + ); + } + 
storage.force_full_compaction().unwrap(); + storage.dump_structure(); + assert!(storage.state.read().l0_sstables.is_empty()); + let mut iter = construct_merge_iterator_over_storage(&storage.state.read()); + if TS_ENABLED { + check_iter_result_by_key( + &mut iter, + vec![ + (Bytes::from_static(b"0"), Bytes::from_static(b"")), + (Bytes::from_static(b"0"), Bytes::from_static(b"v2")), + (Bytes::from_static(b"0"), Bytes::from_static(b"v1")), + (Bytes::from_static(b"1"), Bytes::from_static(b"v2")), + (Bytes::from_static(b"2"), Bytes::from_static(b"")), + (Bytes::from_static(b"2"), Bytes::from_static(b"v2")), + ], + ); + } else { + check_iter_result_by_key( + &mut iter, + vec![(Bytes::from_static(b"1"), Bytes::from_static(b"v2"))], + ); + } + storage.put(b"0", b"v3").unwrap(); + storage.put(b"2", b"v3").unwrap(); + sync(&storage); + storage.delete(b"1").unwrap(); + sync(&storage); + let mut iter = construct_merge_iterator_over_storage(&storage.state.read()); + if TS_ENABLED { + check_iter_result_by_key( + &mut iter, + vec![ + (Bytes::from_static(b"0"), Bytes::from_static(b"v3")), + (Bytes::from_static(b"0"), Bytes::from_static(b"")), + (Bytes::from_static(b"0"), Bytes::from_static(b"v2")), + (Bytes::from_static(b"0"), Bytes::from_static(b"v1")), + (Bytes::from_static(b"1"), Bytes::from_static(b"")), + (Bytes::from_static(b"1"), Bytes::from_static(b"v2")), + (Bytes::from_static(b"2"), Bytes::from_static(b"v3")), + (Bytes::from_static(b"2"), Bytes::from_static(b"")), + (Bytes::from_static(b"2"), Bytes::from_static(b"v2")), + ], + ); + } else { + check_iter_result_by_key( + &mut iter, + vec![ + (Bytes::from_static(b"0"), Bytes::from_static(b"v3")), + (Bytes::from_static(b"1"), Bytes::from_static(b"")), + (Bytes::from_static(b"2"), Bytes::from_static(b"v3")), + ], + ); + } + storage.force_full_compaction().unwrap(); + assert!(storage.state.read().l0_sstables.is_empty()); + let mut iter = construct_merge_iterator_over_storage(&storage.state.read()); + if TS_ENABLED { + 
check_iter_result_by_key( + &mut iter, + vec![ + (Bytes::from_static(b"0"), Bytes::from_static(b"v3")), + (Bytes::from_static(b"0"), Bytes::from_static(b"")), + (Bytes::from_static(b"0"), Bytes::from_static(b"v2")), + (Bytes::from_static(b"0"), Bytes::from_static(b"v1")), + (Bytes::from_static(b"1"), Bytes::from_static(b"")), + (Bytes::from_static(b"1"), Bytes::from_static(b"v2")), + (Bytes::from_static(b"2"), Bytes::from_static(b"v3")), + (Bytes::from_static(b"2"), Bytes::from_static(b"")), + (Bytes::from_static(b"2"), Bytes::from_static(b"v2")), + ], + ); + } else { + check_iter_result_by_key( + &mut iter, + vec![ + (Bytes::from_static(b"0"), Bytes::from_static(b"v3")), + (Bytes::from_static(b"2"), Bytes::from_static(b"v3")), + ], + ); + } +} + +fn generate_concat_sst( + start_key: usize, + end_key: usize, + dir: impl AsRef, + id: usize, +) -> SsTable { + let mut builder = SsTableBuilder::new(128); + for idx in start_key..end_key { + let key = format!("{:05}", idx); + builder.add( + KeySlice::for_testing_from_slice_no_ts(key.as_bytes()), + b"test", + ); + } + let path = dir.as_ref().join(format!("{id}.sst")); + builder.build_for_test(path).unwrap() +} + +#[test] +fn test_task2_concat_iterator() { + let dir = tempdir().unwrap(); + let mut sstables = Vec::new(); + for i in 1..=10 { + sstables.push(Arc::new(generate_concat_sst( + i * 10, + (i + 1) * 10, + dir.path(), + i, + ))); + } + for key in 0..120 { + let iter = SstConcatIterator::create_and_seek_to_key( + sstables.clone(), + KeySlice::for_testing_from_slice_no_ts(format!("{:05}", key).as_bytes()), + ) + .unwrap(); + if key < 10 { + assert!(iter.is_valid()); + assert_eq!(iter.key().for_testing_key_ref(), b"00010"); + } else if key >= 110 { + assert!(!iter.is_valid()); + } else { + assert!(iter.is_valid()); + assert_eq!( + iter.key().for_testing_key_ref(), + format!("{:05}", key).as_bytes() + ); + } + } + let iter = SstConcatIterator::create_and_seek_to_first(sstables.clone()).unwrap(); + 
assert!(iter.is_valid()); + assert_eq!(iter.key().for_testing_key_ref(), b"00010"); +} + +#[test] +fn test_task3_integration() { + let dir = tempdir().unwrap(); + let storage = + Arc::new(LsmStorageInner::open(&dir, LsmStorageOptions::default_for_week1_test()).unwrap()); + storage.put(b"0", b"2333333").unwrap(); + storage.put(b"00", b"2333333").unwrap(); + storage.put(b"4", b"23").unwrap(); + sync(&storage); + + storage.delete(b"4").unwrap(); + sync(&storage); + + storage.force_full_compaction().unwrap(); + assert!(storage.state.read().l0_sstables.is_empty()); + assert!(!storage.state.read().levels[0].1.is_empty()); + + storage.put(b"1", b"233").unwrap(); + storage.put(b"2", b"2333").unwrap(); + sync(&storage); + + storage.put(b"00", b"2333").unwrap(); + storage.put(b"3", b"23333").unwrap(); + storage.delete(b"1").unwrap(); + sync(&storage); + storage.force_full_compaction().unwrap(); + + assert!(storage.state.read().l0_sstables.is_empty()); + assert!(!storage.state.read().levels[0].1.is_empty()); + + check_lsm_iter_result_by_key( + &mut storage.scan(Bound::Unbounded, Bound::Unbounded).unwrap(), + vec![ + (Bytes::from("0"), Bytes::from("2333333")), + (Bytes::from("00"), Bytes::from("2333")), + (Bytes::from("2"), Bytes::from("2333")), + (Bytes::from("3"), Bytes::from("23333")), + ], + ); + + assert_eq!( + storage.get(b"0").unwrap(), + Some(Bytes::from_static(b"2333333")) + ); + assert_eq!( + storage.get(b"00").unwrap(), + Some(Bytes::from_static(b"2333")) + ); + assert_eq!( + storage.get(b"2").unwrap(), + Some(Bytes::from_static(b"2333")) + ); + assert_eq!( + storage.get(b"3").unwrap(), + Some(Bytes::from_static(b"23333")) + ); + assert_eq!(storage.get(b"4").unwrap(), None); + assert_eq!(storage.get(b"--").unwrap(), None); + assert_eq!(storage.get(b"555").unwrap(), None); +} diff --git a/mini-lsm-starter/src/tests/week2_day2.rs b/mini-lsm-starter/src/tests/week2_day2.rs new file mode 100644 index 000000000..5064afc5e --- /dev/null +++ 
b/mini-lsm-starter/src/tests/week2_day2.rs @@ -0,0 +1,27 @@ +use tempfile::tempdir; + +use crate::{ + compact::{CompactionOptions, SimpleLeveledCompactionOptions}, + lsm_storage::{LsmStorageOptions, MiniLsm}, +}; + +use super::harness::{check_compaction_ratio, compaction_bench}; + +#[test] +fn test_integration() { + let dir = tempdir().unwrap(); + let storage = MiniLsm::open( + &dir, + LsmStorageOptions::default_for_week2_test(CompactionOptions::Simple( + SimpleLeveledCompactionOptions { + level0_file_num_compaction_trigger: 2, + max_levels: 3, + size_ratio_percent: 200, + }, + )), + ) + .unwrap(); + + compaction_bench(storage.clone()); + check_compaction_ratio(storage.clone()); +} diff --git a/mini-lsm-starter/src/tests/week2_day3.rs b/mini-lsm-starter/src/tests/week2_day3.rs new file mode 100644 index 000000000..6e124db0b --- /dev/null +++ b/mini-lsm-starter/src/tests/week2_day3.rs @@ -0,0 +1,28 @@ +use tempfile::tempdir; + +use crate::{ + compact::{CompactionOptions, TieredCompactionOptions}, + lsm_storage::{LsmStorageOptions, MiniLsm}, +}; + +use super::harness::{check_compaction_ratio, compaction_bench}; + +#[test] +fn test_integration() { + let dir = tempdir().unwrap(); + let storage = MiniLsm::open( + &dir, + LsmStorageOptions::default_for_week2_test(CompactionOptions::Tiered( + TieredCompactionOptions { + num_tiers: 3, + max_size_amplification_percent: 200, + size_ratio: 1, + min_merge_width: 2, + }, + )), + ) + .unwrap(); + + compaction_bench(storage.clone()); + check_compaction_ratio(storage.clone()); +} diff --git a/mini-lsm-starter/src/tests/week2_day4.rs b/mini-lsm-starter/src/tests/week2_day4.rs new file mode 100644 index 000000000..a2fd9007e --- /dev/null +++ b/mini-lsm-starter/src/tests/week2_day4.rs @@ -0,0 +1,28 @@ +use tempfile::tempdir; + +use crate::{ + compact::{CompactionOptions, LeveledCompactionOptions}, + lsm_storage::{LsmStorageOptions, MiniLsm}, +}; + +use super::harness::{check_compaction_ratio, compaction_bench}; + +#[test] +fn 
test_integration() { + let dir = tempdir().unwrap(); + let storage = MiniLsm::open( + &dir, + LsmStorageOptions::default_for_week2_test(CompactionOptions::Leveled( + LeveledCompactionOptions { + level0_file_num_compaction_trigger: 2, + level_size_multiplier: 2, + base_level_size_mb: 1, + max_levels: 4, + }, + )), + ) + .unwrap(); + + compaction_bench(storage.clone()); + check_compaction_ratio(storage.clone()); +} diff --git a/mini-lsm-starter/src/tests/week2_day5.rs b/mini-lsm-starter/src/tests/week2_day5.rs new file mode 100644 index 000000000..9ceedb33f --- /dev/null +++ b/mini-lsm-starter/src/tests/week2_day5.rs @@ -0,0 +1,81 @@ +use tempfile::tempdir; + +use crate::{ + compact::{ + CompactionOptions, LeveledCompactionOptions, SimpleLeveledCompactionOptions, + TieredCompactionOptions, + }, + lsm_storage::{LsmStorageOptions, MiniLsm}, + tests::harness::dump_files_in_dir, +}; + +#[test] +fn test_integration_leveled() { + test_integration(CompactionOptions::Leveled(LeveledCompactionOptions { + level_size_multiplier: 2, + level0_file_num_compaction_trigger: 2, + max_levels: 3, + base_level_size_mb: 1, + })) +} + +#[test] +fn test_integration_tiered() { + test_integration(CompactionOptions::Tiered(TieredCompactionOptions { + num_tiers: 3, + max_size_amplification_percent: 200, + size_ratio: 1, + min_merge_width: 3, + })) +} + +#[test] +fn test_integration_simple() { + test_integration(CompactionOptions::Simple(SimpleLeveledCompactionOptions { + size_ratio_percent: 200, + level0_file_num_compaction_trigger: 2, + max_levels: 3, + })); +} + +fn test_integration(compaction_options: CompactionOptions) { + let dir = tempdir().unwrap(); + let storage = MiniLsm::open( + &dir, + LsmStorageOptions::default_for_week2_test(compaction_options.clone()), + ) + .unwrap(); + for i in 0..=20 { + storage.put(b"0", format!("v{}", i).as_bytes()).unwrap(); + if i % 2 == 0 { + storage.put(b"1", format!("v{}", i).as_bytes()).unwrap(); + } else { + storage.delete(b"1").unwrap(); + } + if i % 
2 == 1 { + storage.put(b"2", format!("v{}", i).as_bytes()).unwrap(); + } else { + storage.delete(b"2").unwrap(); + } + storage + .inner + .force_freeze_memtable(&storage.inner.state_lock.lock()) + .unwrap(); + } + storage.close().unwrap(); + // ensure all SSTs are flushed + assert!(storage.inner.state.read().memtable.is_empty()); + assert!(storage.inner.state.read().imm_memtables.is_empty()); + storage.dump_structure(); + drop(storage); + dump_files_in_dir(&dir); + + let storage = MiniLsm::open( + &dir, + LsmStorageOptions::default_for_week2_test(compaction_options.clone()), + ) + .unwrap(); + assert_eq!(&storage.get(b"0").unwrap().unwrap()[..], b"v20".as_slice()); + assert_eq!(&storage.get(b"1").unwrap().unwrap()[..], b"v20".as_slice()); + assert_eq!(storage.get(b"2").unwrap(), None); +} diff --git a/mini-lsm-starter/src/tests/week2_day6.rs b/mini-lsm-starter/src/tests/week2_day6.rs new file mode 100644 index 000000000..befd10059 --- /dev/null +++ b/mini-lsm-starter/src/tests/week2_day6.rs @@ -0,0 +1,77 @@ +use tempfile::tempdir; + +use crate::{ + compact::{ + CompactionOptions, LeveledCompactionOptions, SimpleLeveledCompactionOptions, + TieredCompactionOptions, + }, + lsm_storage::{LsmStorageOptions, MiniLsm}, + tests::harness::dump_files_in_dir, +}; + +#[test] +fn test_integration_leveled() { + test_integration(CompactionOptions::Leveled(LeveledCompactionOptions { + level_size_multiplier: 2, + level0_file_num_compaction_trigger: 2, + max_levels: 3, + base_level_size_mb: 1, + })) +} + +#[test] +fn test_integration_tiered() { + test_integration(CompactionOptions::Tiered(TieredCompactionOptions { + num_tiers: 3, + max_size_amplification_percent: 200, + size_ratio: 1, + min_merge_width: 3, + })) +} + +#[test] +fn test_integration_simple() { + test_integration(CompactionOptions::Simple(SimpleLeveledCompactionOptions { + size_ratio_percent: 200, + level0_file_num_compaction_trigger: 2, + max_levels: 3, + })); +} + +fn test_integration(compaction_options: 
CompactionOptions) { + let dir = tempdir().unwrap(); + let mut options = LsmStorageOptions::default_for_week2_test(compaction_options); + options.enable_wal = true; + let storage = MiniLsm::open(&dir, options.clone()).unwrap(); + for i in 0..=20 { + storage.put(b"0", format!("v{}", i).as_bytes()).unwrap(); + if i % 2 == 0 { + storage.put(b"1", format!("v{}", i).as_bytes()).unwrap(); + } else { + storage.delete(b"1").unwrap(); + } + if i % 2 == 1 { + storage.put(b"2", format!("v{}", i).as_bytes()).unwrap(); + } else { + storage.delete(b"2").unwrap(); + } + storage + .inner + .force_freeze_memtable(&storage.inner.state_lock.lock()) + .unwrap(); + } + storage.close().unwrap(); + // ensure some SSTs are not flushed + assert!( + !storage.inner.state.read().memtable.is_empty() + || !storage.inner.state.read().imm_memtables.is_empty() + ); + storage.dump_structure(); + drop(storage); + dump_files_in_dir(&dir); + + let storage = MiniLsm::open(&dir, options).unwrap(); + assert_eq!(&storage.get(b"0").unwrap().unwrap()[..], b"v20".as_slice()); + assert_eq!(&storage.get(b"1").unwrap().unwrap()[..], b"v20".as_slice()); + assert_eq!(storage.get(b"2").unwrap(), None); +} diff --git a/mini-lsm-starter/src/tests/week3_day1.rs b/mini-lsm-starter/src/tests/week3_day1.rs new file mode 100644 index 000000000..df5597968 --- /dev/null +++ b/mini-lsm-starter/src/tests/week3_day1.rs @@ -0,0 +1,54 @@ +use std::sync::Arc; + +use bytes::Bytes; +use tempfile::tempdir; + +use crate::key::KeySlice; +use crate::table::{FileObject, SsTable, SsTableBuilder, SsTableIterator}; + +use super::harness::{check_iter_result_by_key_and_ts, generate_sst_with_ts}; + +#[test] +fn test_sst_build_multi_version_simple() { + let mut builder = SsTableBuilder::new(16); + builder.add( + KeySlice::for_testing_from_slice_with_ts(b"233", 233), + b"233333", + ); + builder.add( + KeySlice::for_testing_from_slice_with_ts(b"233", 0), + b"2333333", + ); + let dir = tempdir().unwrap(); + 
builder.build_for_test(dir.path().join("1.sst")).unwrap(); +} + +fn generate_test_data() -> Vec<((Bytes, u64), Bytes)> { + (0..100) + .map(|id| { + ( + (Bytes::from(format!("key{:05}", id / 5)), 5 - (id % 5)), + Bytes::from(format!("value{:05}", id)), + ) + }) + .collect() +} + +#[test] +fn test_sst_build_multi_version_hard() { + let dir = tempdir().unwrap(); + let data = generate_test_data(); + generate_sst_with_ts(1, dir.path().join("1.sst"), data.clone(), None); + let sst = Arc::new( + SsTable::open( + 1, + None, + FileObject::open(&dir.path().join("1.sst")).unwrap(), + ) + .unwrap(), + ); + check_iter_result_by_key_and_ts( + &mut SsTableIterator::create_and_seek_to_first(sst).unwrap(), + data, + ); +} diff --git a/mini-lsm-starter/src/tests/week3_day2.rs b/mini-lsm-starter/src/tests/week3_day2.rs new file mode 100644 index 000000000..df1f3ce89 --- /dev/null +++ b/mini-lsm-starter/src/tests/week3_day2.rs @@ -0,0 +1,61 @@ +use std::time::Duration; + +use tempfile::tempdir; + +use crate::{ + compact::CompactionOptions, + lsm_storage::{LsmStorageOptions, MiniLsm}, + tests::harness::dump_files_in_dir, +}; + +#[test] +fn test_task3_compaction_integration() { + let dir = tempdir().unwrap(); + let mut options = LsmStorageOptions::default_for_week2_test(CompactionOptions::NoCompaction); + options.enable_wal = true; + let storage = MiniLsm::open(&dir, options.clone()).unwrap(); + let _txn = storage.new_txn().unwrap(); + for i in 0..=20000 { + storage + .put(b"0", format!("{:02000}", i).as_bytes()) + .unwrap(); + } + std::thread::sleep(Duration::from_secs(1)); // wait until all memtables flush + while { + let snapshot = storage.inner.state.read(); + !snapshot.imm_memtables.is_empty() + } { + storage.inner.force_flush_next_imm_memtable().unwrap(); + } + assert!(storage.inner.state.read().l0_sstables.len() > 1); + storage.force_full_compaction().unwrap(); + storage.dump_structure(); + dump_files_in_dir(&dir); + assert!(storage.inner.state.read().l0_sstables.is_empty()); + 
assert_eq!(storage.inner.state.read().levels.len(), 1); + // same key in the same SST + assert_eq!(storage.inner.state.read().levels[0].1.len(), 1); + for i in 0..=100 { + storage + .put(b"1", format!("{:02000}", i).as_bytes()) + .unwrap(); + } + storage + .inner + .force_freeze_memtable(&storage.inner.state_lock.lock()) + .unwrap(); + std::thread::sleep(Duration::from_secs(1)); // wait until all memtables flush + while { + let snapshot = storage.inner.state.read(); + !snapshot.imm_memtables.is_empty() + } { + storage.inner.force_flush_next_imm_memtable().unwrap(); + } + storage.force_full_compaction().unwrap(); + storage.dump_structure(); + dump_files_in_dir(&dir); + assert!(storage.inner.state.read().l0_sstables.is_empty()); + assert_eq!(storage.inner.state.read().levels.len(), 1); + // same key in the same SST, now we should split two + assert_eq!(storage.inner.state.read().levels[0].1.len(), 2); +} diff --git a/mini-lsm-starter/src/tests/week3_day3.rs b/mini-lsm-starter/src/tests/week3_day3.rs new file mode 100644 index 000000000..a0b2532f0 --- /dev/null +++ b/mini-lsm-starter/src/tests/week3_day3.rs @@ -0,0 +1,276 @@ +use std::ops::Bound; + +use bytes::Bytes; +use tempfile::tempdir; + +use crate::{ + compact::CompactionOptions, + key::KeySlice, + lsm_storage::{LsmStorageOptions, MiniLsm}, + table::SsTableBuilder, + tests::harness::check_lsm_iter_result_by_key, +}; + +#[test] +fn test_task2_memtable_mvcc() { + let dir = tempdir().unwrap(); + let mut options = LsmStorageOptions::default_for_week2_test(CompactionOptions::NoCompaction); + options.enable_wal = true; + let storage = MiniLsm::open(&dir, options.clone()).unwrap(); + storage.put(b"a", b"1").unwrap(); + storage.put(b"b", b"1").unwrap(); + let snapshot1 = storage.new_txn().unwrap(); + storage.put(b"a", b"2").unwrap(); + let snapshot2 = storage.new_txn().unwrap(); + storage.delete(b"b").unwrap(); + storage.put(b"c", b"1").unwrap(); + let snapshot3 = storage.new_txn().unwrap(); + 
assert_eq!(snapshot1.get(b"a").unwrap(), Some(Bytes::from_static(b"1"))); + assert_eq!(snapshot1.get(b"b").unwrap(), Some(Bytes::from_static(b"1"))); + assert_eq!(snapshot1.get(b"c").unwrap(), None); + check_lsm_iter_result_by_key( + &mut snapshot1.scan(Bound::Unbounded, Bound::Unbounded).unwrap(), + vec![ + (Bytes::from("a"), Bytes::from("1")), + (Bytes::from("b"), Bytes::from("1")), + ], + ); + assert_eq!(snapshot2.get(b"a").unwrap(), Some(Bytes::from_static(b"2"))); + assert_eq!(snapshot2.get(b"b").unwrap(), Some(Bytes::from_static(b"1"))); + assert_eq!(snapshot2.get(b"c").unwrap(), None); + check_lsm_iter_result_by_key( + &mut snapshot2.scan(Bound::Unbounded, Bound::Unbounded).unwrap(), + vec![ + (Bytes::from("a"), Bytes::from("2")), + (Bytes::from("b"), Bytes::from("1")), + ], + ); + assert_eq!(snapshot3.get(b"a").unwrap(), Some(Bytes::from_static(b"2"))); + assert_eq!(snapshot3.get(b"b").unwrap(), None); + assert_eq!(snapshot3.get(b"c").unwrap(), Some(Bytes::from_static(b"1"))); + check_lsm_iter_result_by_key( + &mut snapshot3.scan(Bound::Unbounded, Bound::Unbounded).unwrap(), + vec![ + (Bytes::from("a"), Bytes::from("2")), + (Bytes::from("c"), Bytes::from("1")), + ], + ); + storage + .inner + .force_freeze_memtable(&storage.inner.state_lock.lock()) + .unwrap(); + storage.put(b"a", b"3").unwrap(); + storage.put(b"b", b"3").unwrap(); + let snapshot4 = storage.new_txn().unwrap(); + storage.put(b"a", b"4").unwrap(); + let snapshot5 = storage.new_txn().unwrap(); + storage.delete(b"b").unwrap(); + storage.put(b"c", b"5").unwrap(); + let snapshot6 = storage.new_txn().unwrap(); + assert_eq!(snapshot1.get(b"a").unwrap(), Some(Bytes::from_static(b"1"))); + assert_eq!(snapshot1.get(b"b").unwrap(), Some(Bytes::from_static(b"1"))); + assert_eq!(snapshot1.get(b"c").unwrap(), None); + check_lsm_iter_result_by_key( + &mut snapshot1.scan(Bound::Unbounded, Bound::Unbounded).unwrap(), + vec![ + (Bytes::from("a"), Bytes::from("1")), + (Bytes::from("b"), Bytes::from("1")), + ], 
+ ); + assert_eq!(snapshot2.get(b"a").unwrap(), Some(Bytes::from_static(b"2"))); + assert_eq!(snapshot2.get(b"b").unwrap(), Some(Bytes::from_static(b"1"))); + assert_eq!(snapshot2.get(b"c").unwrap(), None); + check_lsm_iter_result_by_key( + &mut snapshot2.scan(Bound::Unbounded, Bound::Unbounded).unwrap(), + vec![ + (Bytes::from("a"), Bytes::from("2")), + (Bytes::from("b"), Bytes::from("1")), + ], + ); + assert_eq!(snapshot3.get(b"a").unwrap(), Some(Bytes::from_static(b"2"))); + assert_eq!(snapshot3.get(b"b").unwrap(), None); + assert_eq!(snapshot3.get(b"c").unwrap(), Some(Bytes::from_static(b"1"))); + check_lsm_iter_result_by_key( + &mut snapshot3.scan(Bound::Unbounded, Bound::Unbounded).unwrap(), + vec![ + (Bytes::from("a"), Bytes::from("2")), + (Bytes::from("c"), Bytes::from("1")), + ], + ); + assert_eq!(snapshot4.get(b"a").unwrap(), Some(Bytes::from_static(b"3"))); + assert_eq!(snapshot4.get(b"b").unwrap(), Some(Bytes::from_static(b"3"))); + assert_eq!(snapshot4.get(b"c").unwrap(), Some(Bytes::from_static(b"1"))); + check_lsm_iter_result_by_key( + &mut snapshot4.scan(Bound::Unbounded, Bound::Unbounded).unwrap(), + vec![ + (Bytes::from("a"), Bytes::from("3")), + (Bytes::from("b"), Bytes::from("3")), + (Bytes::from("c"), Bytes::from("1")), + ], + ); + assert_eq!(snapshot5.get(b"a").unwrap(), Some(Bytes::from_static(b"4"))); + assert_eq!(snapshot5.get(b"b").unwrap(), Some(Bytes::from_static(b"3"))); + assert_eq!(snapshot5.get(b"c").unwrap(), Some(Bytes::from_static(b"1"))); + check_lsm_iter_result_by_key( + &mut snapshot5.scan(Bound::Unbounded, Bound::Unbounded).unwrap(), + vec![ + (Bytes::from("a"), Bytes::from("4")), + (Bytes::from("b"), Bytes::from("3")), + (Bytes::from("c"), Bytes::from("1")), + ], + ); + assert_eq!(snapshot6.get(b"a").unwrap(), Some(Bytes::from_static(b"4"))); + assert_eq!(snapshot6.get(b"b").unwrap(), None); + assert_eq!(snapshot6.get(b"c").unwrap(), Some(Bytes::from_static(b"5"))); + check_lsm_iter_result_by_key( + &mut 
snapshot6.scan(Bound::Unbounded, Bound::Unbounded).unwrap(), + vec![ + (Bytes::from("a"), Bytes::from("4")), + (Bytes::from("c"), Bytes::from("5")), + ], + ); +} + +#[test] +fn test_task2_lsm_iterator_mvcc() { + let dir = tempdir().unwrap(); + let mut options = LsmStorageOptions::default_for_week2_test(CompactionOptions::NoCompaction); + options.enable_wal = true; + let storage = MiniLsm::open(&dir, options.clone()).unwrap(); + storage.put(b"a", b"1").unwrap(); + storage.put(b"b", b"1").unwrap(); + let snapshot1 = storage.new_txn().unwrap(); + storage.put(b"a", b"2").unwrap(); + let snapshot2 = storage.new_txn().unwrap(); + storage.delete(b"b").unwrap(); + storage.put(b"c", b"1").unwrap(); + let snapshot3 = storage.new_txn().unwrap(); + storage.force_flush().unwrap(); + assert_eq!(snapshot1.get(b"a").unwrap(), Some(Bytes::from_static(b"1"))); + assert_eq!(snapshot1.get(b"b").unwrap(), Some(Bytes::from_static(b"1"))); + assert_eq!(snapshot1.get(b"c").unwrap(), None); + check_lsm_iter_result_by_key( + &mut snapshot1.scan(Bound::Unbounded, Bound::Unbounded).unwrap(), + vec![ + (Bytes::from("a"), Bytes::from("1")), + (Bytes::from("b"), Bytes::from("1")), + ], + ); + assert_eq!(snapshot2.get(b"a").unwrap(), Some(Bytes::from_static(b"2"))); + assert_eq!(snapshot2.get(b"b").unwrap(), Some(Bytes::from_static(b"1"))); + assert_eq!(snapshot2.get(b"c").unwrap(), None); + check_lsm_iter_result_by_key( + &mut snapshot2.scan(Bound::Unbounded, Bound::Unbounded).unwrap(), + vec![ + (Bytes::from("a"), Bytes::from("2")), + (Bytes::from("b"), Bytes::from("1")), + ], + ); + assert_eq!(snapshot3.get(b"a").unwrap(), Some(Bytes::from_static(b"2"))); + assert_eq!(snapshot3.get(b"b").unwrap(), None); + assert_eq!(snapshot3.get(b"c").unwrap(), Some(Bytes::from_static(b"1"))); + check_lsm_iter_result_by_key( + &mut snapshot3.scan(Bound::Unbounded, Bound::Unbounded).unwrap(), + vec![ + (Bytes::from("a"), Bytes::from("2")), + (Bytes::from("c"), Bytes::from("1")), + ], + ); + storage.put(b"a", 
b"3").unwrap(); + storage.put(b"b", b"3").unwrap(); + let snapshot4 = storage.new_txn().unwrap(); + storage.put(b"a", b"4").unwrap(); + let snapshot5 = storage.new_txn().unwrap(); + storage.delete(b"b").unwrap(); + storage.put(b"c", b"5").unwrap(); + let snapshot6 = storage.new_txn().unwrap(); + storage.force_flush().unwrap(); + assert_eq!(snapshot1.get(b"a").unwrap(), Some(Bytes::from_static(b"1"))); + assert_eq!(snapshot1.get(b"b").unwrap(), Some(Bytes::from_static(b"1"))); + assert_eq!(snapshot1.get(b"c").unwrap(), None); + check_lsm_iter_result_by_key( + &mut snapshot1.scan(Bound::Unbounded, Bound::Unbounded).unwrap(), + vec![ + (Bytes::from("a"), Bytes::from("1")), + (Bytes::from("b"), Bytes::from("1")), + ], + ); + assert_eq!(snapshot2.get(b"a").unwrap(), Some(Bytes::from_static(b"2"))); + assert_eq!(snapshot2.get(b"b").unwrap(), Some(Bytes::from_static(b"1"))); + assert_eq!(snapshot2.get(b"c").unwrap(), None); + check_lsm_iter_result_by_key( + &mut snapshot2.scan(Bound::Unbounded, Bound::Unbounded).unwrap(), + vec![ + (Bytes::from("a"), Bytes::from("2")), + (Bytes::from("b"), Bytes::from("1")), + ], + ); + assert_eq!(snapshot3.get(b"a").unwrap(), Some(Bytes::from_static(b"2"))); + assert_eq!(snapshot3.get(b"b").unwrap(), None); + assert_eq!(snapshot3.get(b"c").unwrap(), Some(Bytes::from_static(b"1"))); + check_lsm_iter_result_by_key( + &mut snapshot3.scan(Bound::Unbounded, Bound::Unbounded).unwrap(), + vec![ + (Bytes::from("a"), Bytes::from("2")), + (Bytes::from("c"), Bytes::from("1")), + ], + ); + assert_eq!(snapshot4.get(b"a").unwrap(), Some(Bytes::from_static(b"3"))); + assert_eq!(snapshot4.get(b"b").unwrap(), Some(Bytes::from_static(b"3"))); + assert_eq!(snapshot4.get(b"c").unwrap(), Some(Bytes::from_static(b"1"))); + check_lsm_iter_result_by_key( + &mut snapshot4.scan(Bound::Unbounded, Bound::Unbounded).unwrap(), + vec![ + (Bytes::from("a"), Bytes::from("3")), + (Bytes::from("b"), Bytes::from("3")), + (Bytes::from("c"), Bytes::from("1")), + ], + ); + 
assert_eq!(snapshot5.get(b"a").unwrap(), Some(Bytes::from_static(b"4"))); + assert_eq!(snapshot5.get(b"b").unwrap(), Some(Bytes::from_static(b"3"))); + assert_eq!(snapshot5.get(b"c").unwrap(), Some(Bytes::from_static(b"1"))); + check_lsm_iter_result_by_key( + &mut snapshot5.scan(Bound::Unbounded, Bound::Unbounded).unwrap(), + vec![ + (Bytes::from("a"), Bytes::from("4")), + (Bytes::from("b"), Bytes::from("3")), + (Bytes::from("c"), Bytes::from("1")), + ], + ); + assert_eq!(snapshot6.get(b"a").unwrap(), Some(Bytes::from_static(b"4"))); + assert_eq!(snapshot6.get(b"b").unwrap(), None); + assert_eq!(snapshot6.get(b"c").unwrap(), Some(Bytes::from_static(b"5"))); + check_lsm_iter_result_by_key( + &mut snapshot6.scan(Bound::Unbounded, Bound::Unbounded).unwrap(), + vec![ + (Bytes::from("a"), Bytes::from("4")), + (Bytes::from("c"), Bytes::from("5")), + ], + ); + check_lsm_iter_result_by_key( + &mut snapshot6 + .scan(Bound::Included(b"a"), Bound::Included(b"a")) + .unwrap(), + vec![(Bytes::from("a"), Bytes::from("4"))], + ); + check_lsm_iter_result_by_key( + &mut snapshot6 + .scan(Bound::Excluded(b"a"), Bound::Excluded(b"c")) + .unwrap(), + vec![], + ); +} + +#[test] +fn test_task3_sst_ts() { + let mut builder = SsTableBuilder::new(16); + builder.add(KeySlice::for_testing_from_slice_with_ts(b"11", 1), b"11"); + builder.add(KeySlice::for_testing_from_slice_with_ts(b"22", 2), b"22"); + builder.add(KeySlice::for_testing_from_slice_with_ts(b"33", 3), b"11"); + builder.add(KeySlice::for_testing_from_slice_with_ts(b"44", 4), b"22"); + builder.add(KeySlice::for_testing_from_slice_with_ts(b"55", 5), b"11"); + builder.add(KeySlice::for_testing_from_slice_with_ts(b"66", 6), b"22"); + let dir = tempdir().unwrap(); + let sst = builder.build_for_test(dir.path().join("1.sst")).unwrap(); + assert_eq!(sst.max_ts(), 6); +} diff --git a/mini-lsm-starter/src/tests/week3_day4.rs b/mini-lsm-starter/src/tests/week3_day4.rs new file mode 100644 index 000000000..428a2e605 --- /dev/null +++ 
b/mini-lsm-starter/src/tests/week3_day4.rs @@ -0,0 +1,189 @@ +use bytes::Bytes; +use tempfile::tempdir; + +use crate::{ + compact::CompactionOptions, + lsm_storage::{LsmStorageOptions, MiniLsm, WriteBatchRecord}, + mvcc::watermark::Watermark, +}; + +use super::harness::{check_iter_result_by_key, construct_merge_iterator_over_storage}; + +#[test] +fn test_task1_watermark() { + let mut watermark = Watermark::new(); + watermark.add_reader(0); + for i in 1..=1000 { + watermark.add_reader(i); + assert_eq!(watermark.watermark(), Some(0)); + assert_eq!(watermark.num_retained_snapshots(), i as usize + 1); + } + let mut cnt = 1001; + for i in 0..500 { + watermark.remove_reader(i); + assert_eq!(watermark.watermark(), Some(i + 1)); + cnt -= 1; + assert_eq!(watermark.num_retained_snapshots(), cnt); + } + for i in (501..=1000).rev() { + watermark.remove_reader(i); + assert_eq!(watermark.watermark(), Some(500)); + cnt -= 1; + assert_eq!(watermark.num_retained_snapshots(), cnt); + } + watermark.remove_reader(500); + assert_eq!(watermark.watermark(), None); + assert_eq!(watermark.num_retained_snapshots(), 0); + watermark.add_reader(2000); + watermark.add_reader(2000); + watermark.add_reader(2001); + assert_eq!(watermark.num_retained_snapshots(), 2); + assert_eq!(watermark.watermark(), Some(2000)); + watermark.remove_reader(2000); + assert_eq!(watermark.num_retained_snapshots(), 2); + assert_eq!(watermark.watermark(), Some(2000)); + watermark.remove_reader(2000); + assert_eq!(watermark.num_retained_snapshots(), 1); + assert_eq!(watermark.watermark(), Some(2001)); +} + +#[test] +fn test_task2_snapshot_watermark() { + let dir = tempdir().unwrap(); + let options = LsmStorageOptions::default_for_week2_test(CompactionOptions::NoCompaction); + let storage = MiniLsm::open(&dir, options.clone()).unwrap(); + let txn1 = storage.new_txn().unwrap(); + let txn2 = storage.new_txn().unwrap(); + storage.put(b"233", b"23333").unwrap(); + let txn3 = storage.new_txn().unwrap(); + 
assert_eq!(storage.inner.mvcc().watermark(), txn1.read_ts); + drop(txn1); + assert_eq!(storage.inner.mvcc().watermark(), txn2.read_ts); + drop(txn2); + assert_eq!(storage.inner.mvcc().watermark(), txn3.read_ts); + drop(txn3); + assert_eq!( + storage.inner.mvcc().watermark(), + storage.inner.mvcc().latest_commit_ts() + ); +} + +#[test] +fn test_task3_mvcc_compaction() { + let dir = tempdir().unwrap(); + let options = LsmStorageOptions::default_for_week2_test(CompactionOptions::NoCompaction); + let storage = MiniLsm::open(&dir, options.clone()).unwrap(); + let snapshot0 = storage.new_txn().unwrap(); + storage + .write_batch(&[ + WriteBatchRecord::Put(b"a", b"1"), + WriteBatchRecord::Put(b"b", b"1"), + ]) + .unwrap(); + let snapshot1 = storage.new_txn().unwrap(); + storage + .write_batch(&[ + WriteBatchRecord::Put(b"a", b"2"), + WriteBatchRecord::Put(b"d", b"2"), + ]) + .unwrap(); + let snapshot2 = storage.new_txn().unwrap(); + storage + .write_batch(&[ + WriteBatchRecord::Put(b"a", b"3"), + WriteBatchRecord::Del(b"d"), + ]) + .unwrap(); + let snapshot3 = storage.new_txn().unwrap(); + storage + .write_batch(&[ + WriteBatchRecord::Put(b"c", b"4"), + WriteBatchRecord::Del(b"a"), + ]) + .unwrap(); + + storage.force_flush().unwrap(); + storage.force_full_compaction().unwrap(); + storage.dump_structure(); + + let mut iter = construct_merge_iterator_over_storage(&storage.inner.state.read()); + check_iter_result_by_key( + &mut iter, + vec![ + (Bytes::from("a"), Bytes::new()), + (Bytes::from("a"), Bytes::from("3")), + (Bytes::from("a"), Bytes::from("2")), + (Bytes::from("a"), Bytes::from("1")), + (Bytes::from("b"), Bytes::from("1")), + (Bytes::from("c"), Bytes::from("4")), + (Bytes::from("d"), Bytes::new()), + (Bytes::from("d"), Bytes::from("2")), + ], + ); + + drop(snapshot0); + storage.force_full_compaction().unwrap(); + storage.dump_structure(); + + let mut iter = construct_merge_iterator_over_storage(&storage.inner.state.read()); + check_iter_result_by_key( + &mut iter, + 
vec![ + (Bytes::from("a"), Bytes::new()), + (Bytes::from("a"), Bytes::from("3")), + (Bytes::from("a"), Bytes::from("2")), + (Bytes::from("a"), Bytes::from("1")), + (Bytes::from("b"), Bytes::from("1")), + (Bytes::from("c"), Bytes::from("4")), + (Bytes::from("d"), Bytes::new()), + (Bytes::from("d"), Bytes::from("2")), + ], + ); + + drop(snapshot1); + storage.force_full_compaction().unwrap(); + storage.dump_structure(); + + println!("snapshot2: {:?}", snapshot2.read_ts); + let mut iter = construct_merge_iterator_over_storage(&storage.inner.state.read()); + check_iter_result_by_key( + &mut iter, + vec![ + (Bytes::from("a"), Bytes::new()), + (Bytes::from("a"), Bytes::from("3")), + (Bytes::from("a"), Bytes::from("2")), + (Bytes::from("b"), Bytes::from("1")), + (Bytes::from("c"), Bytes::from("4")), + (Bytes::from("d"), Bytes::new()), + (Bytes::from("d"), Bytes::from("2")), + ], + ); + + drop(snapshot2); + storage.force_full_compaction().unwrap(); + storage.dump_structure(); + + let mut iter = construct_merge_iterator_over_storage(&storage.inner.state.read()); + check_iter_result_by_key( + &mut iter, + vec![ + (Bytes::from("a"), Bytes::new()), + (Bytes::from("a"), Bytes::from("3")), + (Bytes::from("b"), Bytes::from("1")), + (Bytes::from("c"), Bytes::from("4")), + ], + ); + + drop(snapshot3); + storage.force_full_compaction().unwrap(); + storage.dump_structure(); + + let mut iter = construct_merge_iterator_over_storage(&storage.inner.state.read()); + check_iter_result_by_key( + &mut iter, + vec![ + (Bytes::from("b"), Bytes::from("1")), + (Bytes::from("c"), Bytes::from("4")), + ], + ); +} diff --git a/mini-lsm-starter/src/tests/week3_day5.rs b/mini-lsm-starter/src/tests/week3_day5.rs new file mode 100644 index 000000000..6a6b3bdbf --- /dev/null +++ b/mini-lsm-starter/src/tests/week3_day5.rs @@ -0,0 +1,75 @@ +use std::ops::Bound; + +use bytes::Bytes; +use tempfile::tempdir; + +use crate::{ + compact::CompactionOptions, + lsm_storage::{LsmStorageOptions, MiniLsm}, + 
tests::harness::check_lsm_iter_result_by_key, +}; + +#[test] +fn test_txn_integration() { + let dir = tempdir().unwrap(); + let options = LsmStorageOptions::default_for_week2_test(CompactionOptions::NoCompaction); + let storage = MiniLsm::open(&dir, options.clone()).unwrap(); + let txn1 = storage.new_txn().unwrap(); + let txn2 = storage.new_txn().unwrap(); + txn1.put(b"test1", b"233"); + txn2.put(b"test2", b"233"); + check_lsm_iter_result_by_key( + &mut txn1.scan(Bound::Unbounded, Bound::Unbounded).unwrap(), + vec![(Bytes::from("test1"), Bytes::from("233"))], + ); + check_lsm_iter_result_by_key( + &mut txn2.scan(Bound::Unbounded, Bound::Unbounded).unwrap(), + vec![(Bytes::from("test2"), Bytes::from("233"))], + ); + let txn3 = storage.new_txn().unwrap(); + check_lsm_iter_result_by_key( + &mut txn3.scan(Bound::Unbounded, Bound::Unbounded).unwrap(), + vec![], + ); + txn1.commit().unwrap(); + txn2.commit().unwrap(); + check_lsm_iter_result_by_key( + &mut txn3.scan(Bound::Unbounded, Bound::Unbounded).unwrap(), + vec![], + ); + drop(txn3); + check_lsm_iter_result_by_key( + &mut storage.scan(Bound::Unbounded, Bound::Unbounded).unwrap(), + vec![ + (Bytes::from("test1"), Bytes::from("233")), + (Bytes::from("test2"), Bytes::from("233")), + ], + ); + let txn4 = storage.new_txn().unwrap(); + assert_eq!(txn4.get(b"test1").unwrap(), Some(Bytes::from("233"))); + assert_eq!(txn4.get(b"test2").unwrap(), Some(Bytes::from("233"))); + check_lsm_iter_result_by_key( + &mut txn4.scan(Bound::Unbounded, Bound::Unbounded).unwrap(), + vec![ + (Bytes::from("test1"), Bytes::from("233")), + (Bytes::from("test2"), Bytes::from("233")), + ], + ); + txn4.put(b"test2", b"2333"); + assert_eq!(txn4.get(b"test1").unwrap(), Some(Bytes::from("233"))); + assert_eq!(txn4.get(b"test2").unwrap(), Some(Bytes::from("2333"))); + check_lsm_iter_result_by_key( + &mut txn4.scan(Bound::Unbounded, Bound::Unbounded).unwrap(), + vec![ + (Bytes::from("test1"), Bytes::from("233")), + (Bytes::from("test2"), 
Bytes::from("2333")), + ], + ); + txn4.delete(b"test2"); + assert_eq!(txn4.get(b"test1").unwrap(), Some(Bytes::from("233"))); + assert_eq!(txn4.get(b"test2").unwrap(), None); + check_lsm_iter_result_by_key( + &mut txn4.scan(Bound::Unbounded, Bound::Unbounded).unwrap(), + vec![(Bytes::from("test1"), Bytes::from("233"))], + ); +} diff --git a/mini-lsm-starter/src/tests/week3_day6.rs b/mini-lsm-starter/src/tests/week3_day6.rs new file mode 100644 index 000000000..aa194a300 --- /dev/null +++ b/mini-lsm-starter/src/tests/week3_day6.rs @@ -0,0 +1,108 @@ +use std::ops::Bound; + +use bytes::Bytes; +use tempfile::tempdir; + +use crate::{ + compact::CompactionOptions, + iterators::StorageIterator, + lsm_storage::{LsmStorageOptions, MiniLsm}, +}; + +#[test] +fn test_serializable_1() { + let dir = tempdir().unwrap(); + let mut options = LsmStorageOptions::default_for_week2_test(CompactionOptions::NoCompaction); + options.serializable = true; + let storage = MiniLsm::open(&dir, options.clone()).unwrap(); + storage.put(b"key1", b"1").unwrap(); + storage.put(b"key2", b"2").unwrap(); + let txn1 = storage.new_txn().unwrap(); + let txn2 = storage.new_txn().unwrap(); + txn1.put(b"key1", &txn1.get(b"key2").unwrap().unwrap()); + txn2.put(b"key2", &txn2.get(b"key1").unwrap().unwrap()); + txn1.commit().unwrap(); + assert!(txn2.commit().is_err()); + drop(txn2); + assert_eq!(storage.get(b"key1").unwrap(), Some(Bytes::from("2"))); + assert_eq!(storage.get(b"key2").unwrap(), Some(Bytes::from("2"))); +} + +#[test] +fn test_serializable_2() { + let dir = tempdir().unwrap(); + let mut options = LsmStorageOptions::default_for_week2_test(CompactionOptions::NoCompaction); + options.serializable = true; + let storage = MiniLsm::open(&dir, options.clone()).unwrap(); + let txn1 = storage.new_txn().unwrap(); + let txn2 = storage.new_txn().unwrap(); + txn1.put(b"key1", b"1"); + txn2.put(b"key1", b"2"); + txn1.commit().unwrap(); + txn2.commit().unwrap(); + assert_eq!(storage.get(b"key1").unwrap(), 
Some(Bytes::from("2"))); +} + +#[test] +fn test_serializable_3_ts_range() { + let dir = tempdir().unwrap(); + let mut options = LsmStorageOptions::default_for_week2_test(CompactionOptions::NoCompaction); + options.serializable = true; + let storage = MiniLsm::open(&dir, options.clone()).unwrap(); + storage.put(b"key1", b"1").unwrap(); + storage.put(b"key2", b"2").unwrap(); + let txn1 = storage.new_txn().unwrap(); + txn1.put(b"key1", &txn1.get(b"key2").unwrap().unwrap()); + txn1.commit().unwrap(); + let txn2 = storage.new_txn().unwrap(); + txn2.put(b"key2", &txn2.get(b"key1").unwrap().unwrap()); + txn2.commit().unwrap(); + drop(txn2); + assert_eq!(storage.get(b"key1").unwrap(), Some(Bytes::from("2"))); + assert_eq!(storage.get(b"key2").unwrap(), Some(Bytes::from("2"))); +} + +#[test] +fn test_serializable_4_scan() { + let dir = tempdir().unwrap(); + let mut options = LsmStorageOptions::default_for_week2_test(CompactionOptions::NoCompaction); + options.serializable = true; + let storage = MiniLsm::open(&dir, options.clone()).unwrap(); + storage.put(b"key1", b"1").unwrap(); + storage.put(b"key2", b"2").unwrap(); + let txn1 = storage.new_txn().unwrap(); + let txn2 = storage.new_txn().unwrap(); + txn1.put(b"key1", &txn1.get(b"key2").unwrap().unwrap()); + txn1.commit().unwrap(); + let mut iter = txn2.scan(Bound::Unbounded, Bound::Unbounded).unwrap(); + while iter.is_valid() { + iter.next().unwrap(); + } + txn2.put(b"key2", b"1"); + assert!(txn2.commit().is_err()); + drop(txn2); + assert_eq!(storage.get(b"key1").unwrap(), Some(Bytes::from("2"))); + assert_eq!(storage.get(b"key2").unwrap(), Some(Bytes::from("2"))); +} + +#[test] +fn test_serializable_5_read_only() { + let dir = tempdir().unwrap(); + let mut options = LsmStorageOptions::default_for_week2_test(CompactionOptions::NoCompaction); + options.serializable = true; + let storage = MiniLsm::open(&dir, options.clone()).unwrap(); + storage.put(b"key1", b"1").unwrap(); + storage.put(b"key2", b"2").unwrap(); + let txn1 = 
storage.new_txn().unwrap(); + txn1.put(b"key1", &txn1.get(b"key2").unwrap().unwrap()); + txn1.commit().unwrap(); + let txn2 = storage.new_txn().unwrap(); + txn2.get(b"key1").unwrap().unwrap(); + let mut iter = txn2.scan(Bound::Unbounded, Bound::Unbounded).unwrap(); + while iter.is_valid() { + iter.next().unwrap(); + } + txn2.commit().unwrap(); + assert_eq!(storage.get(b"key1").unwrap(), Some(Bytes::from("2"))); + assert_eq!(storage.get(b"key2").unwrap(), Some(Bytes::from("2"))); +} diff --git a/mini-lsm-starter/src/tests/week3_day7.rs b/mini-lsm-starter/src/tests/week3_day7.rs new file mode 100644 index 000000000..bfbc05d7a --- /dev/null +++ b/mini-lsm-starter/src/tests/week3_day7.rs @@ -0,0 +1,70 @@ +use bytes::Bytes; +use tempfile::tempdir; + +use crate::{ + compact::CompactionOptions, + lsm_storage::{CompactionFilter, LsmStorageOptions, MiniLsm, WriteBatchRecord}, +}; + +use super::harness::{check_iter_result_by_key, construct_merge_iterator_over_storage}; + +#[test] +fn test_task3_mvcc_compaction() { + let dir = tempdir().unwrap(); + let options = LsmStorageOptions::default_for_week2_test(CompactionOptions::NoCompaction); + let storage = MiniLsm::open(&dir, options.clone()).unwrap(); + storage + .write_batch(&[ + WriteBatchRecord::Put("table1_a", "1"), + WriteBatchRecord::Put("table1_b", "1"), + WriteBatchRecord::Put("table1_c", "1"), + WriteBatchRecord::Put("table2_a", "1"), + WriteBatchRecord::Put("table2_b", "1"), + WriteBatchRecord::Put("table2_c", "1"), + ]) + .unwrap(); + storage.force_flush().unwrap(); + let snapshot0 = storage.new_txn().unwrap(); + storage + .write_batch(&[ + WriteBatchRecord::Put("table1_a", "2"), + WriteBatchRecord::Del("table1_b"), + WriteBatchRecord::Put("table1_c", "2"), + WriteBatchRecord::Put("table2_a", "2"), + WriteBatchRecord::Del("table2_b"), + WriteBatchRecord::Put("table2_c", "2"), + ]) + .unwrap(); + storage.force_flush().unwrap(); + storage.add_compaction_filter(CompactionFilter::Prefix(Bytes::from("table2_"))); + 
storage.force_full_compaction().unwrap(); + + let mut iter = construct_merge_iterator_over_storage(&storage.inner.state.read()); + check_iter_result_by_key( + &mut iter, + vec![ + (Bytes::from("table1_a"), Bytes::from("2")), + (Bytes::from("table1_a"), Bytes::from("1")), + (Bytes::from("table1_b"), Bytes::new()), + (Bytes::from("table1_b"), Bytes::from("1")), + (Bytes::from("table1_c"), Bytes::from("2")), + (Bytes::from("table1_c"), Bytes::from("1")), + (Bytes::from("table2_a"), Bytes::from("2")), + (Bytes::from("table2_b"), Bytes::new()), + (Bytes::from("table2_c"), Bytes::from("2")), + ], + ); + + drop(snapshot0); + + storage.force_full_compaction().unwrap(); + + let mut iter = construct_merge_iterator_over_storage(&storage.inner.state.read()); + check_iter_result_by_key( + &mut iter, + vec![ + (Bytes::from("table1_a"), Bytes::from("2")), + (Bytes::from("table1_c"), Bytes::from("2")), + ], + ); +} diff --git a/mini-lsm-starter/src/wal.rs b/mini-lsm-starter/src/wal.rs index 2b31d436a..1f1e93537 100644 --- a/mini-lsm-starter/src/wal.rs +++ b/mini-lsm-starter/src/wal.rs @@ -1,33 +1,102 @@ #![allow(dead_code)] // REMOVE THIS LINE after fully implementing this functionality -use std::fs::File; -use std::io::BufWriter; +use std::fs::{File, OpenOptions}; +use std::hash::Hasher; +use std::io::{BufWriter, Read, Write}; use std::path::Path; use std::sync::Arc; -use anyhow::Result; -use bytes::Bytes; +use anyhow::{Context, Result}; +use bytes::{Buf, BufMut, Bytes}; use crossbeam_skiplist::SkipMap; use parking_lot::Mutex; +use crate::key::{KeyBytes, KeySlice}; + +// | key_len | key | value_len | value | checksum | +#[derive(Debug)] pub struct Wal { file: Arc>>, } impl Wal { - pub fn create(_path: impl AsRef) -> Result { - unimplemented!() + pub fn create(path: impl AsRef) -> Result { + Ok(Self { + file: Arc::new(Mutex::new(BufWriter::new( + OpenOptions::new() + .read(true) + .create_new(true) + .write(true) + .open(path) + .context("[WAL.create] failed to create WAL")?, + 
))), + }) } - pub fn recover(_path: impl AsRef, _skiplist: &SkipMap) -> Result { - unimplemented!() + pub fn recover(path: impl AsRef, skiplist: &SkipMap) -> Result { + let path = path.as_ref(); + let mut file = OpenOptions::new() + .read(true) + .append(true) + .open(path) + .context("[WAL.recover] failed to recover from WAL")?; + let mut buf = Vec::new(); + file.read_to_end(&mut buf)?; + // read buf to insert into skiplist + let mut buf = buf.as_slice(); + while !buf.is_empty() { + let mut hasher = crc32fast::Hasher::new(); + let key_len = buf.get_u16() as usize; + hasher.write_u16(key_len as u16); + let key = Bytes::copy_from_slice(&buf[..key_len]); + hasher.write(&key); + buf.advance(key_len); + // read ts + let ts = buf.get_u64(); + hasher.write_u64(ts); + let value_len = buf.get_u16() as usize; + hasher.write_u16(value_len as u16); + let value = Bytes::copy_from_slice(&buf[..value_len]); + hasher.write(&value); + buf.advance(value_len); + let checksum = buf.get_u32(); + if checksum != hasher.finalize() { + return Err(anyhow::anyhow!("WAL checksum mismatch")); + } + skiplist.insert(KeyBytes::from_bytes_with_ts(key, ts), value); + } + Ok(Self { + file: Arc::new(Mutex::new(BufWriter::new(file))), + }) } - pub fn put(&self, _key: &[u8], _value: &[u8]) -> Result<()> { - unimplemented!() + pub fn put(&self, key: KeySlice, value: &[u8]) -> Result<()> { + let mut file = self.file.lock(); + let mut buf: Vec = Vec::with_capacity( + key.raw_len() + value.len() + std::mem::size_of::() + std::mem::size_of::(), + ); + // use a crc32fast::Hasher to compute the checksum incrementally on each field. 
+ let mut hasher = crc32fast::Hasher::new(); + buf.put_u16(key.key_len() as u16); + hasher.write_u16(key.key_len() as u16); + buf.put_slice(key.key_ref()); + hasher.write(key.key_ref()); + // write ts + buf.put_u64(key.ts()); + hasher.write_u64(key.ts()); + buf.put_u16(value.len() as u16); + hasher.write_u16(value.len() as u16); + buf.put_slice(value); + hasher.write(value); + buf.put_u32(hasher.finalize()); + file.write_all(&buf)?; + Ok(()) } pub fn sync(&self) -> Result<()> { - unimplemented!() + let mut file = self.file.lock(); + file.flush()?; + file.get_mut().sync_all()?; + Ok(()) } } diff --git a/mini-lsm/Cargo.toml b/mini-lsm/Cargo.toml index 093616082..000e05a46 100644 --- a/mini-lsm/Cargo.toml +++ b/mini-lsm/Cargo.toml @@ -25,6 +25,8 @@ serde_json = { version = "1.0" } serde = { version = "1.0", features = ["derive"] } farmhash = "1" crc32fast = "1.3.2" +nom = "7.1.3" +rustyline = "13.0.0" [dev-dependencies] tempfile = "3" diff --git a/mini-lsm/src/iterators/merge_iterator.rs b/mini-lsm/src/iterators/merge_iterator.rs index c4abc8d3e..b1f5bdf77 100644 --- a/mini-lsm/src/iterators/merge_iterator.rs +++ b/mini-lsm/src/iterators/merge_iterator.rs @@ -37,7 +37,7 @@ impl Ord for HeapWrapper { } /// Merge multiple iterators of the same type. If the same key occurs multiple times in some -/// iterators, perfer the one with smaller index. +/// iterators, prefer the one with smaller index. 
pub struct MergeIterator { iters: BinaryHeap>, current: Option>, diff --git a/mini-lsm/src/lsm_iterator.rs b/mini-lsm/src/lsm_iterator.rs index 044769cf6..448989d47 100644 --- a/mini-lsm/src/lsm_iterator.rs +++ b/mini-lsm/src/lsm_iterator.rs @@ -106,14 +106,14 @@ impl StorageIterator for FusedIterator { } fn key(&self) -> Self::KeyType<'_> { - if self.has_errored || !self.iter.is_valid() { + if !self.is_valid() { panic!("invalid access to the underlying iterator"); } self.iter.key() } fn value(&self) -> &[u8] { - if self.has_errored || !self.iter.is_valid() { + if !self.is_valid() { panic!("invalid access to the underlying iterator"); } self.iter.value() diff --git a/mini-lsm/src/lsm_storage.rs b/mini-lsm/src/lsm_storage.rs index 738fdbde5..0ee8497af 100644 --- a/mini-lsm/src/lsm_storage.rs +++ b/mini-lsm/src/lsm_storage.rs @@ -148,6 +148,11 @@ fn key_within(user_key: &[u8], table_begin: KeySlice, table_end: KeySlice) -> bo table_begin.raw_ref() <= user_key && user_key <= table_end.raw_ref() } +#[derive(Clone, Debug)] +pub enum CompactionFilter { + Prefix(Bytes), +} + /// The storage interface of the LSM tree. pub(crate) struct LsmStorageInner { pub(crate) state: Arc>>, @@ -160,6 +165,8 @@ pub(crate) struct LsmStorageInner { pub(crate) manifest: Option, #[allow(dead_code)] pub(crate) mvcc: Option, + #[allow(dead_code)] + pub(crate) compaction_filters: Arc>>, } /// A thin wrapper for `LsmStorageInner` and the user interface for MiniLSM. 
@@ -243,6 +250,10 @@ impl MiniLsm { })) } + pub fn add_compaction_filter(&self, compaction_filter: CompactionFilter) { + self.inner.add_compaction_filter(compaction_filter) + } + pub fn get(&self, key: &[u8]) -> Result> { self.inner.get(key) } @@ -418,6 +429,7 @@ impl LsmStorageInner { manifest: Some(manifest), options: options.into(), mvcc: None, + compaction_filters: Arc::new(Mutex::new(Vec::new())), }; storage.sync_dir()?; @@ -428,6 +440,11 @@ impl LsmStorageInner { self.state.read().memtable.sync_wal() } + pub fn add_compaction_filter(&self, compaction_filter: CompactionFilter) { + let mut compaction_filters = self.compaction_filters.lock(); + compaction_filters.push(compaction_filter); + } + /// Get a key from the storage. In day 7, this can be further optimized by using a bloom filter. pub fn get(&self, key: &[u8]) -> Result> { let snapshot = { diff --git a/mini-lsm/src/mem_table.rs b/mini-lsm/src/mem_table.rs index d5efc5152..89da268f5 100644 --- a/mini-lsm/src/mem_table.rs +++ b/mini-lsm/src/mem_table.rs @@ -119,8 +119,7 @@ impl MemTable { item: (Bytes::new(), Bytes::new()), } .build(); - let entry = iter.with_iter_mut(|iter| MemTableIterator::entry_to_item(iter.next())); - iter.with_mut(|x| *x.item = entry); + iter.next().unwrap(); iter } diff --git a/mini-lsm/src/tests/harness.rs b/mini-lsm/src/tests/harness.rs index 4b0d7e30d..b41745b2f 100644 --- a/mini-lsm/src/tests/harness.rs +++ b/mini-lsm/src/tests/harness.rs @@ -295,6 +295,11 @@ pub fn check_compaction_ratio(storage: Arc) { }; level_size.push(size); } + let extra_iterators = if TS_ENABLED { + 1 /* txn local iterator for OCC */ + } else { + 0 + }; let num_iters = storage .scan(Bound::Unbounded, Bound::Unbounded) .unwrap() @@ -326,8 +331,8 @@ pub fn check_compaction_ratio(storage: Arc) { ); } assert!( - num_iters <= l0_sst_num + num_memtables + max_levels, - "did you use concat iterators?" 
+ num_iters <= l0_sst_num + num_memtables + max_levels + extra_iterators, + "we found {num_iters} iterators in your implementation, (l0_sst_num={l0_sst_num}, num_memtables={num_memtables}, max_levels={max_levels}) did you use concat iterators?" ); } CompactionOptions::Leveled(LeveledCompactionOptions { @@ -338,23 +343,24 @@ pub fn check_compaction_ratio(storage: Arc) { }) => { assert!(l0_sst_num < level0_file_num_compaction_trigger); assert!(level_size.len() <= max_levels); - for idx in 1..level_size.len() { - let prev_size = level_size[idx - 1]; - let this_size = level_size[idx]; + let last_level_size = *level_size.last().unwrap(); + let mut multiplier = 1.0; + for idx in (1..level_size.len()).rev() { + multiplier *= level_size_multiplier as f64; + let this_size = level_size[idx - 1]; assert!( // do not add hard requirement on level size multiplier considering bloom filters... - this_size as f64 / prev_size as f64 >= (level_size_multiplier as f64 - 0.5), - "L{}/L{}, {}/{}<<{}", - state.levels[idx].0, + this_size as f64 / last_level_size as f64 <= 1.0 / multiplier + 0.5, + "L{}/L_max, {}/{}>>1.0/{}", state.levels[idx - 1].0, this_size, - prev_size, - level_size_multiplier + last_level_size, + multiplier ); } assert!( - num_iters <= l0_sst_num + num_memtables + max_levels, - "did you use concat iterators?" + num_iters <= l0_sst_num + num_memtables + max_levels + extra_iterators, + "we found {num_iters} iterators in your implementation, (l0_sst_num={l0_sst_num}, num_memtables={num_memtables}, max_levels={max_levels}) did you use concat iterators?" ); } CompactionOptions::Tiered(TieredCompactionOptions { @@ -395,8 +401,8 @@ pub fn check_compaction_ratio(storage: Arc) { sum_size += this_size; } assert!( - num_iters <= num_memtables + num_tiers, - "did you use concat iterators?" 
+ num_iters <= num_memtables + num_tiers + extra_iterators, + "we found {num_iters} iterators in your implementation, (num_memtables={num_memtables}, num_tiers={num_tiers}) did you use concat iterators?" ); } } diff --git a/mini-lsm/src/tests/week1_day2.rs b/mini-lsm/src/tests/week1_day2.rs index 6c1b002e1..92b5730a8 100644 --- a/mini-lsm/src/tests/week1_day2.rs +++ b/mini-lsm/src/tests/week1_day2.rs @@ -229,7 +229,11 @@ fn test_task2_merge_error() { ], 1, ); - let iter = MergeIterator::::create(vec![Box::new(i1), Box::new(i2)]); + let iter = MergeIterator::::create(vec![ + Box::new(i1.clone()), + Box::new(i1), + Box::new(i2), + ]); // your implementation should correctly throw an error instead of panic expect_iter_error(iter); } diff --git a/mini-lsm/src/tests/week1_day6.rs b/mini-lsm/src/tests/week1_day6.rs index a435d75dd..cb6a9c836 100644 --- a/mini-lsm/src/tests/week1_day6.rs +++ b/mini-lsm/src/tests/week1_day6.rs @@ -193,5 +193,5 @@ fn test_task3_sst_filter() { Bound::Excluded(format!("{:05}", 6000).as_bytes()), ) .unwrap(); - assert!(min_num < iter.num_active_iterators() && iter.num_active_iterators() < max_num); + assert!(min_num <= iter.num_active_iterators() && iter.num_active_iterators() < max_num); }