diff --git a/Cargo.lock b/Cargo.lock index 9af204564a7..40261cdb2cf 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -89,9 +89,9 @@ dependencies = [ [[package]] name = "anyhow" -version = "1.0.26" +version = "1.0.75" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7825f6833612eb2414095684fcf6c635becf3ce97fe48cf6421321e93bfbd53c" +checksum = "a4668cab20f66d8d020e1fbc0ebe47217433c1b6c8f2040faf858554e394ace6" [[package]] name = "api_version" @@ -147,7 +147,7 @@ dependencies = [ "lazy_static", "lexical-core", "multiversion", - "num 0.4.0", + "num 0.4.3", "rand 0.8.5", "regex", "serde", @@ -198,9 +198,8 @@ dependencies = [ [[package]] name = "async-speed-limit" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "481ce9cb6a828f4679495f7376cb6779978d925dd9790b99b48d1bbde6d0f00b" +version = "0.4.1" +source = "git+https://github.com/tikv/async-speed-limit?branch=master#a113aef3cc24bf7fa5faf2b7025abaf02fc53fe3" dependencies = [ "futures-core", "futures-io", @@ -252,13 +251,13 @@ dependencies = [ [[package]] name = "async-trait" -version = "0.1.58" +version = "0.1.69" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e805d94e6b5001b651426cf4cd446b1ab5f319d27bab5c644f61de0a804360c" +checksum = "7b2d0f03b3640e3a630367e40c468cb7f309529c708ed1d88597047b0e7c6ef7" dependencies = [ "proc-macro2", "quote", - "syn 1.0.103", + "syn 2.0.79", ] [[package]] @@ -657,6 +656,7 @@ dependencies = [ "collections", "criterion", "crossbeam", + "dashmap", "derive_more", "fail", "file_system", @@ -757,7 +757,7 @@ dependencies = [ "regex", "rustc-hash", "shlex 1.1.0", - "syn 2.0.18", + "syn 2.0.79", ] [[package]] @@ -828,9 +828,9 @@ checksum = "cdead85bdec19c194affaeeb670c0e41fe23de31459efd1c174d049269cf02cc" [[package]] name = "byteorder" -version = "1.3.4" +version = "1.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08c48aae112d48ed9f069b33538ea9e3e90aa263cfa3d1c24309612b1f7472de" +checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" [[package]] name = "bytes" @@ -1032,9 +1032,9 @@ dependencies = [ [[package]] name = "chrono-tz" -version = "0.5.1" +version = "0.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e0e430fad0384e4defc3dc6b1223d1b886087a8bf9b7080e5ae027f73851ea15" +checksum = "2554a3155fec064362507487171dcc4edc3df60cb10f3a1fb10ed8094822b120" dependencies = [ "chrono", "parse-zoneinfo", @@ -1305,7 +1305,7 @@ dependencies = [ "crossbeam-deque", "crossbeam-epoch", "crossbeam-queue", - "crossbeam-utils 0.8.8", + "crossbeam-utils", ] [[package]] @@ -1315,7 +1315,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c2dd04ddaf88237dc3b8d8f9a3c1004b506b54b3313403944054d23c0870c521" dependencies = [ "cfg-if 1.0.0", - "crossbeam-utils 0.8.8", + "crossbeam-utils", ] [[package]] @@ -1326,7 +1326,7 @@ checksum = "ce6fd6f855243022dcecf8702fef0c297d4338e226845fe067f6341ad9fa0cef" dependencies = [ "cfg-if 1.0.0", "crossbeam-epoch", - "crossbeam-utils 0.8.8", + "crossbeam-utils", ] [[package]] @@ -1337,7 +1337,7 @@ checksum = "1145cf131a2c6ba0615079ab6a638f7e1973ac9c2634fcbeaaad6114246efe8c" dependencies = [ "autocfg", "cfg-if 1.0.0", - "crossbeam-utils 0.8.8", + "crossbeam-utils", "lazy_static", "memoffset 0.6.4", "scopeguard", @@ -1350,7 +1350,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1f25d8400f4a7a5778f0e4e52384a48cbd9b5c495d110786187fc750075277a2" dependencies = [ 
"cfg-if 1.0.0", - "crossbeam-utils 0.8.8", + "crossbeam-utils", ] [[package]] @@ -1361,21 +1361,10 @@ checksum = "883a5821d7d079fcf34ac55f27a833ee61678110f6b97637cc74513c0d0b42fc" dependencies = [ "cfg-if 1.0.0", "crossbeam-epoch", - "crossbeam-utils 0.8.8", + "crossbeam-utils", "scopeguard", ] -[[package]] -name = "crossbeam-utils" -version = "0.7.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c3c7c73a2d1e9fc0886a08b93e98eb643461230d5f1925e4036204d5f2e261a8" -dependencies = [ - "autocfg", - "cfg-if 0.1.10", - "lazy_static", -] - [[package]] name = "crossbeam-utils" version = "0.8.8" @@ -1465,9 +1454,9 @@ dependencies = [ [[package]] name = "dashmap" -version = "5.1.0" +version = "5.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c0834a35a3fce649144119e18da2a4d8ed12ef3862f47183fd46f625d072d96c" +checksum = "4c8858831f7781322e539ea39e72449c46b059638250c14344fec8d0aa6e539c" dependencies = [ "cfg-if 1.0.0", "num_cpus", @@ -2102,7 +2091,7 @@ dependencies = [ "bcc", "collections", "crc32fast", - "crossbeam-utils 0.8.8", + "crossbeam-utils", "fs2", "lazy_static", "libc 0.2.146", @@ -2231,10 +2220,11 @@ checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b" [[package]] name = "form_urlencoded" -version = "1.2.0" +version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a62bc1cf6f830c2ec14a513a9fb124d0a213a629668a4186f329db21fe045652" +checksum = "5fc25a87fa4fd2094bffb06925852034d90a17f0d1e05197d4956d3555752191" dependencies = [ + "matches", "percent-encoding", ] @@ -2317,9 +2307,9 @@ dependencies = [ [[package]] name = "futures-channel" -version = "0.3.15" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e682a68b29a882df0545c143dc3646daefe80ba479bcdede94d5a703de2871e2" +checksum = "2dff15bf788c671c1934e366d07e30c1814a8ef514e1af724a602e8a2fbe1b10" dependencies = [ "futures-core", "futures-sink", @@ -2327,9 +2317,9 @@ dependencies = [ [[package]] name = "futures-core" -version = "0.3.15" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0402f765d8a89a26043b889b26ce3c4679d268fa6bb22cd7c6aad98340e179d1" +checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e" [[package]] name = "futures-executor" @@ -2345,9 +2335,9 @@ dependencies = [ [[package]] name = "futures-io" -version = "0.3.15" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "acc499defb3b348f8d8f3f66415835a9131856ff7714bf10dadfc4ec4bdb29a1" +checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6" [[package]] name = "futures-lite" @@ -2366,28 +2356,26 @@ dependencies = [ [[package]] name = "futures-macro" -version = "0.3.15" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4c40298486cdf52cc00cd6d6987892ba502c7656a16a4192a9992b1ccedd121" +checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" dependencies = [ - "autocfg", - "proc-macro-hack", "proc-macro2", "quote", - "syn 1.0.103", + "syn 2.0.79", ] [[package]] name = "futures-sink" -version = "0.3.15" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a57bead0ceff0d6dde8f465ecd96c9338121bb7717d3e7b108059531870c4282" +checksum = "e575fab7d1e0dcb8d0c7bcf9a63ee213816ab51902e6d244a95819acacf1d4f7" [[package]] name = "futures-task" -version = "0.3.15" +version = 
"0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a16bef9fc1a4dddb5bee51c989e3fbba26569cbb0e31f5b303c184e3dd33dae" +checksum = "f90f7dce0722e95104fcb095585910c0977252f286e354b5e3bd38902cd99988" [[package]] name = "futures-timer" @@ -2397,11 +2385,10 @@ checksum = "e64b03909df88034c26dc1547e8970b91f98bdb65165d6a4e9110d94263dbb2c" [[package]] name = "futures-util" -version = "0.3.15" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "feb5c238d27e2bf94ffdfd27b2c29e3df4a68c4193bb6427384259e2bf191967" +checksum = "9fa08315bb612088cc391249efdc3bc77536f16c91f6cf495e6fbe85b20a4a81" dependencies = [ - "autocfg", "futures 0.1.31", "futures-channel", "futures-core", @@ -2412,8 +2399,6 @@ dependencies = [ "memchr", "pin-project-lite", "pin-utils", - "proc-macro-hack", - "proc-macro-nested", "slab", ] @@ -2675,6 +2660,12 @@ dependencies = [ "allocator-api2", ] +[[package]] +name = "hashbrown" +version = "0.15.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf151400ff0baff5465007dd2f3e717f3fe502074ca563069ce3a6629d07b289" + [[package]] name = "heck" version = "0.3.1" @@ -2882,10 +2873,11 @@ checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" [[package]] name = "idna" -version = "0.4.0" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d20d6b07bfbc108882d88ed8e37d39636dcc260e15e30c45e6ba089610b917c" +checksum = "02e2673c30ee86b5b96a9cb52ad15718aa1f966f5ab9ad54a8b95d5ca33120a9" dependencies = [ + "matches", "unicode-bidi", "unicode-normalization", ] @@ -2908,12 +2900,12 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.0.0" +version = "2.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d5477fe2230a79769d8dc68e0eabf5437907c0457a5614a9e8dddb67f65eb65d" +checksum = "62f822373a4fe84d4bb149bf54e584a7f4abec90e072ed49cda0edea5b95471f" dependencies = [ "equivalent", - "hashbrown 0.14.0", + "hashbrown 0.15.2", ] [[package]] @@ -3099,7 +3091,7 @@ dependencies = [ [[package]] name = "kvproto" version = "0.0.2" -source = "git+https://github.com/pingcap/kvproto.git#7b612d935bf96f9daf7a537db379bcc88b4644e0" +source = "git+https://github.com/pingcap/kvproto.git?branch=release-7.5#c4a09794a10c8564d8b4645f45b4092b8ff0b29c" dependencies = [ "futures 0.3.15", "grpcio", @@ -3122,9 +3114,9 @@ checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" [[package]] name = "lexical-core" -version = "0.8.3" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "92912c4af2e7d9075be3e5e3122c4d7263855fa6cce34fbece4dd08e5884624d" +checksum = "2cde5de06e8d4c2faabc400238f9ae1c74d5412d03a7bd067645ccbc47070e46" dependencies = [ "lexical-parse-float", "lexical-parse-integer", @@ -3135,9 +3127,9 @@ dependencies = [ [[package]] name = "lexical-parse-float" -version = "0.8.3" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f518eed87c3be6debe6d26b855c97358d8a11bf05acec137e5f53080f5ad2dd8" +checksum = "683b3a5ebd0130b8fb52ba0bdc718cc56815b6a097e28ae5a6997d0ad17dc05f" dependencies = [ "lexical-parse-integer", "lexical-util", @@ -3146,9 +3138,9 @@ dependencies = [ [[package]] name = "lexical-parse-integer" -version = "0.8.3" +version = "0.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "afc852ec67c6538bbb2b9911116a385b24510e879a69ab516e6a151b15a79168" +checksum = 
"6d0994485ed0c312f6d965766754ea177d07f9c00c9b82a5ee62ed5b47945ee9" dependencies = [ "lexical-util", "static_assertions", @@ -3156,18 +3148,18 @@ dependencies = [ [[package]] name = "lexical-util" -version = "0.8.3" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c72a9d52c5c4e62fa2cdc2cb6c694a39ae1382d9c2a17a466f18e272a0930eb1" +checksum = "5255b9ff16ff898710eb9eb63cb39248ea8a5bb036bea8085b1a767ff6c4e3fc" dependencies = [ "static_assertions", ] [[package]] name = "lexical-write-float" -version = "0.8.4" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a89ec1d062e481210c309b672f73a0567b7855f21e7d2fae636df44d12e97f9" +checksum = "accabaa1c4581f05a3923d1b4cfd124c329352288b7b9da09e766b0668116862" dependencies = [ "lexical-util", "lexical-write-integer", @@ -3176,9 +3168,9 @@ dependencies = [ [[package]] name = "lexical-write-integer" -version = "0.8.3" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "094060bd2a7c2ff3a16d5304a6ae82727cb3cc9d1c70f813cc73f744c319337e" +checksum = "e1b6f3d1f4422866b68192d62f77bc5c700bee84f3069f2469d7bc8c77852446" dependencies = [ "lexical-util", "static_assertions", @@ -3228,7 +3220,7 @@ dependencies = [ [[package]] name = "librocksdb_sys" version = "0.1.0" -source = "git+https://github.com/tikv/rust-rocksdb.git#0c78f4072d766b152e83b25d3068b5c72b5feca1" +source = "git+https://github.com/tikv/rust-rocksdb.git?branch=tikv-7.5#268d20d61b8bf097f064a87a9af3cb91725ff179" dependencies = [ "bindgen 0.65.1", "bzip2-sys", @@ -3247,7 +3239,7 @@ dependencies = [ [[package]] name = "libtitan_sys" version = "0.0.1" -source = "git+https://github.com/tikv/rust-rocksdb.git#0c78f4072d766b152e83b25d3068b5c72b5feca1" +source = "git+https://github.com/tikv/rust-rocksdb.git?branch=tikv-7.5#268d20d61b8bf097f064a87a9af3cb91725ff179" dependencies = [ "bzip2-sys", "cc", @@ -3317,9 +3309,9 @@ dependencies = [ [[package]] name = "lz4-sys" -version = "1.9.2" +version = "1.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dca79aa95d8b3226213ad454d328369853be3a1382d89532a854f4d69640acae" +checksum = "e9764018d143cc854c9f17f0b907de70f14393b1f502da6375dce70f00514eb3" dependencies = [ "cc", "libc 0.2.146", @@ -3392,18 +3384,18 @@ dependencies = [ [[package]] name = "memmap2" -version = "0.5.3" +version = "0.5.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "057a3db23999c867821a7a59feb06a578fcb03685e983dff90daf9e7d24ac08f" +checksum = "83faa42c0a078c393f6b29d5db232d8be22776a891f8f56e5284faee4a20b327" dependencies = [ "libc 0.2.146", ] [[package]] name = "memmap2" -version = "0.7.0" +version = "0.9.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "180d4b35be83d33392d1d1bfbd2ae1eca7ff5de1a94d3fc87faaa99a069e7cbd" +checksum = "45fd3a57831bf88bc63f8cebc0cf956116276e97fef3966103e96416209f7c92" dependencies = [ "libc 0.2.146", ] @@ -3810,25 +3802,24 @@ dependencies = [ [[package]] name = "num" -version = "0.4.0" +version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "43db66d1170d347f9a065114077f7dccb00c1b9478c89384490a3425279a4606" +checksum = "35bd024e8b2ff75562e5f34e7f4905839deb4b22955ef5e73d2fea1b9813cb23" dependencies = [ "num-bigint", - "num-complex 0.4.1", + "num-complex 0.4.6", "num-integer", "num-iter", - "num-rational 0.4.0", + "num-rational 0.4.2", "num-traits", ] [[package]] name = "num-bigint" -version = "0.4.3" 
+version = "0.4.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f93ab6289c7b344a8a9f60f88d80aa20032336fe78da341afc91c8a2341fc75f" +checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9" dependencies = [ - "autocfg", "num-integer", "num-traits", ] @@ -3844,9 +3835,9 @@ dependencies = [ [[package]] name = "num-complex" -version = "0.4.1" +version = "0.4.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97fbc387afefefd5e9e39493299f3069e14a140dd34dc19b4c1c1a8fddb6a790" +checksum = "73f88a1307638156682bada9d7604135552957b7818057dcef22705b4d509495" dependencies = [ "num-traits", ] @@ -3862,6 +3853,17 @@ dependencies = [ "syn 1.0.103", ] +[[package]] +name = "num-derive" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed3955f1a9c7c0c15e092f9c887db08b1fc683305fdf6eb6684f22555355e202" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.79", +] + [[package]] name = "num-format" version = "0.4.0" @@ -3874,19 +3876,18 @@ dependencies = [ [[package]] name = "num-integer" -version = "0.1.44" +version = "0.1.46" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d2cc698a63b549a70bc047073d2949cce27cd1c7b0a4a862d08a8031bc2801db" +checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f" dependencies = [ - "autocfg", "num-traits", ] [[package]] name = "num-iter" -version = "0.1.42" +version = "0.1.45" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b2021c8337a54d21aca0d59a92577a029af9431cb59b909b03252b9c164fad59" +checksum = "1429034a0490724d0075ebb2bc9e875d6503c3cf69e235a8941aa757d83ef5bf" dependencies = [ "autocfg", "num-integer", @@ -3906,11 +3907,10 @@ dependencies = [ [[package]] name = "num-rational" -version = "0.4.0" +version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d41702bd167c2df5520b384281bc111a4b5efcf7fbc4c9c222c815b07e0a6a6a" +checksum = "f83d14da390562dca69fc84082e73e548e1ad308d24accdedd2720017cb37824" dependencies = [ - "autocfg", "num-bigint", "num-integer", "num-traits", @@ -3918,9 +3918,9 @@ dependencies = [ [[package]] name = "num-traits" -version = "0.2.14" +version = "0.2.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a64b1ec5cda2586e284722486d802acf1f7dbdc623e2bfc57e65ca1cd099290" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" dependencies = [ "autocfg", ] @@ -4142,9 +4142,9 @@ dependencies = [ [[package]] name = "parse-zoneinfo" -version = "0.2.0" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "089a398ccdcdd77b8c38909d5a1e4b67da1bc4c9dbfe6d5b536c828eddb779e5" +checksum = "c705f256449c60da65e11ff6626e0c16a0a0b96aaa348de61376b249bc340f41" dependencies = [ "regex", ] @@ -4199,9 +4199,9 @@ checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099" [[package]] name = "percent-encoding" -version = "2.3.0" +version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b2a4787296e9989611394c33f193f676704af1686e70b8f8033ab5ba9a35a94" +checksum = "d4fd5641d01c8f18a23da7b6fe29298ff4b55afcccdf78973b24cf3175fee32e" [[package]] name = "perfcnt" @@ -4377,14 +4377,14 @@ version = "0.3.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e30165d31df606f5726b090ec7592c308a0eaf61721ff64c9a3018e344a8753e" dependencies = [ - "portable-atomic 1.4.2", 
+ "portable-atomic 1.10.0", ] [[package]] name = "portable-atomic" -version = "1.4.2" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f32154ba0af3a075eefa1eda8bb414ee928f62303a54ea85b8d6638ff1a6ee9e" +checksum = "280dc24453071f1b63954171985a0b0d30058d287960968b9b2aca264c8d4ee6" [[package]] name = "pprof" @@ -4432,7 +4432,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3b69d39aab54d069e7f2fe8cb970493e7834601ca2d8c65fd7bbd183578080d1" dependencies = [ "proc-macro2", - "syn 2.0.18", + "syn 2.0.79", ] [[package]] @@ -4469,23 +4469,11 @@ dependencies = [ "version_check 0.9.4", ] -[[package]] -name = "proc-macro-hack" -version = "0.5.19" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dbf0c48bc1d91375ae5c3cd81e3722dff1abcf81a30960240640d223f59fe0e5" - -[[package]] -name = "proc-macro-nested" -version = "0.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "369a6ed065f249a159e06c45752c780bda2fb53c995718f9e484d08daa9eb42e" - [[package]] name = "proc-macro2" -version = "1.0.60" +version = "1.0.87" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dec2b086b7a862cf4de201096214fa870344cf922b2b30c167badb3af3195406" +checksum = "b3e4daa0dcf6feba26f985457cdf104d4b4256fc5a09547140f3631bb076b19a" dependencies = [ "unicode-ident", ] @@ -4891,9 +4879,9 @@ dependencies = [ [[package]] name = "quote" -version = "1.0.28" +version = "1.0.37" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b9ab9c7eadfd8df19006f1cf1a4aed13540ed5cbc047010ece5826e10825488" +checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af" dependencies = [ "proc-macro2", ] @@ -4901,7 +4889,7 @@ dependencies = [ [[package]] name = "raft" version = "0.7.0" -source = "git+https://github.com/tikv/raft-rs?branch=master#9d360a3b0cdb691da8e500a4f73c457b605a1d73" +source = "git+https://github.com/tikv/raft-rs?branch=master#f60fb9e143e5b93f7db8917ea376cda04effcbb4" dependencies = [ "bytes", "fxhash", @@ -4915,8 +4903,8 @@ dependencies = [ [[package]] name = "raft-engine" -version = "0.3.0" -source = "git+https://github.com/tikv/raft-engine.git#de3ad04a2db9cdf795b1c82d7413b9b53bac92a8" +version = "0.4.1" +source = "git+https://github.com/tikv/raft-engine.git?branch=tikv-7.5#8cb23f127f0caf48076a3d3d0e4e2a27a8c8b0ed" dependencies = [ "byteorder", "crc32fast", @@ -4930,15 +4918,16 @@ dependencies = [ "libc 0.2.146", "log", "lz4-sys", - "memmap2 0.7.0", + "memmap2 0.9.3", "nix 0.26.2", - "num-derive", + "num-derive 0.4.2", "num-traits", "parking_lot 0.12.1", "prometheus", "prometheus-static-metric", "protobuf", "rayon", + "rhai", "scopeguard", "serde", "serde_repr", @@ -4949,7 +4938,7 @@ dependencies = [ [[package]] name = "raft-proto" version = "0.7.0" -source = "git+https://github.com/tikv/raft-rs?branch=master#9d360a3b0cdb691da8e500a4f73c457b605a1d73" +source = "git+https://github.com/tikv/raft-rs?branch=master#f60fb9e143e5b93f7db8917ea376cda04effcbb4" dependencies = [ "bytes", "protobuf", @@ -5039,6 +5028,8 @@ dependencies = [ "slog-global", "smallvec", "sst_importer", + "strum 0.20.0", + "strum_macros 0.24.3", "tempfile", "test_sst_importer", "thiserror", @@ -5264,7 +5255,7 @@ checksum = "258bcdb5ac6dad48491bb2992db6b7cf74878b0384908af124823d118c99683f" dependencies = [ "crossbeam-channel", "crossbeam-deque", - "crossbeam-utils 0.8.8", + "crossbeam-utils", "num_cpus", ] @@ -5482,6 +5473,32 @@ dependencies = [ "bytemuck", ] 
+[[package]] +name = "rhai" +version = "1.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f06953bb8b9e4307cb7ccc0d9d018e2ddd25a30d32831f631ce4fe8f17671f7" +dependencies = [ + "ahash 0.7.4", + "bitflags", + "instant", + "num-traits", + "rhai_codegen", + "smallvec", + "smartstring", +] + +[[package]] +name = "rhai_codegen" +version = "1.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75a39bc2aa9258b282ee5518dac493491a9c4c11a6d7361b9d2644c922fc6488" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.103", +] + [[package]] name = "ring" version = "0.16.16" @@ -5500,7 +5517,7 @@ dependencies = [ [[package]] name = "rocksdb" version = "0.3.0" -source = "git+https://github.com/tikv/rust-rocksdb.git#0c78f4072d766b152e83b25d3068b5c72b5feca1" +source = "git+https://github.com/tikv/rust-rocksdb.git?branch=tikv-7.5#268d20d61b8bf097f064a87a9af3cb91725ff179" dependencies = [ "libc 0.2.146", "librocksdb_sys", @@ -5816,9 +5833,9 @@ dependencies = [ [[package]] name = "serde" -version = "1.0.147" +version = "1.0.194" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d193d69bae983fc11a79df82342761dfbf28a99fc8d203dca4c3c1b590948965" +checksum = "0b114498256798c94a0689e1a15fec6005dee8ac1f41de56404b67afc2a4b773" dependencies = [ "serde_derive", ] @@ -5835,13 +5852,13 @@ dependencies = [ [[package]] name = "serde_derive" -version = "1.0.147" +version = "1.0.194" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4f1d362ca8fc9c3e3a7484440752472d68a6caa98f1ab81d99b5dfe517cec852" +checksum = "a3385e45322e8f9931410f01b3031ec534c3947d0e94c18049af4d9f9907d4e0" dependencies = [ "proc-macro2", "quote", - "syn 1.0.103", + "syn 2.0.79", ] [[package]] @@ -5951,7 +5968,6 @@ dependencies = [ "error_code", "fail", "file_system", - "fs2", "futures 0.3.15", "grpcio", "grpcio-health", @@ -6146,6 +6162,17 @@ version = "1.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f2dd574626839106c320a323308629dcb1acfc96e32a8cba364ddc61ac23ee83" +[[package]] +name = "smartstring" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3fb72c633efbaa2dd666986505016c32c3044395ceaf881518399d2f4127ee29" +dependencies = [ + "autocfg", + "static_assertions", + "version_check 0.9.4", +] + [[package]] name = "snap_recovery" version = "0.1.0" @@ -6184,7 +6211,7 @@ dependencies = [ [[package]] name = "snappy-sys" version = "0.1.0" -source = "git+https://github.com/busyjay/rust-snappy.git?branch=static-link#8c12738bad811397600455d6982aff754ea2ac44" +source = "git+https://github.com/tikv/rust-snappy.git?branch=static-link#8c12738bad811397600455d6982aff754ea2ac44" dependencies = [ "cmake", "libc 0.2.146", @@ -6336,7 +6363,7 @@ version = "0.25.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "290d54ea6f91c969195bdbcd7442c8c2a2ba87da8bf60a7ee86a235d4bc1e125" dependencies = [ - "strum_macros 0.25.0", + "strum_macros 0.25.3", ] [[package]] @@ -6353,15 +6380,28 @@ dependencies = [ [[package]] name = "strum_macros" -version = "0.25.0" +version = "0.24.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fe9f3bd7d2e45dcc5e265fbb88d6513e4747d8ef9444cf01a533119bce28a157" +checksum = "1e385be0d24f186b4ce2f9982191e7101bb737312ad61c1f2f984f34bcf85d59" dependencies = [ "heck 0.4.1", "proc-macro2", "quote", "rustversion", - "syn 2.0.18", + "syn 1.0.103", +] + +[[package]] +name = "strum_macros" +version = 
"0.25.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "23dc1fa9ac9c169a78ba62f0b841814b7abae11bdd047b9c58f893439e309ea0" +dependencies = [ + "heck 0.4.1", + "proc-macro2", + "quote", + "rustversion", + "syn 2.0.79", ] [[package]] @@ -6377,7 +6417,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ac457d054f793cedfde6f32d21d692b8351cfec9084fefd0470c0373f6d799bc" dependencies = [ "debugid", - "memmap2 0.5.3", + "memmap2 0.5.10", "stable_deref_trait", "uuid 1.2.1", ] @@ -6406,9 +6446,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.18" +version = "2.0.79" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32d41677bcbe24c20c52e7c70b0d8db04134c5d1066bf98662e2871ad200ea3e" +checksum = "89132cd0bf050864e1d38dc3bbc07a0eb8e7530af26344d3d2bbbef83499f590" dependencies = [ "proc-macro2", "quote", @@ -6547,6 +6587,7 @@ dependencies = [ "collections", "concurrency_manager", "crc64fast", + "engine_rocks", "engine_traits", "external_storage_export", "file_system", @@ -6556,6 +6597,7 @@ dependencies = [ "grpcio", "kvproto", "protobuf", + "raftstore", "rand 0.8.5", "tempfile", "test_raftstore", @@ -6785,7 +6827,6 @@ name = "tests" version = "0.0.1" dependencies = [ "api_version", - "arrow", "async-trait", "batch-system", "byteorder", @@ -6979,7 +7020,7 @@ dependencies = [ "match-template", "nom 7.1.0", "num 0.3.0", - "num-derive", + "num-derive 0.3.0", "num-traits", "ordered-float", "protobuf", @@ -7065,7 +7106,7 @@ dependencies = [ [[package]] name = "tikv" -version = "7.4.0-alpha" +version = "7.5.4" dependencies = [ "anyhow", "api_version", @@ -7293,6 +7334,7 @@ dependencies = [ "derive_more", "error_code", "fail", + "fs2", "futures 0.3.15", "futures-util", "gag", @@ -7309,6 +7351,7 @@ dependencies = [ "num_cpus", "online_config", "openssl", + "ordered-float", "page_size", "panic_hook", "parking_lot_core 0.9.1", @@ -7393,21 +7436,6 @@ dependencies = [ "serde_json", ] -[[package]] -name = "tinyvec" -version = "1.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87cc5ceb3875bb20c2890005a4e226a4651264a5c75edb2421b52861a0a0cb50" -dependencies = [ - "tinyvec_macros", -] - -[[package]] -name = "tinyvec_macros" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" - [[package]] name = "tipb" version = "0.0.1" @@ -7450,11 +7478,10 @@ dependencies = [ [[package]] name = "tokio-executor" -version = "0.1.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fb2d1b8f4548dbf5e1f7818512e9c406860678f29c300cdf0ebac72d1a3a1671" +version = "0.1.9" +source = "git+https://github.com/tikv/tokio?branch=tokio-timer-hotfix#4394380fa3c1f7f2c702a4ccc5ff01384746fdfd" dependencies = [ - "crossbeam-utils 0.7.2", + "crossbeam-utils", "futures 0.1.31", ] @@ -7515,9 +7542,9 @@ dependencies = [ [[package]] name = "tokio-timer" version = "0.2.13" -source = "git+https://github.com/tikv/tokio?branch=tokio-timer-hotfix#e8ac149d93f4a9bf49ea569d8d313ee40c5eb448" +source = "git+https://github.com/tikv/tokio?branch=tokio-timer-hotfix#4394380fa3c1f7f2c702a4ccc5ff01384746fdfd" dependencies = [ - "crossbeam-utils 0.7.2", + "crossbeam-utils", "futures 0.1.31", "slab", "tokio-executor", @@ -7549,9 +7576,9 @@ dependencies = [ [[package]] name = "toml_datetime" -version = "0.6.3" +version = "0.6.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"7cda73e2f1397b1262d6dfdcef8aafae14d1de7748d66822d3bfeeb6d03e5e4b" +checksum = "0dd7358ecb8fc2f8d014bf86f6f638ce72ba252a2c3a2572f2a795f1d23efb41" [[package]] name = "toml_edit" @@ -7559,7 +7586,7 @@ version = "0.19.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c500344a19072298cd05a7224b3c0c629348b78692bf48466c5238656e315a78" dependencies = [ - "indexmap 2.0.0", + "indexmap 2.7.0", "toml_datetime", "winnow", ] @@ -7708,7 +7735,7 @@ name = "tracker" version = "0.0.1" dependencies = [ "collections", - "crossbeam-utils 0.8.8", + "crossbeam-utils", "kvproto", "lazy_static", "parking_lot 0.12.1", @@ -7735,9 +7762,13 @@ dependencies = [ [[package]] name = "twox-hash" -version = "1.5.0" +version = "1.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3bfd5b7557925ce778ff9b9ef90e3ade34c524b5ff10e239c69a42d546d2af56" +checksum = "97fee6b57c6a41524a810daee9286c02d7752c4253064d0b05472833a438f675" +dependencies = [ + "cfg-if 1.0.0", + "static_assertions", +] [[package]] name = "txn_types" @@ -7779,9 +7810,12 @@ checksum = "eeba86d422ce181a719445e51872fa30f1f7413b62becb52e95ec91aa262d85c" [[package]] name = "unicode-bidi" -version = "0.3.13" +version = "0.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "92888ba5573ff080736b3648696b70cafad7d250551175acbaa4e0385b3e1460" +checksum = "49f2bd0c6468a8230e1db229cff8029217cf623c767ea5d60bfbd42729ea54d5" +dependencies = [ + "matches", +] [[package]] name = "unicode-ident" @@ -7791,11 +7825,11 @@ checksum = "6ceab39d59e4c9499d4e5a8ee0e2735b891bb7308ac83dfb4e80cad195c9f6f3" [[package]] name = "unicode-normalization" -version = "0.1.22" +version = "0.1.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c5713f0fc4b5db668a2ac63cdb7bb4469d8c9fed047b1d0292cc7b0ce2ba921" +checksum = "5479532badd04e128284890390c1e876ef7a993d0570b3597ae43dfa1d59afa4" dependencies = [ - "tinyvec", + "smallvec", ] [[package]] @@ -7824,12 +7858,13 @@ checksum = "a156c684c91ea7d62626509bce3cb4e1d9ed5c4d978f7b4352658f96a4c26b4a" [[package]] name = "url" -version = "2.4.0" +version = "2.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50bff7831e19200a85b17131d085c25d7811bc4e186efdaf54bbd132994a88cb" +checksum = "a507c383b2d33b5fc35d1861e77e6b383d158b2da5e14fe51b83dfedf6fd578c" dependencies = [ "form_urlencoded", "idna", + "matches", "percent-encoding", "serde", ] @@ -8258,7 +8293,7 @@ source = "git+https://github.com/tikv/yatp.git?branch=master#5572a78702572087cab dependencies = [ "crossbeam-deque", "crossbeam-skiplist", - "crossbeam-utils 0.8.8", + "crossbeam-utils", "dashmap", "fail", "lazy_static", diff --git a/Cargo.toml b/Cargo.toml index b8839bcb1f4..c9857bfdf8d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "tikv" -version = "7.4.0-alpha" +version = "7.5.4" authors = ["The TiKV Authors"] description = "A distributed transactional key-value database powered by Rust and Raft" license = "Apache-2.0" @@ -12,7 +12,13 @@ edition = "2021" publish = false [features] -default = ["test-engine-kv-rocksdb", "test-engine-raft-raft-engine", "cloud-aws", "cloud-gcp", "cloud-azure"] +default = [ + "test-engine-kv-rocksdb", + "test-engine-raft-raft-engine", + "cloud-aws", + "cloud-gcp", + "cloud-azure", +] trace-tablet-lifetime = ["engine_rocks/trace-lifetime"] tcmalloc = ["tikv_alloc/tcmalloc"] jemalloc = ["tikv_alloc/jemalloc", "engine_rocks/jemalloc"] @@ -21,11 +27,24 @@ snmalloc = ["tikv_alloc/snmalloc"] 
portable = ["engine_rocks/portable"] sse = ["engine_rocks/sse"] mem-profiling = ["tikv_alloc/mem-profiling"] -failpoints = ["fail/failpoints", "raftstore/failpoints", "tikv_util/failpoints", "engine_rocks/failpoints"] +failpoints = [ + "fail/failpoints", + "raftstore/failpoints", + "tikv_util/failpoints", + "engine_rocks/failpoints", + "raft_log_engine/failpoints", +] cloud-aws = ["encryption_export/cloud-aws", "sst_importer/cloud-aws"] cloud-gcp = ["encryption_export/cloud-gcp", "sst_importer/cloud-gcp"] cloud-azure = ["encryption_export/cloud-azure", "sst_importer/cloud-azure"] -testexport = ["raftstore/testexport", "api_version/testexport", "causal_ts/testexport", "engine_traits/testexport", "engine_rocks/testexport", "engine_panic/testexport"] +testexport = [ + "raftstore/testexport", + "api_version/testexport", + "causal_ts/testexport", + "engine_traits/testexport", + "engine_rocks/testexport", + "engine_panic/testexport", +] test-engine-kv-rocksdb = ["engine_test/test-engine-kv-rocksdb"] test-engine-raft-raft-engine = ["engine_test/test-engine-raft-raft-engine"] test-engines-rocksdb = ["engine_test/test-engines-rocksdb"] @@ -73,7 +92,10 @@ flate2 = { version = "1.0", default-features = false, features = ["zlib"] } futures = { version = "0.3", features = ["thread-pool", "compat"] } futures-executor = "0.3.1" futures-timer = "3.0" -futures-util = { version = "0.3.1", default-features = false, features = ["io", "async-await"] } +futures-util = { version = "0.3.1", default-features = false, features = [ + "io", + "async-await", +] } fxhash = "0.2.1" getset = "0.1" grpcio = { workspace = true } @@ -90,7 +112,10 @@ kvproto = { workspace = true } lazy_static = "1.3" libc = "0.2" libloading = "0.7" -log = { version = "0.4", features = ["max_level_trace", "release_max_level_debug"] } +log = { version = "0.4", features = [ + "max_level_trace", + "release_max_level_debug", +] } log_wrappers = { workspace = true } match-template = "0.0.1" memory_trace_macros = { workspace = true } @@ -108,7 +133,10 @@ paste = "1.0" pd_client = { workspace = true } pin-project = "1.0" pnet_datalink = "0.23" -pprof = { version = "0.11", default-features = false, features = ["flamegraph", "protobuf-codec"] } +pprof = { version = "0.11", default-features = false, features = [ + "flamegraph", + "protobuf-codec", +] } prometheus = { version = "0.13", features = ["nightly"] } prometheus-static-metric = "0.5" protobuf = { version = "2.8", features = ["bytes"] } @@ -161,7 +189,7 @@ yatp = { workspace = true } [dev-dependencies] api_version = { workspace = true, features = ["testexport"] } -example_coprocessor_plugin = { workspace = true } # should be a binary dependency +example_coprocessor_plugin = { workspace = true } # should be a binary dependency hyper-openssl = "0.9" panic_hook = { workspace = true } raftstore = { workspace = true, features = ["testexport"] } @@ -189,7 +217,7 @@ rusoto_mock = { git = "https://github.com/tikv/rusoto", branch = "gh1482-s3-addr rusoto_s3 = { git = "https://github.com/tikv/rusoto", branch = "gh1482-s3-addr-styles" } rusoto_sts = { git = "https://github.com/tikv/rusoto", branch = "gh1482-s3-addr-styles" } -snappy-sys = { git = "https://github.com/busyjay/rust-snappy.git", branch = "static-link" } +snappy-sys = { git = "https://github.com/tikv/rust-snappy.git", branch = "static-link" } # remove this when https://github.com/danburkert/fs2-rs/pull/42 is merged. 
fs2 = { git = "https://github.com/tabokie/fs2-rs", branch = "tikv" } @@ -234,9 +262,9 @@ members = [ "components/encryption", "components/encryption/export", "components/engine_rocks_helper", -# Only enable tirocks in local development, otherwise it can slow down compilation. -# TODO: always enable tirocks and remove engine_rocks. -# "components/engine_tirocks", + # Only enable tirocks in local development, otherwise it can slow down compilation. + # TODO: always enable tirocks and remove engine_rocks. + # "components/engine_tirocks", "components/error_code", "components/external_storage", "components/external_storage/export", @@ -344,9 +372,9 @@ resource_metering = { path = "components/resource_metering" } security = { path = "components/security" } server = { path = "components/server" } service = { path = "components/service" } -snap_recovery = { path = "components/snap_recovery" } +snap_recovery = { path = "components/snap_recovery", default-features = false } sst_importer = { path = "components/sst_importer", default-features = false } -test_backup = { path = "components/test_backup" } +test_backup = { path = "components/test_backup", default-features = false } test_coprocessor = { path = "components/test_coprocessor", default-features = false } example_coprocessor_plugin = { path = "components/test_coprocessor_plugin/example_plugin" } test_pd = { path = "components/test_pd" } @@ -371,14 +399,30 @@ tipb_helper = { path = "components/tipb_helper" } tracker = { path = "components/tracker" } txn_types = { path = "components/txn_types" } # External libs -raft = { version = "0.7.0", default-features = false, features = ["protobuf-codec"] } -grpcio = { version = "0.10.4", default-features = false, features = ["openssl-vendored", "protobuf-codec", "nightly"] } -grpcio-health = { version = "0.10.4", default-features = false, features = ["protobuf-codec"] } +raft = { version = "0.7.0", default-features = false, features = [ + "protobuf-codec", +] } +raft-engine = { git = "https://github.com/tikv/raft-engine.git", branch = "tikv-7.5", features = [ + "swap", +] } +raft-engine-ctl = { git = "https://github.com/tikv/raft-engine.git", branch = "tikv-7.5" } +grpcio = { version = "0.10.4", default-features = false, features = [ + "openssl-vendored", + "protobuf-codec", + "nightly", +] } +grpcio-health = { version = "0.10.4", default-features = false, features = [ + "protobuf-codec", +] } tipb = { git = "https://github.com/pingcap/tipb.git" } -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { git = "https://github.com/pingcap/kvproto.git", branch = "release-7.5" } yatp = { git = "https://github.com/tikv/yatp.git", branch = "master" } tokio-timer = { git = "https://github.com/tikv/tokio", branch = "tokio-timer-hotfix" } -slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } +tokio-executor = { git = "https://github.com/tikv/tokio", branch = "tokio-timer-hotfix" } +slog = { version = "2.3", features = [ + "max_level_trace", + "release_max_level_debug", +] } slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } # TiFlash libs diff --git a/OWNERS b/OWNERS new file mode 100644 index 00000000000..f70f2cbf0ea --- /dev/null +++ b/OWNERS @@ -0,0 +1,79 @@ +# See the OWNERS docs at https://go.k8s.io/owners +approvers: + - 5kbpers + - AndreMouche + - andylokandy + - breezewish + - brson + - bufferflies + - BusyJay + - cfzjywxk + - Connor1996 + - coocood + - crazycs520 + - 
disksing + - ekexium + - gengliqi + - glorv + - hicqu + - hunterlxt + - imtbkcat + - innerr + - iosmanthus + - jackysp + - kennytm + - Little-Wallace + - liuzix + - lonng + - LykxSassinator + - lysu + - marsishandsome + - MyonKeminta + - niedhui + - NingLin-P + - nrc + - overvenus + - pingyu + - skyzh + - SpadeA-Tang + - sticnarf + - sunxiaoguang + - tabokie + - TennyZhuang + - tonyxuqqi + - yiwu-arbug + - you06 + - youjiali1995 + - YuJuncen + - zhangjinpeng87 + - zhongzc + - zhouqiang-cl + - zyguan +reviewers: + - 3AceShowHand + - 3pointer + - CalvinNeo + - ethercflow + - fredchenbj + - Fullstop000 + - gozssky + - haojinming + - hbisheng + - HuSharp + - jayzhan211 + - Jibbow + - JmPotato + - Leavrth + - lhy1024 + - longfangsong + - lzmhhh123 + - Mossaka + - MrCroxx + - nolouch + - rleungx + - Rustin170506 + - tier-cap + - v01dstar + - wjhuang2016 + - wshwsh12 + - Xuanwo diff --git a/OWNERS_ALIASES b/OWNERS_ALIASES new file mode 100644 index 00000000000..cd4a74373a9 --- /dev/null +++ b/OWNERS_ALIASES @@ -0,0 +1,14 @@ +# Sort the member alphabetically. +aliases: + sig-critical-approvers-config-components: + - easonn7 + - kevin-xianliu + - zhangjinpeng87 + sig-critical-approvers-config-src: + - easonn7 + - kevin-xianliu + - cfzjywxk + - zhangjinpeng87 + sig-critical-approvers-config-cdc: + - BenMeadowcroft + - kevin-xianliu diff --git a/cmd/tikv-ctl/Cargo.toml b/cmd/tikv-ctl/Cargo.toml index a36e72b3c64..b8854f98f46 100644 --- a/cmd/tikv-ctl/Cargo.toml +++ b/cmd/tikv-ctl/Cargo.toml @@ -6,7 +6,13 @@ edition = "2021" publish = false [features] -default = ["test-engine-kv-rocksdb", "test-engine-raft-raft-engine", "cloud-aws", "cloud-gcp", "cloud-azure"] +default = [ + "test-engine-kv-rocksdb", + "test-engine-raft-raft-engine", + "cloud-aws", + "cloud-gcp", + "cloud-azure", +] tcmalloc = ["tikv/tcmalloc"] jemalloc = ["tikv/jemalloc"] mimalloc = ["tikv/mimalloc"] @@ -15,32 +21,15 @@ portable = ["tikv/portable"] sse = ["tikv/sse"] mem-profiling = ["tikv/mem-profiling"] failpoints = ["tikv/failpoints"] -cloud-aws = [ - "encryption_export/cloud-aws", - "backup/cloud-aws", -] -cloud-gcp = [ - "encryption_export/cloud-gcp", - "backup/cloud-gcp", -] -cloud-azure = [ - "encryption_export/cloud-azure", - "backup/cloud-azure", -] +cloud-aws = ["encryption_export/cloud-aws", "backup/cloud-aws"] +cloud-gcp = ["encryption_export/cloud-gcp", "backup/cloud-gcp"] +cloud-azure = ["encryption_export/cloud-azure", "backup/cloud-azure"] cloud-storage-grpc = ["backup/cloud-storage-grpc"] cloud-storage-dylib = ["backup/cloud-storage-dylib"] -test-engine-kv-rocksdb = [ - "tikv/test-engine-kv-rocksdb" -] -test-engine-raft-raft-engine = [ - "tikv/test-engine-raft-raft-engine" -] -test-engines-rocksdb = [ - "tikv/test-engines-rocksdb", -] -test-engines-panic = [ - "tikv/test-engines-panic", -] +test-engine-kv-rocksdb = ["tikv/test-engine-kv-rocksdb"] +test-engine-raft-raft-engine = ["tikv/test-engine-raft-raft-engine"] +test-engines-rocksdb = ["tikv/test-engines-rocksdb"] +test-engines-panic = ["tikv/test-engines-panic"] nortcheck = ["engine_rocks/nortcheck"] @@ -65,14 +54,17 @@ hex = "0.4" keys = { workspace = true } kvproto = { workspace = true } libc = "0.2" -log = { version = "0.4", features = ["max_level_trace", "release_max_level_debug"] } +log = { version = "0.4", features = [ + "max_level_trace", + "release_max_level_debug", +] } log_wrappers = { workspace = true } pd_client = { workspace = true } prometheus = { version = "0.13", features = ["nightly"] } protobuf = { version = "2.8", features = ["bytes"] } raft = 
{ workspace = true } -raft-engine = { git = "https://github.com/tikv/raft-engine.git" } -raft-engine-ctl = { git = "https://github.com/tikv/raft-engine.git" } +raft-engine = { workspace = true } +raft-engine-ctl = { workspace = true } raft_log_engine = { workspace = true } raftstore = { workspace = true } rand = "0.8" diff --git a/cmd/tikv-ctl/src/executor.rs b/cmd/tikv-ctl/src/executor.rs index a145118acea..80b1898032d 100644 --- a/cmd/tikv-ctl/src/executor.rs +++ b/cmd/tikv-ctl/src/executor.rs @@ -240,6 +240,7 @@ pub trait DebugExecutor { json!({ "region": json!({ "id": r.get_id(), + "state": format!("{:?}", s.get_state()), "start_key": hex::encode_upper(r.get_start_key()), "end_key": hex::encode_upper(r.get_end_key()), "region_epoch": json!({ @@ -715,7 +716,7 @@ pub trait DebugExecutor { _key_range: KeyRange, _start_ts: u64, _commit_ts: u64, - ) -> Result<(), KeyRange>; + ) -> Result<(), (KeyRange, grpcio::Error)>; fn get_region_read_progress(&self, region_id: u64, log: bool, min_start_ts: u64); } @@ -948,7 +949,7 @@ impl DebugExecutor for DebugClient { key_range: KeyRange, start_ts: u64, commit_ts: u64, - ) -> Result<(), KeyRange> { + ) -> Result<(), (KeyRange, grpcio::Error)> { let mut req = FlashbackToVersionRequest::default(); req.set_version(version); req.set_region_id(region_id); @@ -963,7 +964,7 @@ impl DebugExecutor for DebugClient { "flashback key_range {:?} with start_ts {:?}, commit_ts {:?} need to retry, err is {:?}", key_range, start_ts, commit_ts, err ); - Err(key_range) + Err((key_range, err)) } } } @@ -1293,7 +1294,7 @@ where _key_range: KeyRange, _start_ts: u64, _commit_ts: u64, - ) -> Result<(), KeyRange> { + ) -> Result<(), (KeyRange, grpcio::Error)> { unimplemented!("only available for remote mode"); } @@ -1332,11 +1333,16 @@ impl DebugExecutor for DebuggerImplV2 { } fn get_region_size(&self, region: u64, cfs: Vec<&str>) -> Vec<(String, usize)> { - self.region_size(region, cfs) - .unwrap_or_else(|e| perror_and_exit("Debugger::region_size", e)) - .into_iter() - .map(|(cf, size)| (cf.to_owned(), size)) - .collect() + match self.region_size(region, cfs) { + Ok(v) => v + .into_iter() + .map(|(cf, size)| (cf.to_owned(), size)) + .collect(), + Err(e) => { + println!("Debugger::region_size: {}", e); + vec![] + } + } } fn get_region_info(&self, region: u64) -> RegionInfo { @@ -1510,7 +1516,7 @@ impl DebugExecutor for DebuggerImplV2 { _key_range: KeyRange, _start_ts: u64, _commit_ts: u64, - ) -> Result<(), KeyRange> { + ) -> Result<(), (KeyRange, grpcio::Error)> { unimplemented!("only available for remote mode"); } diff --git a/cmd/tikv-ctl/src/main.rs b/cmd/tikv-ctl/src/main.rs index 6baa1fe6c39..86bfa724608 100644 --- a/cmd/tikv-ctl/src/main.rs +++ b/cmd/tikv-ctl/src/main.rs @@ -776,12 +776,18 @@ fn compact_whole_cluster( threads: u32, bottommost: BottommostLevelCompaction, ) { - let stores = pd_client + let all_stores = pd_client .get_all_stores(true) // Exclude tombstone stores. .unwrap_or_else(|e| perror_and_exit("Get all cluster stores from PD failed", e)); + let tikv_stores = all_stores.iter().filter(|s| { + !s.get_labels() + .iter() + .any(|l| l.get_key() == "engine" && l.get_value() == "tiflash") + }); + let mut handles = Vec::new(); - for s in stores { + for s in tikv_stores { let cfg = cfg.clone(); let mgr = Arc::clone(&mgr); let addr = s.address.clone(); @@ -906,7 +912,7 @@ fn flashback_whole_cluster( .await { Ok(res) => { - if let Err(key_range) = res { + if let Err((key_range, _)) = res { // Retry specific key range to prepare flashback. 
let stale_key_range = (key_range.start_key.clone(), key_range.end_key.clone()); let mut key_range_to_prepare = key_range_to_prepare.write().unwrap(); @@ -986,7 +992,21 @@ { Ok(res) => match res { Ok(_) => break, - Err(_) => { + Err((key_range, err)) => { + // Retry `NotLeader` or `RegionNotFound`. + if err.to_string().contains("not leader") || err.to_string().contains("not found") { + // After `PrepareFlashback` finishes, the region may change leader during the `flashback in progress` phase. + // Need to retry the specific key range to finish flashback. + let stale_key_range = (key_range.start_key.clone(), key_range.end_key.clone()); + let mut key_range_to_finish = key_range_to_finish.write().unwrap(); + // Remove stale key range. + key_range_to_finish.remove(&stale_key_range); + load_key_range(&pd_client, stale_key_range.0.clone(), stale_key_range.1.clone()) + .into_iter().for_each(|(key_range, region_info)| { + // Need to update `key_range_to_finish` to replace stale key range. + key_range_to_finish.insert(key_range, region_info); + }); + } thread::sleep(Duration::from_micros(WAIT_APPLY_FLASHBACK_STATE)); continue; } @@ -1048,7 +1068,7 @@ fn build_rocks_opts(cfg: &TikvConfig) -> engine_rocks::RocksDbOptions { .unwrap() .map(Arc::new); let env = get_env(key_manager, None /* io_rate_limiter */).unwrap(); - let resource = cfg.rocksdb.build_resources(env); + let resource = cfg.rocksdb.build_resources(env, cfg.storage.engine); cfg.rocksdb.build_opt(&resource, cfg.storage.engine) } diff --git a/cmd/tikv-server/Cargo.toml b/cmd/tikv-server/Cargo.toml index 409dc84a62d..6ac8ae03714 100644 --- a/cmd/tikv-server/Cargo.toml +++ b/cmd/tikv-server/Cargo.toml @@ -6,7 +6,13 @@ edition = "2021" publish = false [features] -default = ["test-engine-kv-rocksdb", "test-engine-raft-raft-engine", "cloud-aws", "cloud-gcp", "cloud-azure"] +default = [ + "test-engine-kv-rocksdb", + "test-engine-raft-raft-engine", + "cloud-aws", + "cloud-gcp", + "cloud-azure", +] trace-tablet-lifetime = ["tikv/trace-tablet-lifetime"] tcmalloc = ["server/tcmalloc"] jemalloc = ["server/jemalloc"] @@ -18,12 +24,8 @@ failpoints = ["server/failpoints"] cloud-aws = ["server/cloud-aws"] cloud-gcp = ["server/cloud-gcp"] cloud-azure = ["server/cloud-azure"] -test-engine-kv-rocksdb = [ - "server/test-engine-kv-rocksdb" -] -test-engine-raft-raft-engine = [ - "server/test-engine-raft-raft-engine" -] +test-engine-kv-rocksdb = ["server/test-engine-kv-rocksdb"] +test-engine-raft-raft-engine = ["server/test-engine-raft-raft-engine"] test-engines-rocksdb = ["server/test-engines-rocksdb"] test-engines-panic = ["server/test-engines-panic"] @@ -37,7 +39,7 @@ encryption_export = { workspace = true } engine_traits = { workspace = true } keys = { workspace = true } kvproto = { workspace = true } -raft-engine = { git = "https://github.com/tikv/raft-engine.git" } +raft-engine = { workspace = true } regex = "1" serde_json = { version = "1.0", features = ["preserve_order"] } server = { workspace = true } diff --git a/cmd/tikv-server/src/main.rs b/cmd/tikv-server/src/main.rs index 9fdcad81c58..3f4372c32cc 100644 --- a/cmd/tikv-server/src/main.rs +++ b/cmd/tikv-server/src/main.rs @@ -217,6 +217,9 @@ fn main() { process::exit(1) } + // Init memory related settings.
+ config.memory.init(); + let (service_event_tx, service_event_rx) = tikv_util::mpsc::unbounded(); // pipe for controling service match config.storage.engine { EngineType::RaftKv => server::server::run_tikv(config, service_event_tx, service_event_rx), diff --git a/components/backup-stream/src/checkpoint_manager.rs b/components/backup-stream/src/checkpoint_manager.rs index d32c2ea7c00..fabd8695c19 100644 --- a/components/backup-stream/src/checkpoint_manager.rs +++ b/components/backup-stream/src/checkpoint_manager.rs @@ -14,7 +14,7 @@ use kvproto::{ metapb::Region, }; use pd_client::PdClient; -use tikv_util::{box_err, defer, info, time::Instant, warn, worker::Scheduler}; +use tikv_util::{box_err, defer, info, warn, worker::Scheduler}; use txn_types::TimeStamp; use uuid::Uuid; @@ -35,6 +35,7 @@ use crate::{ #[derive(Default)] pub struct CheckpointManager { checkpoint_ts: HashMap, + frozen_resolved_ts: HashMap, resolved_ts: HashMap, manager_handle: Option>, } @@ -185,15 +186,29 @@ impl CheckpointManager { sub.main_loop() } + /// update the "dynamic" part, which is `resolved_ts`. + /// We call it "dynamic" because it corresponds to the incoming data + /// (in contrast to the flushing data, which is about to be written + /// to external storage and can no longer be appended to). pub fn resolve_regions(&mut self, region_and_checkpoint: Vec) { for res in region_and_checkpoint { self.do_update(res.region, res.checkpoint); } } - pub fn flush(&mut self) { - info!("log backup checkpoint manager flushing."; "resolved_ts_len" => %self.resolved_ts.len(), "resolved_ts" => ?self.get_resolved_ts()); - self.checkpoint_ts = std::mem::take(&mut self.resolved_ts); + /// flush the `frozen_resolved_ts` to `checkpoint_ts`, and notify the + /// subscribers, with a possible final update to the checkpoint ts. + /// You may provide some extra resolve results via the `last_dive` argument. + /// They will be applied directly to the final checkpoint ts. It is the + /// caller's duty to make sure the resolve results are safe (i.e. all events + /// are surely flushed.) + pub fn flush_and_notify(&mut self, last_dive: Vec) { + info!("Notifying the flush result."; "last_dive_len" => last_dive.len()); + for rr in last_dive { + Self::update_ts(&mut self.frozen_resolved_ts, rr.region, rr.checkpoint); + } + // Replace the storage directly with the content of this run. + self.checkpoint_ts = std::mem::take(&mut self.frozen_resolved_ts); // Clippy doesn't know this iterator borrows `self.checkpoint_ts` :( #[allow(clippy::needless_collect)] let items = self @@ -205,6 +220,38 @@ impl CheckpointManager { self.notify(items.into_iter()); } + /// "freeze" the current resolved ts to the checkpoint ts. + /// This is usually called before we are going to flush and after freezing + /// the current batch of mutations. + /// + /// When a flush of the data collector is triggered: + /// + /// ```text + /// ----------------------|-----------------> + /// ^^^ + /// Flushing data-+|+- Incoming data. + /// | + /// Flush Freeze Tempfiles + /// ``` + /// + /// Resolving over incoming data shouldn't advance the checkpoint of the + /// flushing data. So the current progress should be "frozen" when we are + /// about to flush.
+ pub fn freeze(&mut self) { + info!("log backup checkpoint manager freezing."; + "resolved_ts_len" => %self.resolved_ts.len(), + "resolved_ts" => ?self.get_resolved_ts(), + "frozen" => self.checkpoint_ts.len(), + ); + self.frozen_resolved_ts = std::mem::take(&mut self.resolved_ts); + } + + #[cfg(test)] + fn freeze_and_flush(&mut self) { + self.freeze(); + self.flush_and_notify(vec![]); + } + /// update a region checkpoint in need. #[cfg(test)] fn update_region_checkpoint(&mut self, region: &Region, checkpoint: TimeStamp) { @@ -224,6 +271,15 @@ impl CheckpointManager { e.and_modify(|old_cp| { let old_ver = old_cp.region.get_region_epoch().get_version(); let checkpoint_is_newer = old_cp.checkpoint < checkpoint; + if !checkpoint_is_newer { + warn!("received older checkpoint, maybe region merge."; + "region_id" => old_cp.region.get_id(), + "old_ver" => old_ver, + "new_ver" => ver, + "old_checkpoint" => old_cp.checkpoint.into_inner(), + "new_checkpoint" => checkpoint.into_inner() + ); + } if old_ver < ver || (old_ver == ver && checkpoint_is_newer) { *old_cp = LastFlushTsOfRegion { checkpoint, @@ -474,7 +530,6 @@ pub struct CheckpointV3FlushObserver { checkpoints: Vec, global_checkpoint_cache: HashMap, - start_time: Instant, } impl CheckpointV3FlushObserver { @@ -486,7 +541,6 @@ impl CheckpointV3FlushObserver { // We almost always have only one entry. global_checkpoint_cache: HashMap::with_capacity(1), baseline, - start_time: Instant::now(), } } } @@ -521,12 +575,9 @@ where } async fn after(&mut self, task: &str, _rts: u64) -> Result<()> { - let resolve_task = Task::RegionCheckpointsOp(RegionCheckpointOperation::Resolved { - checkpoints: std::mem::take(&mut self.checkpoints), - start_time: self.start_time, - }); - let flush_task = Task::RegionCheckpointsOp(RegionCheckpointOperation::Flush); - try_send!(self.sched, resolve_task); + let flush_task = Task::RegionCheckpointsOp(RegionCheckpointOperation::FlushWith( + std::mem::take(&mut self.checkpoints), + )); try_send!(self.sched, flush_task); let global_checkpoint = self.get_checkpoint(task).await?; @@ -685,7 +736,7 @@ pub mod tests { .unwrap(); mgr.resolve_regions(vec![simple_resolve_result()]); - mgr.flush(); + mgr.freeze_and_flush(); mgr.sync_with_subs_mgr(|_| {}); assert_eq!(trivial_sink.0.lock().unwrap().items.len(), 1); } @@ -703,7 +754,7 @@ pub mod tests { rt.block_on(mgr.add_subscriber(error_sink.clone())).unwrap(); mgr.resolve_regions(vec![simple_resolve_result()]); - mgr.flush(); + mgr.freeze_and_flush(); assert_eq!(mgr.sync_with_subs_mgr(|item| { item.subscribers.len() }), 0); let sink = error_sink.0.lock().unwrap(); assert_eq!(sink.items.len(), 0); @@ -721,12 +772,12 @@ pub mod tests { let r = mgr.get_from_region(RegionIdWithVersion::new(1, 32)); assert_matches::assert_matches!(r, GetCheckpointResult::NotFound { .. }); - mgr.flush(); + mgr.freeze_and_flush(); let r = mgr.get_from_region(RegionIdWithVersion::new(1, 32)); assert_matches::assert_matches!(r, GetCheckpointResult::Ok { checkpoint , .. } if checkpoint.into_inner() == 8); let r = mgr.get_from_region(RegionIdWithVersion::new(2, 35)); assert_matches::assert_matches!(r, GetCheckpointResult::Ok { checkpoint , .. } if checkpoint.into_inner() == 16); - mgr.flush(); + mgr.freeze_and_flush(); let r = mgr.get_from_region(RegionIdWithVersion::new(1, 32)); assert_matches::assert_matches!(r, GetCheckpointResult::NotFound { .. 
}); } @@ -758,6 +809,48 @@ assert_matches::assert_matches!(r, GetCheckpointResult::Ok{checkpoint, ..} if checkpoint.into_inner() == 24); } + #[test] + fn test_mgr_freeze() { + let mut mgr = super::CheckpointManager::default(); + mgr.resolve_regions(vec![ + ResolveResult { + region: region(1, 32, 8), + checkpoint: TimeStamp::new(8), + checkpoint_type: CheckpointType::MinTs, + }, + ResolveResult { + region: region(2, 34, 8), + checkpoint: TimeStamp::new(15), + checkpoint_type: CheckpointType::MinTs, + }, + ]); + + // Frozen + mgr.freeze(); + let r = mgr.get_from_region(RegionIdWithVersion::new(1, 32)); + assert_matches::assert_matches!(r, GetCheckpointResult::NotFound { .. }); + let r = mgr.get_from_region(RegionIdWithVersion::new(2, 34)); + assert_matches::assert_matches!(r, GetCheckpointResult::NotFound { .. }); + // Shouldn't be recorded to resolved ts. + mgr.resolve_regions(vec![ResolveResult { + region: region(1, 32, 8), + checkpoint: TimeStamp::new(16), + checkpoint_type: CheckpointType::MinTs, + }]); + + // Flush done, should be able to be queried. + mgr.flush_and_notify(vec![ResolveResult { + region: region(2, 34, 8), + checkpoint: TimeStamp::new(17), + checkpoint_type: CheckpointType::MinTs, + }]); + + let r = mgr.get_from_region(RegionIdWithVersion::new(1, 32)); + assert_matches::assert_matches!(r, GetCheckpointResult::Ok{checkpoint, ..} if checkpoint.into_inner() == 8); + let r = mgr.get_from_region(RegionIdWithVersion::new(2, 34)); + assert_matches::assert_matches!(r, GetCheckpointResult::Ok{checkpoint, ..} if checkpoint.into_inner() == 17); + } + pub struct MockPdClient { safepoint: RwLock>, } diff --git a/components/backup-stream/src/endpoint.rs b/components/backup-stream/src/endpoint.rs index b11259d5be6..45ad4305abb 100644 --- a/components/backup-stream/src/endpoint.rs +++ b/components/backup-stream/src/endpoint.rs @@ -1,26 +1,35 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0.
-use std::{any::Any, collections::HashSet, fmt, marker::PhantomData, sync::Arc, time::Duration}; +use std::{ + any::Any, + collections::HashSet, + fmt, + marker::PhantomData, + sync::{Arc, Mutex}, + time::Duration, +}; use concurrency_manager::ConcurrencyManager; use engine_traits::KvEngine; use error_code::ErrorCodeExt; -use futures::{stream::AbortHandle, FutureExt}; +use futures::{stream::AbortHandle, FutureExt, TryFutureExt}; use kvproto::{ brpb::{StreamBackupError, StreamBackupTaskInfo}, metapb::Region, }; use pd_client::PdClient; +use raft::StateRole; use raftstore::{ coprocessor::{CmdBatch, ObserveHandle, RegionInfoProvider}, router::CdcHandle, }; use resolved_ts::{resolve_by_raft, LeadershipResolver}; -use tikv::config::BackupStreamConfig; +use tikv::config::{BackupStreamConfig, ResolvedTsConfig}; use tikv_util::{ box_err, config::ReadableDuration, debug, defer, info, + memory::MemoryQuota, sys::thread::ThreadBuildWrapper, time::{Instant, Limiter}, warn, @@ -30,7 +39,7 @@ use tikv_util::{ use tokio::{ io::Result as TokioResult, runtime::{Handle, Runtime}, - sync::oneshot, + sync::Semaphore, }; use tokio_stream::StreamExt; use txn_types::TimeStamp; @@ -43,7 +52,7 @@ use crate::{ GetCheckpointResult, RegionIdWithVersion, Subscription, }, errors::{Error, Result}, - event_loader::{InitialDataLoader, PendingMemoryQuota}, + event_loader::InitialDataLoader, future, metadata::{store::MetaStore, MetadataClient, MetadataEvent, StreamTask}, metrics::{self, TaskStatus}, @@ -52,7 +61,7 @@ use crate::{ subscription_manager::{RegionSubscriptionManager, ResolvedRegions}, subscription_track::{Ref, RefMut, ResolveResult, SubscriptionTracer}, try_send, - utils::{self, CallbackWaitGroup, StopWatch, Work}, + utils::{self, FutureWaitGroup, StopWatch, Work}, }; const SLOW_EVENT_THRESHOLD: f64 = 120.0; @@ -60,7 +69,7 @@ const SLOW_EVENT_THRESHOLD: f64 = 120.0; /// task has fatal error. const CHECKPOINT_SAFEPOINT_TTL_IF_ERROR: u64 = 24; -pub struct Endpoint { +pub struct Endpoint { // Note: those fields are more like a shared context between components. // For now, we copied them everywhere, maybe we'd better extract them into a // context type. @@ -69,7 +78,6 @@ pub struct Endpoint { pub(crate) store_id: u64, pub(crate) regions: R, pub(crate) engine: PhantomData, - pub(crate) router: RT, pub(crate) pd_client: Arc, pub(crate) subs: SubscriptionTracer, pub(crate) concurrency_manager: ConcurrencyManager, @@ -78,34 +86,33 @@ pub struct Endpoint { pub range_router: Router, observer: BackupStreamObserver, pool: Runtime, - initial_scan_memory_quota: PendingMemoryQuota, - initial_scan_throughput_quota: Limiter, region_operator: RegionSubscriptionManager, failover_time: Option, // We holds the config before, even it is useless for now, // however probably it would be useful in the future. config: BackupStreamConfig, - checkpoint_mgr: CheckpointManager, + pub checkpoint_mgr: CheckpointManager, // Runtime status: /// The handle to abort last save storage safe point. /// This is used for simulating an asynchronous background worker. /// Each time we spawn a task, once time goes by, we abort that task. 
pub abort_last_storage_save: Option, + pub initial_scan_semaphore: Arc, } -impl Endpoint +impl Endpoint where R: RegionInfoProvider + 'static + Clone, E: KvEngine, - RT: CdcHandle + 'static, PDC: PdClient + 'static, S: MetaStore + 'static, { - pub fn new( + pub fn new + 'static>( store_id: u64, store: S, config: BackupStreamConfig, + resolved_ts_config: ResolvedTsConfig, scheduler: Scheduler, observer: BackupStreamObserver, accessor: R, @@ -134,8 +141,9 @@ where pool.spawn(Self::starts_flush_ticks(range_router.clone())); - let initial_scan_memory_quota = - PendingMemoryQuota::new(config.initial_scan_pending_memory_quota.0 as _); + let initial_scan_memory_quota = Arc::new(MemoryQuota::new( + config.initial_scan_pending_memory_quota.0 as _, + )); let limit = if config.initial_scan_rate_limit.0 > 0 { config.initial_scan_rate_limit.0 as f64 } else { @@ -145,27 +153,33 @@ where info!("the endpoint of stream backup started"; "path" => %config.temp_path); let subs = SubscriptionTracer::default(); + let initial_scan_semaphore = Arc::new(Semaphore::new(config.initial_scan_concurrency)); let (region_operator, op_loop) = RegionSubscriptionManager::start( InitialDataLoader::new( - router.clone(), - accessor.clone(), range_router.clone(), subs.clone(), scheduler.clone(), - initial_scan_memory_quota.clone(), - pool.handle().clone(), - initial_scan_throughput_quota.clone(), + initial_scan_memory_quota, + initial_scan_throughput_quota, + // NOTE: in fact we can get rid of the `Arc`. Just need to warp the router when the + // scanner pool is created. But at that time the handle has been sealed in the + // `InitialScan` trait -- we cannot do that. + Arc::new(Mutex::new(router)), + Arc::clone(&initial_scan_semaphore), ), + accessor.clone(), observer.clone(), meta_client.clone(), pd_client.clone(), ((config.num_threads + 1) / 2).max(1), resolver, + resolved_ts_config.advance_ts_interval.0, ); pool.spawn(op_loop); let mut checkpoint_mgr = CheckpointManager::default(); pool.spawn(checkpoint_mgr.spawn_subscription_mgr()); let ep = Endpoint { + initial_scan_semaphore, meta_client, range_router, scheduler, @@ -174,12 +188,9 @@ where store_id, regions: accessor, engine: PhantomData, - router, pd_client, subs, concurrency_manager, - initial_scan_memory_quota, - initial_scan_throughput_quota, region_operator, failover_time: None, config, @@ -191,18 +202,64 @@ where } } -impl Endpoint +impl Endpoint where S: MetaStore + 'static, R: RegionInfoProvider + Clone + 'static, E: KvEngine, - RT: CdcHandle + 'static, PDC: PdClient + 'static, { fn get_meta_client(&self) -> MetadataClient { self.meta_client.clone() } + fn on_fatal_error_of_task(&self, task: &str, err: &Error) -> future![()] { + metrics::update_task_status(TaskStatus::Error, task); + let meta_cli = self.get_meta_client(); + let pdc = self.pd_client.clone(); + let store_id = self.store_id; + let sched = self.scheduler.clone(); + let safepoint_name = self.pause_guard_id_for_task(task); + let safepoint_ttl = self.pause_guard_duration(); + let code = err.error_code().code.to_owned(); + let msg = err.to_string(); + let task = task.to_owned(); + async move { + let err_fut = async { + let safepoint = meta_cli.global_progress_of_task(&task).await?; + pdc.update_service_safe_point( + safepoint_name, + TimeStamp::new(safepoint.saturating_sub(1)), + safepoint_ttl, + ) + .await?; + meta_cli.pause(&task).await?; + let mut last_error = StreamBackupError::new(); + last_error.set_error_code(code); + last_error.set_error_message(msg.clone()); + last_error.set_store_id(store_id); 
+ last_error.set_happen_at(TimeStamp::physical_now()); + meta_cli.report_last_error(&task, last_error).await?; + Result::Ok(()) + }; + if let Err(err_report) = err_fut.await { + err_report.report(format_args!("failed to upload error {}", err_report)); + let name = task.to_owned(); + // Let's retry reporting after 5s. + tokio::task::spawn(async move { + tokio::time::sleep(Duration::from_secs(5)).await; + try_send!( + sched, + Task::FatalError( + TaskSelector::ByName(name), + Box::new(annotate!(err_report, "origin error: {}", msg)) + ) + ); + }); + } + } + } + fn on_fatal_error(&self, select: TaskSelector, err: Box) { err.report_fatal(); let tasks = self @@ -212,49 +269,7 @@ where for task in tasks { // Let's pause the task first. self.unload_task(&task); - metrics::update_task_status(TaskStatus::Error, &task); - - let meta_cli = self.get_meta_client(); - let pdc = self.pd_client.clone(); - let store_id = self.store_id; - let sched = self.scheduler.clone(); - let safepoint_name = self.pause_guard_id_for_task(&task); - let safepoint_ttl = self.pause_guard_duration(); - let code = err.error_code().code.to_owned(); - let msg = err.to_string(); - self.pool.block_on(async move { - let err_fut = async { - let safepoint = meta_cli.global_progress_of_task(&task).await?; - pdc.update_service_safe_point( - safepoint_name, - TimeStamp::new(safepoint.saturating_sub(1)), - safepoint_ttl, - ) - .await?; - meta_cli.pause(&task).await?; - let mut last_error = StreamBackupError::new(); - last_error.set_error_code(code); - last_error.set_error_message(msg.clone()); - last_error.set_store_id(store_id); - last_error.set_happen_at(TimeStamp::physical_now()); - meta_cli.report_last_error(&task, last_error).await?; - Result::Ok(()) - }; - if let Err(err_report) = err_fut.await { - err_report.report(format_args!("failed to upload error {}", err_report)); - // Let's retry reporting after 5s. - tokio::task::spawn(async move { - tokio::time::sleep(Duration::from_secs(5)).await; - try_send!( - sched, - Task::FatalError( - TaskSelector::ByName(task.to_owned()), - Box::new(annotate!(err_report, "origin error: {}", msg)) - ) - ); - }); - } - }); + self.pool.block_on(self.on_fatal_error_of_task(&task, &err)); } } @@ -480,6 +495,11 @@ where let total_size = kvs.size(); metrics::HEAP_MEMORY .add(total_size as _); + #[cfg(feature = "failpoints")] + tokio::time::sleep(Duration::from_millis((|| { + fail::fail_point!("log_backup_batch_delay", |val| val.and_then( |x| x.parse::().ok()).unwrap_or(0)); + 0 + })())).await; utils::handle_on_event_result(&sched, router.on_events(kvs).await); metrics::HEAP_MEMORY .sub(total_size as _); @@ -494,20 +514,6 @@ where }); } - /// Make an initial data loader using the resource of the endpoint. - pub fn make_initial_loader(&self) -> InitialDataLoader { - InitialDataLoader::new( - self.router.clone(), - self.regions.clone(), - self.range_router.clone(), - self.subs.clone(), - self.scheduler.clone(), - self.initial_scan_memory_quota.clone(), - self.pool.handle().clone(), - self.initial_scan_throughput_quota.clone(), - ) - } - pub fn handle_watch_task(&self, op: TaskOp) { match op { TaskOp::AddTask(task) => { @@ -525,13 +531,12 @@ where } } - async fn observe_and_scan_region( + async fn observe_regions_in_range( &self, - init: InitialDataLoader, task: &StreamTask, start_key: Vec, end_key: Vec, - ) -> Result<()> { + ) { let start = Instant::now_coarse(); let success = self .observer @@ -549,7 +554,9 @@ where // directly and this would be fast. If this gets slow, maybe make it async // again. 
(Will that bring race conditions? say `Start` handled after
         // `RefreshResolver` of some region.)
-        let range_init_result = init.initialize_range(start_key.clone(), end_key.clone());
+        let range_init_result = self
+            .initialize_range(start_key.clone(), end_key.clone())
+            .await;
         match range_init_result {
             Ok(()) => {
                 info!("backup stream success to initialize";
@@ -561,6 +568,45 @@ where
                 e.report("backup stream initialize failed");
             }
         }
+    }
+
+    /// Initialize a range: it simply scans the regions with the leader role
+    /// and sends them to [`initialize_region`].
+    pub async fn initialize_range(&self, start_key: Vec<u8>, end_key: Vec<u8>) -> Result<()> {
+        // Generally we will consume from this channel very quickly.
+        // Directly cloning the initial data loader into the background thread looks a
+        // little heavier than creating a new channel. TODO: Perhaps we need a
+        // handle to the `InitialDataLoader`. Making it a `Runnable` worker might be a
+        // good idea.
+        let (tx, mut rx) = tokio::sync::mpsc::channel(1);
+        self.regions
+            .seek_region(
+                &start_key,
+                Box::new(move |i| {
+                    // Ignore the error: this can only happen while the server is shutting down and
+                    // the future has been canceled.
+                    let _ = i
+                        .filter(|r| r.role == StateRole::Leader)
+                        .take_while(|r| r.region.start_key < end_key)
+                        .try_for_each(|r| {
+                            tx.blocking_send(ObserveOp::Start {
+                                region: r.region.clone(),
+                            })
+                        });
+                }),
+            )
+            .map_err(|err| {
+                Error::Other(box_err!(
+                    "failed to seek region for start key {}: {}",
+                    utils::redact(&start_key),
+                    err
+                ))
+            })?;
+        // Don't reschedule this command: otherwise, once the endpoint's mailbox
+        // gets full, the system might deadlock.
+        while let Some(cmd) = rx.recv().await {
+            self.region_operator.request(cmd).await;
+        }
         Ok(())
     }
@@ -578,7 +624,6 @@ where
     /// Load the task into memory: this would make the endpoint start to observe.
     fn load_task(&self, task: StreamTask) {
         let cli = self.meta_client.clone();
-        let init = self.make_initial_loader();
         let range_router = self.range_router.clone();
         info!(
@@ -587,23 +632,15 @@
         );
         let task_name = task.info.get_name().to_owned();
-        // clean the safepoint created at pause(if there is)
-        self.pool.spawn(
-            self.pd_client
-                .update_service_safe_point(
-                    self.pause_guard_id_for_task(task.info.get_name()),
-                    TimeStamp::zero(),
-                    Duration::new(0, 0),
-                )
-                .map(|r| {
-                    r.map_err(|err| Error::from(err).report("removing safe point for pausing"))
-                }),
-        );
+        self.clean_pause_guard_id_for_task(&task_name);
         self.pool.block_on(async move {
             let task_clone = task.clone();
             let run = async move {
                 let task_name = task.info.get_name();
                 let ranges = cli.ranges_of_task(task_name).await?;
+                fail::fail_point!("load_task::error_when_fetching_ranges", |_| {
+                    Err(Error::Other("what range? no such thing, go away.".into()))
+                });
                 info!(
                     "register backup stream ranges";
                     "task" => ?task,
@@ -621,10 +658,8 @@
                     .await?;
                 for (start_key, end_key) in ranges {
-                    let init = init.clone();
-
-                    self.observe_and_scan_region(init, &task, start_key, end_key)
-                        .await?
+ self.observe_regions_in_range(&task, start_key, end_key) + .await } info!( "finish register backup stream ranges"; @@ -633,15 +668,28 @@ where Result::Ok(()) }; if let Err(e) = run.await { - e.report(format!( - "failed to register backup stream task {} to router: ranges not found", - task_clone.info.get_name() - )); + self.on_fatal_error_of_task(&task_clone.info.name, &Box::new(e)) + .await; } }); metrics::update_task_status(TaskStatus::Running, &task_name); } + // clean the safepoint created at pause(if there is) + fn clean_pause_guard_id_for_task(&self, task_name: &str) { + self.pool.spawn( + self.pd_client + .update_service_safe_point( + self.pause_guard_id_for_task(task_name), + TimeStamp::zero(), + Duration::new(0, 0), + ) + .map(|r| { + r.map_err(|err| Error::from(err).report("removing safe point for pausing")) + }), + ); + } + fn pause_guard_id_for_task(&self, task: &str) -> String { format!("{}-{}-pause-guard", task, self.store_id) } @@ -666,7 +714,7 @@ where Err(err) => { err.report(format!("failed to resume backup stream task {}", task_name)); let sched = self.scheduler.clone(); - tokio::task::spawn(async move { + self.pool.spawn(async move { tokio::time::sleep(Duration::from_secs(5)).await; sched .schedule(Task::WatchTask(TaskOp::ResumeTask(task_name))) @@ -676,9 +724,10 @@ where } } - pub fn on_unregister(&self, task: &str) -> Option { - let info = self.unload_task(task); - self.remove_metrics_after_unregister(task); + pub fn on_unregister(&self, task_name: &str) -> Option { + let info = self.unload_task(task_name); + self.clean_pause_guard_id_for_task(task_name); + self.remove_metrics_after_unregister(task_name); info } @@ -721,29 +770,11 @@ where } } - fn get_resolved_regions(&self, min_ts: TimeStamp) -> future![Result] { - let (tx, rx) = oneshot::channel(); - let op = self.region_operator.clone(); - async move { - let req = ObserveOp::ResolveRegions { - callback: Box::new(move |rs| { - let _ = tx.send(rs); - }), - min_ts, - }; - op.request(req).await; - rx.await - .map_err(|err| annotate!(err, "failed to send request for resolve regions")) - } - } - - fn do_flush(&self, task: String, min_ts: TimeStamp) -> future![Result<()>] { - let get_rts = self.get_resolved_regions(min_ts); + fn do_flush(&self, task: String, mut resolved: ResolvedRegions) -> future![Result<()>] { let router = self.range_router.clone(); let store_id = self.store_id; let mut flush_ob = self.flush_observer(); async move { - let mut resolved = get_rts.await?; let mut new_rts = resolved.global_checkpoint(); fail::fail_point!("delay_on_flush"); flush_ob.before(resolved.take_resolve_result()).await; @@ -772,20 +803,37 @@ where // This should only happen in testing, it would be to unwrap... 
let _ = info.unwrap().set_flushing_status_cas(false, true); let mts = self.prepare_min_ts().await; - try_send!(self.scheduler, Task::FlushWithMinTs(task, mts)); + let sched = self.scheduler.clone(); + self.region_operator + .request(ObserveOp::ResolveRegions { + callback: Box::new(move |res| { + try_send!(sched, Task::ExecFlush(task, res)); + }), + min_ts: mts, + }) + .await; }); } pub fn on_flush(&self, task: String) { self.pool.block_on(async move { let mts = self.prepare_min_ts().await; + let sched = self.scheduler.clone(); info!("min_ts prepared for flushing"; "min_ts" => %mts); - try_send!(self.scheduler, Task::FlushWithMinTs(task, mts)); + self.region_operator + .request(ObserveOp::ResolveRegions { + callback: Box::new(move |res| { + try_send!(sched, Task::ExecFlush(task, res)); + }), + min_ts: mts, + }) + .await }) } - fn on_flush_with_min_ts(&self, task: String, min_ts: TimeStamp) { - self.pool.spawn(self.do_flush(task, min_ts).map(|r| { + fn on_exec_flush(&mut self, task: String, resolved: ResolvedRegions) { + self.checkpoint_mgr.freeze(); + self.pool.spawn(self.do_flush(task, resolved).map(|r| { if let Err(err) = r { err.report("during updating flush status") } @@ -859,11 +907,16 @@ where } fn on_update_change_config(&mut self, cfg: BackupStreamConfig) { + let concurrency_diff = + cfg.initial_scan_concurrency as isize - self.config.initial_scan_concurrency as isize; info!( "update log backup config"; "config" => ?cfg, + "concurrency_diff" => concurrency_diff, ); self.range_router.udpate_config(&cfg); + self.update_semaphore_capacity(&self.initial_scan_semaphore, concurrency_diff); + self.config = cfg; } @@ -873,6 +926,24 @@ where self.pool.block_on(self.region_operator.request(op)); } + fn update_semaphore_capacity(&self, sema: &Arc, diff: isize) { + use std::cmp::Ordering::*; + match diff.cmp(&0) { + Less => { + self.pool.spawn( + Arc::clone(sema) + .acquire_many_owned(-diff as _) + // It is OK to trivially ignore the Error case (semaphore has been closed, we are shutting down the server.) 
+ .map_ok(|p| p.forget()), + ); + } + Equal => {} + Greater => { + sema.add_permits(diff as _); + } + } + } + pub fn run_task(&mut self, task: Task) { debug!("run backup stream task"; "task" => ?task, "store_id" => %self.store_id); let now = Instant::now_coarse(); @@ -903,7 +974,7 @@ where } } Task::MarkFailover(t) => self.failover_time = Some(t), - Task::FlushWithMinTs(task, min_ts) => self.on_flush_with_min_ts(task, min_ts), + Task::ExecFlush(task, min_ts) => self.on_exec_flush(task, min_ts), Task::RegionCheckpointsOp(s) => self.handle_region_checkpoints_op(s), Task::UpdateGlobalCheckpoint(task) => self.on_update_global_checkpoint(task), } @@ -932,8 +1003,8 @@ where self.checkpoint_mgr.resolve_regions(checkpoints); metrics::MIN_TS_RESOLVE_DURATION.observe(start_time.saturating_elapsed_secs()); } - RegionCheckpointOperation::Flush => { - self.checkpoint_mgr.flush(); + RegionCheckpointOperation::FlushWith(checkpoints) => { + self.checkpoint_mgr.flush_and_notify(checkpoints); } RegionCheckpointOperation::Get(g, cb) => { let _guard = self.pool.handle().enter(); @@ -1000,7 +1071,7 @@ where } pub fn do_backup(&self, events: Vec) { - let wg = CallbackWaitGroup::new(); + let wg = FutureWaitGroup::new(); for batch in events { self.backup_batch(batch, wg.clone().work()); } @@ -1038,9 +1109,14 @@ where RT: CdcHandle + 'static, EK: KvEngine, { - pub async fn resolve(&mut self, regions: Vec, min_ts: TimeStamp) -> Vec { + pub async fn resolve( + &mut self, + regions: Vec, + min_ts: TimeStamp, + timeout: Option, + ) -> Vec { match self { - BackupStreamResolver::V1(x) => x.resolve(regions, min_ts).await, + BackupStreamResolver::V1(x) => x.resolve(regions, min_ts, timeout).await, BackupStreamResolver::V2(x, _) => { let x = x.clone(); resolve_by_raft(regions, min_ts, x).await @@ -1058,7 +1134,7 @@ pub enum RegionSet { } pub enum RegionCheckpointOperation { - Flush, + FlushWith(Vec), PrepareMinTsForResolve, Resolve { min_ts: TimeStamp, @@ -1075,7 +1151,7 @@ pub enum RegionCheckpointOperation { impl fmt::Debug for RegionCheckpointOperation { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { - Self::Flush => f.debug_tuple("Flush").finish(), + Self::FlushWith(checkpoints) => f.debug_tuple("FlushWith").field(checkpoints).finish(), Self::Get(arg0, _) => f.debug_tuple("Get").field(arg0).finish(), Self::Subscribe(_) => f.debug_tuple("Subscription").finish(), @@ -1120,9 +1196,9 @@ pub enum Task { MarkFailover(Instant), /// Flush the task with name. Flush(String), - /// Execute the flush with the calculated `min_ts`. + /// Execute the flush with the calculated resolved result. /// This is an internal command only issued by the `Flush` task. - FlushWithMinTs(String, TimeStamp), + ExecFlush(String, ResolvedRegions), /// The command for getting region checkpoints. RegionCheckpointsOp(RegionCheckpointOperation), /// update global-checkpoint-ts to storage. @@ -1229,10 +1305,10 @@ impl fmt::Debug for Task { .debug_tuple("MarkFailover") .field(&format_args!("{:?} ago", t.saturating_elapsed())) .finish(), - Self::FlushWithMinTs(arg0, arg1) => f - .debug_tuple("FlushWithMinTs") + Self::ExecFlush(arg0, arg1) => f + .debug_tuple("ExecFlush") .field(arg0) - .field(arg1) + .field(&arg1.global_checkpoint()) .finish(), Self::RegionCheckpointsOp(s) => f.debug_tuple("GetRegionCheckpoints").field(s).finish(), Self::UpdateGlobalCheckpoint(task) => { @@ -1272,19 +1348,18 @@ impl Task { Task::FatalError(..) => "fatal_error", Task::Sync(..) 
=> "sync", Task::MarkFailover(_) => "mark_failover", - Task::FlushWithMinTs(..) => "flush_with_min_ts", + Task::ExecFlush(..) => "flush_with_min_ts", Task::RegionCheckpointsOp(..) => "get_checkpoints", Task::UpdateGlobalCheckpoint(..) => "update_global_checkpoint", } } } -impl Runnable for Endpoint +impl Runnable for Endpoint where S: MetaStore + 'static, R: RegionInfoProvider + Clone + 'static, E: KvEngine, - RT: CdcHandle + 'static, PDC: PdClient + 'static, { type Task = Task; @@ -1297,10 +1372,7 @@ where #[cfg(test)] mod test { use engine_rocks::RocksEngine; - use raftstore::{ - coprocessor::region_info_accessor::MockRegionInfoProvider, router::CdcRaftRouter, - }; - use test_raftstore::MockRaftStoreRouter; + use raftstore::coprocessor::region_info_accessor::MockRegionInfoProvider; use tikv_util::worker::dummy_scheduler; use crate::{ @@ -1315,13 +1387,9 @@ mod test { cli.insert_task_with_range(&task, &[]).await.unwrap(); fail::cfg("failed_to_get_tasks", "1*return").unwrap(); - Endpoint::< - _, - MockRegionInfoProvider, - RocksEngine, - CdcRaftRouter, - MockPdClient, - >::start_and_watch_tasks(cli, sched) + Endpoint::<_, MockRegionInfoProvider, RocksEngine, MockPdClient>::start_and_watch_tasks( + cli, sched, + ) .await .unwrap(); fail::remove("failed_to_get_tasks"); diff --git a/components/backup-stream/src/event_loader.rs b/components/backup-stream/src/event_loader.rs index 1b663c0e982..0a957ea87ed 100644 --- a/components/backup-stream/src/event_loader.rs +++ b/components/backup-stream/src/event_loader.rs @@ -3,77 +3,49 @@ use std::{marker::PhantomData, sync::Arc, time::Duration}; use engine_traits::{KvEngine, CF_DEFAULT, CF_WRITE}; -use futures::executor::block_on; use kvproto::{kvrpcpb::ExtraOp, metapb::Region, raft_cmdpb::CmdType}; use raftstore::{ - coprocessor::{ObserveHandle, RegionInfoProvider}, + coprocessor::ObserveHandle, router::CdcHandle, store::{fsm::ChangeObserver, Callback}, }; use tikv::storage::{ kv::StatisticsSummary, mvcc::{DeltaScanner, ScannerBuilder}, - txn::{EntryBatch, TxnEntry, TxnEntryScanner}, + txn::{TxnEntry, TxnEntryScanner}, Snapshot, Statistics, }; use tikv_util::{ box_err, + memory::{MemoryQuota, OwnedAllocated}, time::{Instant, Limiter}, worker::Scheduler, }; -use tokio::{ - runtime::Handle, - sync::{OwnedSemaphorePermit, Semaphore}, -}; +use tokio::sync::Semaphore; use txn_types::{Key, Lock, TimeStamp}; use crate::{ annotate, debug, - endpoint::ObserveOp, errors::{ContextualResultExt, Error, Result}, metrics, router::{ApplyEvent, ApplyEvents, Router}, subscription_track::{Ref, RefMut, SubscriptionTracer, TwoPhaseResolver}, - try_send, - utils::{self, RegionPager}, - Task, + utils, Task, }; const MAX_GET_SNAPSHOT_RETRY: usize = 5; -#[derive(Clone)] -pub struct PendingMemoryQuota(Arc); - -impl std::fmt::Debug for PendingMemoryQuota { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("PendingMemoryQuota") - .field("remain", &self.0.available_permits()) - .field("total", &self.0) - .finish() - } -} - -pub struct PendingMemory(OwnedSemaphorePermit); - -impl PendingMemoryQuota { - pub fn new(quota: usize) -> Self { - Self(Arc::new(Semaphore::new(quota))) - } - - pub fn pending(&self, size: usize) -> PendingMemory { - PendingMemory( - Handle::current() - .block_on(self.0.clone().acquire_many_owned(size as _)) - .expect("BUG: the semaphore is closed unexpectedly."), - ) - } +struct ScanResult { + more: bool, + out_of_memory: bool, + statistics: Statistics, } /// EventLoader transforms data from the snapshot into ApplyEvent. 
pub struct EventLoader { scanner: DeltaScanner, // pooling the memory. - entry_batch: EntryBatch, + entry_batch: Vec, } const ENTRY_BATCH_SIZE: usize = 1024; @@ -102,20 +74,48 @@ impl EventLoader { Ok(Self { scanner, - entry_batch: EntryBatch::with_capacity(ENTRY_BATCH_SIZE), + entry_batch: Vec::with_capacity(ENTRY_BATCH_SIZE), }) } + fn scan_result(&mut self, more: bool) -> ScanResult { + ScanResult { + more, + out_of_memory: false, + statistics: self.scanner.take_statistics(), + } + } + + fn out_of_memory(&mut self) -> ScanResult { + ScanResult { + more: true, + out_of_memory: true, + statistics: self.scanner.take_statistics(), + } + } + /// Scan a batch of events from the snapshot, and save them into the /// internal buffer. - fn fill_entries(&mut self) -> Result { + fn fill_entries(&mut self, memory_quota: &mut OwnedAllocated) -> Result { assert!( self.entry_batch.is_empty(), - "EventLoader: the entry batch isn't empty when filling entries, which is error-prone, please call `omit_entries` first. (len = {})", + "EventLoader: the entry batch isn't empty when filling entries, which is error-prone, please call `emit_entries_to` first. (len = {})", self.entry_batch.len() ); - self.scanner.scan_entries(&mut self.entry_batch)?; - Ok(self.scanner.take_statistics()) + let batch = &mut self.entry_batch; + while batch.len() < batch.capacity() { + match self.scanner.next_entry()? { + Some(entry) => { + let size = entry.size(); + batch.push(entry); + if memory_quota.alloc(size).is_err() { + return Ok(self.out_of_memory()); + } + } + None => return Ok(self.scan_result(false)), + } + } + Ok(self.scan_result(true)) } /// Drain the internal buffer, converting them to the [`ApplyEvents`], @@ -125,7 +125,7 @@ impl EventLoader { result: &mut ApplyEvents, resolver: &mut TwoPhaseResolver, ) -> Result<()> { - for entry in self.entry_batch.drain() { + for entry in self.entry_batch.drain(..) { match entry { TxnEntry::Prewrite { default: (key, value), @@ -175,121 +175,64 @@ impl EventLoader { } /// The context for loading incremental data between range. -/// Like [`cdc::Initializer`], but supports initialize over range. +/// Like [`cdc::Initializer`]. /// Note: maybe we can merge those two structures? -/// Note': maybe extract more fields to trait so it would be easier to test. #[derive(Clone)] -pub struct InitialDataLoader { +pub struct InitialDataLoader { // Note: maybe we can make it an abstract thing like `EventSink` with // method `async (KvEvent) -> Result<()>`? pub(crate) sink: Router, pub(crate) tracing: SubscriptionTracer, pub(crate) scheduler: Scheduler, - // Note: this is only for `init_range`, maybe make it an argument? - pub(crate) regions: R, - // Note: Maybe move those fields about initial scanning into some trait? - pub(crate) router: RT, - pub(crate) quota: PendingMemoryQuota, + + pub(crate) quota: Arc, pub(crate) limit: Limiter, + // If there are too many concurrent initial scanning, the limit of disk speed or pending memory + // quota will probably be triggered. Then the whole scanning will be pretty slow. And when + // we are holding a iterator for a long time, the memtable may not be able to be flushed. + // Using this to restrict the possibility of that. 
+ concurrency_limit: Arc, + + cdc_handle: H, - pub(crate) handle: Handle, _engine: PhantomData, } -impl InitialDataLoader +impl InitialDataLoader where E: KvEngine, - R: RegionInfoProvider + Clone + 'static, - RT: CdcHandle, + H: CdcHandle + Sync, { pub fn new( - router: RT, - regions: R, sink: Router, tracing: SubscriptionTracer, sched: Scheduler, - quota: PendingMemoryQuota, - handle: Handle, + quota: Arc, limiter: Limiter, + cdc_handle: H, + concurrency_limit: Arc, ) -> Self { Self { - router, - regions, sink, tracing, scheduler: sched, _engine: PhantomData, quota, - handle, + cdc_handle, + concurrency_limit, limit: limiter, } } - pub fn observe_over_with_retry( + pub async fn capture_change( &self, region: &Region, - mut cmd: impl FnMut() -> ChangeObserver, + cmd: ChangeObserver, ) -> Result { - let mut last_err = None; - for _ in 0..MAX_GET_SNAPSHOT_RETRY { - let c = cmd(); - let r = self.observe_over(region, c); - match r { - Ok(s) => { - return Ok(s); - } - Err(e) => { - let can_retry = match e.without_context() { - Error::RaftRequest(pbe) => { - !(pbe.has_epoch_not_match() - || pbe.has_not_leader() - || pbe.get_message().contains("stale observe id") - || pbe.has_region_not_found()) - } - Error::RaftStore(raftstore::Error::RegionNotFound(_)) - | Error::RaftStore(raftstore::Error::NotLeader(..)) => false, - _ => true, - }; - e.report(format_args!( - "during getting initial snapshot for region {:?}; can retry = {}", - region, can_retry - )); - last_err = match last_err { - None => Some(e), - Some(err) => Some(Error::Contextual { - context: format!("and error {}", err), - inner_error: Box::new(e), - }), - }; - - if !can_retry { - break; - } - std::thread::sleep(Duration::from_secs(1)); - continue; - } - } - } - Err(last_err.expect("BUG: max retry time exceed but no error")) - } - - /// Start observe over some region. - /// This will register the region to the raftstore as observing, - /// and return the current snapshot of that region. - fn observe_over(&self, region: &Region, cmd: ChangeObserver) -> Result { - // There are 2 ways for getting the initial snapshot of a region: - // - the BR method: use the interface in the RaftKv interface, read the - // key-values directly. - // - the CDC method: use the raftstore message `SignificantMsg::CaptureChange` - // to register the region to CDC observer and get a snapshot at the same time. - // Registering the observer to the raftstore is necessary because we should only - // listen events from leader. In CDC, the change observer is - // per-delegate(i.e. per-region), we can create the command per-region here too. 
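The comment block above (removed by this patch, though the behavior it describes still holds) explains why the snapshot is taken through raftstore observer registration: `capture_change` hands the snapshot back through a callback. The old code bridged that callback into a future with `tikv_util::future::paired_future_callback` and then `block_on`; the new `capture_change` simply awaits it. A sketch of that callback-to-future bridge using only a tokio oneshot channel — the `Snapshot` type and `capture_change_with_callback` below are illustrative placeholders, not the real raftstore API:

```rust
use tokio::sync::oneshot;

// Placeholder for the real snapshot type.
type Snapshot = String;

// Make a (callback, future) pair backed by a oneshot channel, in the
// spirit of `paired_future_callback`.
fn paired_callback<T: Send + 'static>() -> (Box<dyn FnOnce(T) + Send>, oneshot::Receiver<T>) {
    let (tx, rx) = oneshot::channel();
    let cb = Box::new(move |value: T| {
        // The receiver may already be gone; ignore that case.
        let _ = tx.send(value);
    });
    (cb, rx)
}

// Stands in for `CdcHandle::capture_change`, which hands the snapshot to
// the provided callback once the region is registered for observation.
fn capture_change_with_callback(cb: Box<dyn FnOnce(Snapshot) + Send>) {
    std::thread::spawn(move || cb("snapshot-of-region-1".to_owned()));
}

#[tokio::main]
async fn main() {
    let (cb, fut) = paired_callback::<Snapshot>();
    capture_change_with_callback(cb);
    // Instead of `block_on(fut)`, the async version simply awaits it.
    let snap = fut.await.expect("callback dropped without being invoked");
    println!("got {snap}");
}
```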
- let (callback, fut) = tikv_util::future::paired_future_callback::>(); - self.router + self.cdc_handle .capture_change( region.get_id(), region.get_region_epoch().clone(), @@ -315,7 +258,8 @@ where region.get_id() ))?; - let snap = block_on(fut) + let snap = fut + .await .map_err(|err| { annotate!( err, @@ -332,6 +276,54 @@ where Ok(snap) } + pub async fn observe_over_with_retry( + &self, + region: &Region, + mut cmd: impl FnMut() -> ChangeObserver, + ) -> Result { + let mut last_err = None; + for _ in 0..MAX_GET_SNAPSHOT_RETRY { + let c = cmd(); + let r = self.capture_change(region, c).await; + match r { + Ok(s) => { + return Ok(s); + } + Err(e) => { + let can_retry = match e.without_context() { + Error::RaftRequest(pbe) => { + !(pbe.has_epoch_not_match() + || pbe.has_not_leader() + || pbe.get_message().contains("stale observe id") + || pbe.has_region_not_found()) + } + Error::RaftStore(raftstore::Error::RegionNotFound(_)) + | Error::RaftStore(raftstore::Error::NotLeader(..)) => false, + _ => true, + }; + e.report(format_args!( + "during getting initial snapshot for region {:?}; can retry = {}", + region, can_retry + )); + last_err = match last_err { + None => Some(e), + Some(err) => Some(Error::Contextual { + context: format!("and error {}", err), + inner_error: Box::new(e), + }), + }; + + if !can_retry { + break; + } + tokio::time::sleep(Duration::from_secs(1)).await; + continue; + } + } + } + Err(last_err.expect("BUG: max retry time exceed but no error")) + } + fn with_resolver( &self, region: &Region, @@ -381,7 +373,7 @@ where f(v.value_mut().resolver()) } - fn scan_and_async_send( + async fn scan_and_async_send( &self, region: &Region, handle: &ObserveHandle, @@ -397,44 +389,48 @@ where let mut events = ApplyEvents::with_capacity(1024, region.id); // Note: the call of `fill_entries` is the only step which would read the disk. // we only need to record the disk throughput of this. - let (stat, disk_read) = - utils::with_record_read_throughput(|| event_loader.fill_entries()); - // We must use the size of entry batch here to check whether we have progress. - // Or we may exit too early if there are only records: - // - can be inlined to `write` CF (hence it won't be written to default CF) - // - are prewritten. (hence it will only contains `Prewrite` records). 
- // In this condition, ALL records generate no ApplyEvent(only lock change), - // and we would exit after the first run of loop :( - let no_progress = event_loader.entry_batch.is_empty(); - let stat = stat?; + let mut allocated = OwnedAllocated::new(Arc::clone(&self.quota)); + let (res, disk_read) = + utils::with_record_read_throughput(|| event_loader.fill_entries(&mut allocated)); + let res = res?; self.with_resolver(region, handle, |r| { event_loader.emit_entries_to(&mut events, r) })?; - if no_progress { - metrics::INITIAL_SCAN_DURATION.observe(start.saturating_elapsed_secs()); - return Ok(stats.stat); - } - stats.add_statistics(&stat); + stats.add_statistics(&res.statistics); let region_id = region.get_id(); let sink = self.sink.clone(); let event_size = events.size(); let sched = self.scheduler.clone(); - let permit = self.quota.pending(event_size); - self.limit.blocking_consume(disk_read as _); + self.limit.consume(disk_read as _).await; debug!("sending events to router"; "size" => %event_size, "region" => %region_id); metrics::INCREMENTAL_SCAN_SIZE.observe(event_size as f64); metrics::INCREMENTAL_SCAN_DISK_READ.inc_by(disk_read as f64); metrics::HEAP_MEMORY.add(event_size as _); + fail::fail_point!("scan_and_async_send::about_to_consume"); join_handles.push(tokio::spawn(async move { utils::handle_on_event_result(&sched, sink.on_events(events).await); metrics::HEAP_MEMORY.sub(event_size as _); + drop(allocated); debug!("apply event done"; "size" => %event_size, "region" => %region_id); - drop(permit); })); + if !res.more { + metrics::INITIAL_SCAN_DURATION.observe(start.saturating_elapsed_secs()); + return Ok(stats.stat); + } + if res.out_of_memory { + futures::future::try_join_all(join_handles.drain(..)) + .await + .map_err(|err| { + annotate!( + err, + "failed to join tokio runtime during out-of-memory-quota" + ) + })?; + } } } - pub fn do_initial_scan( + pub async fn do_initial_scan( &self, region: &Region, // We are using this handle for checking whether the initial scan is stale. @@ -442,18 +438,25 @@ where start_ts: TimeStamp, snap: impl Snapshot, ) -> Result { - let _guard = self.handle.enter(); let tr = self.tracing.clone(); let region_id = region.get_id(); let mut join_handles = Vec::with_capacity(8); + let permit = self + .concurrency_limit + .acquire() + .await + .expect("BUG: semaphore closed"); // It is ok to sink more data than needed. So scan to +inf TS for convenance. let event_loader = EventLoader::load_from(snap, start_ts, TimeStamp::max(), region)?; - let stats = self.scan_and_async_send(region, &handle, event_loader, &mut join_handles)?; + let stats = self + .scan_and_async_send(region, &handle, event_loader, &mut join_handles) + .await?; + drop(permit); - Handle::current() - .block_on(futures::future::try_join_all(join_handles)) + futures::future::try_join_all(join_handles) + .await .map_err(|err| annotate!(err, "tokio runtime failed to join consuming threads"))?; Self::with_resolver_by(&tr, region, &handle, |r| { @@ -467,39 +470,17 @@ where Ok(stats) } - - /// initialize a range: it simply scan the regions with leader role and send - /// them to [`initialize_region`]. 
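The rewritten `fill_entries` and `scan_and_async_send` above turn memory use into explicit backpressure: each scanned entry is charged against a shared quota, and when an allocation fails the loader returns an `out_of_memory` result, after which the caller joins the in-flight send tasks (which release their allocation when done) before scanning further. A sketch of that loop under stated assumptions — a hypothetical counter-based quota standing in for `tikv_util::memory::MemoryQuota`, with every entry individually fitting the quota:

```rust
use std::{
    collections::VecDeque,
    sync::{
        atomic::{AtomicUsize, Ordering},
        Arc,
    },
};

struct Quota {
    used: AtomicUsize,
    cap: usize,
}

impl Quota {
    fn alloc(&self, n: usize) -> Result<(), ()> {
        let prev = self.used.fetch_add(n, Ordering::SeqCst);
        if prev + n > self.cap {
            // Over the cap: roll the allocation back and report failure.
            self.used.fetch_sub(n, Ordering::SeqCst);
            Err(())
        } else {
            Ok(())
        }
    }
    fn free(&self, n: usize) {
        self.used.fetch_sub(n, Ordering::SeqCst);
    }
}

#[tokio::main]
async fn main() {
    let quota = Arc::new(Quota { used: AtomicUsize::new(0), cap: 4 });
    // Ten "entries" of size 1; each fits the quota individually.
    let mut entries: VecDeque<usize> = (0..10).map(|_| 1).collect();
    let mut pending = Vec::new();

    while !entries.is_empty() {
        // Fill a batch until the quota rejects an allocation
        // (the `out_of_memory` result in `fill_entries`).
        let mut batch = Vec::new();
        while let Some(&size) = entries.front() {
            if quota.alloc(size).is_err() {
                break;
            }
            entries.pop_front();
            batch.push(size);
        }
        // Ship the batch asynchronously; the allocation is released once the
        // downstream consumer (the router, in the real code) is done with it.
        let q = Arc::clone(&quota);
        pending.push(tokio::spawn(async move {
            for size in batch {
                q.free(size);
            }
        }));
        // Out of quota with more to scan: wait for in-flight batches to
        // release memory, mirroring the `try_join_all` in `scan_and_async_send`.
        if !entries.is_empty() {
            futures::future::try_join_all(pending.drain(..))
                .await
                .expect("background send task panicked");
        }
    }
}
```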
- pub fn initialize_range(&self, start_key: Vec, end_key: Vec) -> Result<()> { - let mut pager = RegionPager::scan_from(self.regions.clone(), start_key, end_key); - loop { - let regions = pager.next_page(8)?; - debug!("scanning for entries in region."; "regions" => ?regions); - if regions.is_empty() { - break; - } - for r in regions { - // Note: Even we did the initial scanning, and blocking resolved ts from - // advancing, if the next_backup_ts was updated in some extreme condition, there - // is still little chance to lost data: For example, if a region cannot elect - // the leader for long time. (say, net work partition) At that time, we have - // nowhere to record the lock status of this region. - try_send!( - self.scheduler, - Task::ModifyObserve(ObserveOp::Start { region: r.region }) - ); - } - } - Ok(()) - } } #[cfg(test)] mod tests { + use std::sync::Arc; + use futures::executor::block_on; use kvproto::metapb::*; use tikv::storage::{txn::tests::*, TestEngineBuilder}; use tikv_kv::SnapContext; + use tikv_util::memory::{MemoryQuota, OwnedAllocated}; use txn_types::TimeStamp; use super::EventLoader; @@ -529,10 +510,13 @@ mod tests { let snap = block_on(async { tikv_kv::snapshot(&mut engine, SnapContext::default()).await }) .unwrap(); + let quota_inf = Arc::new(MemoryQuota::new(usize::MAX)); let mut loader = EventLoader::load_from(snap, TimeStamp::zero(), TimeStamp::max(), &r).unwrap(); - let (r, data_load) = with_record_read_throughput(|| loader.fill_entries()); + let (r, data_load) = with_record_read_throughput(|| { + loader.fill_entries(&mut OwnedAllocated::new(quota_inf)) + }); r.unwrap(); let mut events = ApplyEvents::with_capacity(1024, 42); let mut res = TwoPhaseResolver::new(42, None); diff --git a/components/backup-stream/src/lib.rs b/components/backup-stream/src/lib.rs index 3d4690d7f48..0402e5d2ee3 100644 --- a/components/backup-stream/src/lib.rs +++ b/components/backup-stream/src/lib.rs @@ -10,7 +10,7 @@ mod endpoint; pub mod errors; mod event_loader; pub mod metadata; -pub(crate) mod metrics; +pub mod metrics; pub mod observer; pub mod router; mod service; diff --git a/components/backup-stream/src/metadata/client.rs b/components/backup-stream/src/metadata/client.rs index 1fdc1b3b1e8..59f685b9f23 100644 --- a/components/backup-stream/src/metadata/client.rs +++ b/components/backup-stream/src/metadata/client.rs @@ -286,7 +286,19 @@ impl MetadataClient { Ok(()) } - pub async fn get_last_error( + pub async fn get_last_error(&self, name: &str) -> Result> { + let key = MetaKey::last_errors_of(name); + + let r = self.meta_store.get_latest(Keys::Prefix(key)).await?.inner; + if r.is_empty() { + return Ok(None); + } + let r = &r[0]; + let err = protobuf::parse_from_bytes(r.value())?; + Ok(Some(err)) + } + + pub async fn get_last_error_of( &self, name: &str, store_id: u64, @@ -319,6 +331,13 @@ impl MetadataClient { .await } + /// resume a task. + pub async fn resume(&self, name: &str) -> Result<()> { + self.meta_store + .delete(Keys::Key(MetaKey::pause_of(name))) + .await + } + pub async fn get_tasks_pause_status(&self) -> Result, bool>> { let kvs = self .meta_store @@ -342,6 +361,11 @@ impl MetadataClient { defer! 
 { super::metrics::METADATA_OPERATION_LATENCY.with_label_values(&["task_get"]).observe(now.saturating_elapsed().as_secs_f64()) }
+        fail::fail_point!("failed_to_get_task", |_| {
+            Err(Error::MalformedMetadata(
+                "failed to connect etcd client".to_string(),
+            ))
+        });
         let items = self
             .meta_store
             .get_latest(Keys::Key(MetaKey::task_of(name)))
@@ -364,7 +388,7 @@
         }
         fail::fail_point!("failed_to_get_tasks", |_| {
             Err(Error::MalformedMetadata(
-                "faild to connect etcd client".to_string(),
+                "failed to connect etcd client".to_string(),
             ))
         });
         let kvs = self
diff --git a/components/backup-stream/src/router.rs b/components/backup-stream/src/router.rs
index b0d3453c958..a5e6489f6fc 100644
--- a/components/backup-stream/src/router.rs
+++ b/components/backup-stream/src/router.rs
@@ -540,6 +540,15 @@ impl RouterInner {
         let task_info = self.get_task_info(&task).await?;
         task_info.on_events(events).await?;
         let file_size_limit = self.temp_file_size_limit.load(Ordering::SeqCst);
+        #[cfg(feature = "failpoints")]
+        {
+            let delayed = (|| {
+                fail::fail_point!("router_on_event_delay_ms", |v| {
+                    v.and_then(|v| v.parse::<u64>().ok()).unwrap_or(0)
+                })
+            })();
+            tokio::time::sleep(Duration::from_millis(delayed)).await;
+        }
         // When this event makes the size of temporary files exceed the size limit, make
         // a flush. Note that we only flush if the size is less than the limit before
@@ -708,22 +717,25 @@ impl TempFileKey {
     /// The full name of the file that owns the key.
     fn temp_file_name(&self) -> String {
+        let timestamp = (|| {
+            fail::fail_point!("temp_file_name_timestamp", |t| t.map_or_else(
+                || TimeStamp::physical_now(),
+                |v|
+                // reduce the precision of the timestamp
+                v.parse::<u64>().ok().map_or(0, |u| TimeStamp::physical_now() / u)
+            ));
+            TimeStamp::physical_now()
+        })();
+        let uuid = uuid::Uuid::new_v4();
         if self.is_meta {
             format!(
-                "meta_{:08}_{}_{:?}_{}.temp.log",
-                self.region_id,
-                self.cf,
-                self.cmd_type,
-                TimeStamp::physical_now(),
+                "meta_{:08}_{}_{:?}_{:?}_{}.temp.log",
+                self.region_id, self.cf, self.cmd_type, uuid, timestamp,
             )
         } else {
             format!(
-                "{:08}_{:08}_{}_{:?}_{}.temp.log",
-                self.table_id,
-                self.region_id,
-                self.cf,
-                self.cmd_type,
-                TimeStamp::physical_now(),
+                "{:08}_{:08}_{}_{:?}_{:?}_{}.temp.log",
+                self.table_id, self.region_id, self.cf, self.cmd_type, uuid, timestamp,
             )
         }
     }
@@ -819,6 +831,28 @@ pub struct StreamTaskInfo {
     temp_file_pool: Arc<TempFilePool>,
 }
 
+impl Drop for StreamTaskInfo {
+    fn drop(&mut self) {
+        let (success, failed): (Vec<_>, Vec<_>) = self
+            .flushing_files
+            .get_mut()
+            .drain(..)
+ .chain(self.flushing_meta_files.get_mut().drain(..)) + .map(|(_, f, _)| f.inner.path().to_owned()) + .map(|p| self.temp_file_pool.remove(&p)) + .partition(|r| *r); + info!("stream task info dropped[1/2], removing flushing_temp files"; "success" => %success.len(), "failure" => %failed.len()); + let (success, failed): (Vec<_>, Vec<_>) = self + .files + .get_mut() + .drain() + .map(|(_, f)| f.into_inner().inner.path().to_owned()) + .map(|p| self.temp_file_pool.remove(&p)) + .partition(|r| *r); + info!("stream task info dropped[2/2], removing temp files"; "success" => %success.len(), "failure" => %failed.len()); + } +} + impl std::fmt::Debug for StreamTaskInfo { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_struct("StreamTaskInfo") @@ -864,6 +898,7 @@ impl StreamTaskInfo { } async fn on_events_of_key(&self, key: TempFileKey, events: ApplyEvents) -> Result<()> { + fail::fail_point!("before_generate_temp_file"); if let Some(f) = self.files.read().await.get(&key) { self.total_size .fetch_add(f.lock().await.on_events(events).await?, Ordering::SeqCst); @@ -886,6 +921,7 @@ impl StreamTaskInfo { let f = w.get(&key).unwrap(); self.total_size .fetch_add(f.lock().await.on_events(events).await?, Ordering::SeqCst); + fail::fail_point!("after_write_to_file"); Ok(()) } @@ -968,7 +1004,9 @@ impl StreamTaskInfo { pub async fn move_to_flushing_files(&self) -> Result<&Self> { // if flushing_files is not empty, which represents this flush is a retry // operation. - if !self.flushing_files.read().await.is_empty() { + if !self.flushing_files.read().await.is_empty() + || !self.flushing_meta_files.read().await.is_empty() + { return Ok(self); } @@ -1030,7 +1068,12 @@ impl StreamTaskInfo { // and push it into merged_file_info(DataFileGroup). file_info_clone.set_range_offset(stat_length); data_files_open.push({ - let file = shared_pool.open_raw_for_read(data_file.inner.path())?; + let file = shared_pool + .open_raw_for_read(data_file.inner.path()) + .context(format_args!( + "failed to open read file {:?}", + data_file.inner.path() + ))?; let compress_length = file.len().await?; stat_length += compress_length; file_info_clone.set_range_length(compress_length); @@ -1095,7 +1138,6 @@ impl StreamTaskInfo { .await?; self.merge_log(metadata, storage.clone(), &self.flushing_meta_files, true) .await?; - Ok(()) } @@ -1155,7 +1197,8 @@ impl StreamTaskInfo { UnpinReader(Box::new(Cursor::new(meta_buff))), buflen as _, ) - .await?; + .await + .context(format_args!("flush meta {:?}", meta_path))?; } Ok(()) } @@ -1189,13 +1232,14 @@ impl StreamTaskInfo { .await? .generate_metadata(store_id) .await?; + + fail::fail_point!("after_moving_to_flushing_files"); crate::metrics::FLUSH_DURATION .with_label_values(&["generate_metadata"]) .observe(sw.lap().as_secs_f64()); // flush log file to storage. self.flush_log(&mut metadata_info).await?; - // the field `min_resolved_ts` of metadata will be updated // only after flush is done. metadata_info.min_resolved_ts = metadata_info @@ -2068,6 +2112,12 @@ mod tests { let (task, _path) = task("cleanup_test".to_owned()).await?; must_register_table(&router, task, 1).await; write_simple_data(&router).await; + let tempfiles = router + .get_task_info("cleanup_test") + .await + .unwrap() + .temp_file_pool + .clone(); router .get_task_info("cleanup_test") .await? 
@@ -2076,6 +2126,7 @@
         write_simple_data(&router).await;
         let mut w = walkdir::WalkDir::new(&tmp).into_iter();
         assert!(w.next().is_some(), "the temp files weren't created");
+        assert!(tempfiles.mem_used() > 0, "the temp files weren't created.");
         drop(router);
         let w = walkdir::WalkDir::new(&tmp)
             .into_iter()
@@ -2093,6 +2144,11 @@
             "the temp files should be removed, but it is {:?}",
             w
         );
+        assert_eq!(
+            tempfiles.mem_used(),
+            0,
+            "the temp files haven't been cleared."
+        );
         Ok(())
     }
@@ -2411,4 +2467,91 @@
         let r = cfg_manager.dispatch(changed);
         assert!(r.is_err());
     }
+
+    #[tokio::test(flavor = "multi_thread", worker_threads = 4)]
+    async fn test_flush_on_events_race() -> Result<()> {
+        let (tx, _rx) = dummy_scheduler();
+        let tmp = std::env::temp_dir().join(format!("{}", uuid::Uuid::new_v4()));
+        let router = Arc::new(RouterInner::new(
+            tx,
+            Config {
+                prefix: tmp.clone(),
+                // disable auto flush.
+                temp_file_size_limit: 1000,
+                temp_file_memory_quota: 2,
+                max_flush_interval: Duration::from_secs(300),
+            },
+        ));
+
+        let (task, _path) = task("race".to_owned()).await?;
+        must_register_table(router.as_ref(), task, 1).await;
+        router
+            .must_mut_task_info("race", |i| {
+                i.storage = Arc::new(NoopStorage::default());
+            })
+            .await;
+        let mut b = KvEventsBuilder::new(42, 0);
+        b.put_table(CF_DEFAULT, 1, b"k1", b"v1");
+        let events_before_flush = b.finish();
+
+        b.put_table(CF_DEFAULT, 1, b"k1", b"v1");
+        let events_after_flush = b.finish();
+
+        // Reduce the timestamp precision to 1 second.
+        fail::cfg("temp_file_name_timestamp", "return(1000)").unwrap();
+
+        let (trigger_tx, trigger_rx) = std::sync::mpsc::sync_channel(0);
+        let trigger_rx = std::sync::Mutex::new(trigger_rx);
+
+        let (fp_tx, fp_rx) = std::sync::mpsc::sync_channel(0);
+        let fp_rx = std::sync::Mutex::new(fp_rx);
+
+        let t = router.get_task_info("race").await.unwrap();
+        let _ = router.on_events(events_before_flush).await;
+
+        // Make generating the temp file ***happen after*** moving files to
+        // flushing_files, and reading the flushed file ***happen between***
+        // generating the file name and writing the kv to the file.
+        // T1 is the write thread; T2 is the flush thread. The order is:
+        // [T1] generate file name -> [T2] moving files to flushing_files -> [T1] write
+        // kv to file -> [T2] read flush file.
+        fail::cfg_callback("after_write_to_file", move || {
+            fp_tx.send(()).unwrap();
+        })
+        .unwrap();
+
+        fail::cfg_callback("before_generate_temp_file", move || {
+            trigger_rx.lock().unwrap().recv().unwrap();
+        })
+        .unwrap();
+
+        fail::cfg_callback("after_moving_to_flushing_files", move || {
+            trigger_tx.send(()).unwrap();
+            fp_rx.lock().unwrap().recv().unwrap();
+        })
+        .unwrap();
+
+        // Set the flush status to true manually, because we disabled auto flush.
+        t.set_flushing_status(true);
+        let router_clone = router.clone();
+        let _ = tokio::join!(
+            // do the flush in another thread
+            tokio::spawn(async move {
+                router_clone.do_flush("race", 42, TimeStamp::max()).await;
+            }),
+            router.on_events(events_after_flush)
+        );
+        fail::remove("after_write_to_file");
+        fail::remove("before_generate_temp_file");
+        fail::remove("after_moving_to_flushing_files");
+        fail::remove("temp_file_name_timestamp");
+
+        // Set the flush status to true again, because we disabled auto flush.
+        t.set_flushing_status(true);
+        let res = router.do_flush("race", 42, TimeStamp::max()).await;
+        // This time the flush should succeed.
+ assert!(res.is_some()); + assert_eq!(t.files.read().await.len(), 0,); + Ok(()) + } } diff --git a/components/backup-stream/src/subscription_manager.rs b/components/backup-stream/src/subscription_manager.rs index e418d59029d..1c090ceda4d 100644 --- a/components/backup-stream/src/subscription_manager.rs +++ b/components/backup-stream/src/subscription_manager.rs @@ -1,15 +1,7 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. -use std::{ - sync::{ - atomic::{AtomicBool, Ordering}, - Arc, - }, - time::Duration, -}; +use std::{sync::Arc, time::Duration}; -use crossbeam::channel::{Receiver as SyncReceiver, Sender as SyncSender}; -use crossbeam_channel::SendError; use engine_traits::KvEngine; use error_code::ErrorCodeExt; use futures::FutureExt; @@ -22,10 +14,11 @@ use raftstore::{ store::fsm::ChangeObserver, }; use tikv::storage::Statistics; -use tikv_util::{box_err, debug, info, time::Instant, warn, worker::Scheduler}; -use tokio::sync::mpsc::{channel, Receiver, Sender}; +use tikv_util::{ + box_err, debug, info, sys::thread::ThreadBuildWrapper, time::Instant, warn, worker::Scheduler, +}; +use tokio::sync::mpsc::{channel, error::SendError, Receiver, Sender}; use txn_types::TimeStamp; -use yatp::task::callback::Handle as YatpHandle; use crate::{ annotate, @@ -39,13 +32,11 @@ use crate::{ router::{Router, TaskSelector}, subscription_track::{CheckpointType, ResolveResult, SubscriptionTracer}, try_send, - utils::{self, CallbackWaitGroup, Work}, + utils::{self, FutureWaitGroup, Work}, Task, }; -type ScanPool = yatp::ThreadPool; - -const INITIAL_SCAN_FAILURE_MAX_RETRY_TIME: usize = 10; +type ScanPool = tokio::runtime::Runtime; // The retry parameters for failed to get last checkpoint ts. // When PD is temporarily disconnected, we may need this retry. @@ -128,8 +119,9 @@ fn should_retry(err: &Error) -> bool { } /// the abstraction over a "DB" which provides the initial scanning. -trait InitialScan: Clone { - fn do_initial_scan( +#[async_trait::async_trait] +trait InitialScan: Clone + Sync + Send + 'static { + async fn do_initial_scan( &self, region: &Region, start_ts: TimeStamp, @@ -139,13 +131,13 @@ trait InitialScan: Clone { fn handle_fatal_error(&self, region: &Region, err: Error); } -impl InitialScan for InitialDataLoader +#[async_trait::async_trait] +impl InitialScan for InitialDataLoader where E: KvEngine, - R: RegionInfoProvider + Clone + 'static, - RT: CdcHandle, + RT: CdcHandle + Sync + 'static, { - fn do_initial_scan( + async fn do_initial_scan( &self, region: &Region, start_ts: TimeStamp, @@ -155,12 +147,14 @@ where let h = handle.clone(); // Note: we have external retry at `ScanCmd::exec_by_with_retry`, should we keep // retrying here? - let snap = self.observe_over_with_retry(region, move || { - ChangeObserver::from_pitr(region_id, handle.clone()) - })?; + let snap = self + .observe_over_with_retry(region, move || { + ChangeObserver::from_pitr(region_id, handle.clone()) + }) + .await?; #[cfg(feature = "failpoints")] fail::fail_point!("scan_after_get_snapshot"); - let stat = self.do_initial_scan(region, h, start_ts, snap)?; + let stat = self.do_initial_scan(region, h, start_ts, snap).await?; Ok(stat) } @@ -180,7 +174,7 @@ where impl ScanCmd { /// execute the initial scanning via the specificated [`InitialDataLoader`]. - fn exec_by(&self, initial_scan: impl InitialScan) -> Result<()> { + async fn exec_by(&self, initial_scan: impl InitialScan) -> Result<()> { let Self { region, handle, @@ -188,7 +182,9 @@ impl ScanCmd { .. 
} = self; let begin = Instant::now_coarse(); - let stat = initial_scan.do_initial_scan(region, *last_checkpoint, handle.clone())?; + let stat = initial_scan + .do_initial_scan(region, *last_checkpoint, handle.clone()) + .await?; info!("initial scanning finished!"; "takes" => ?begin.saturating_elapsed(), "from_ts" => %last_checkpoint, utils::slog_region(region)); utils::record_cf_stat("lock", &stat.lock); utils::record_cf_stat("write", &stat.write); @@ -197,17 +193,15 @@ impl ScanCmd { } /// execute the command, when meeting error, retrying. - fn exec_by_with_retry(self, init: impl InitialScan, cancel: &AtomicBool) { - let mut retry_time = INITIAL_SCAN_FAILURE_MAX_RETRY_TIME; + async fn exec_by_with_retry(self, init: impl InitialScan) { + let mut retry_time = TRY_START_OBSERVE_MAX_RETRY_TIME; loop { - if cancel.load(Ordering::SeqCst) { - return; - } - match self.exec_by(init.clone()) { + match self.exec_by(init.clone()).await { Err(err) if should_retry(&err) && retry_time > 0 => { - // NOTE: blocking this thread may stick the process. - // Maybe spawn a task to tokio and reschedule the task then? - std::thread::sleep(Duration::from_millis(500)); + tokio::time::sleep(backoff_for_start_observe( + TRY_START_OBSERVE_MAX_RETRY_TIME - retry_time, + )) + .await; warn!("meet retryable error"; "err" => %err, "retry_time" => retry_time); retry_time -= 1; continue; @@ -223,82 +217,62 @@ impl ScanCmd { } } -fn scan_executor_loop( - init: impl InitialScan, - cmds: SyncReceiver, - canceled: Arc, -) { - while let Ok(cmd) = cmds.recv() { - fail::fail_point!("execute_scan_command"); +async fn scan_executor_loop(init: impl InitialScan, mut cmds: Receiver) { + while let Some(cmd) = cmds.recv().await { debug!("handling initial scan request"; "region_id" => %cmd.region.get_id()); metrics::PENDING_INITIAL_SCAN_LEN .with_label_values(&["queuing"]) .dec(); - if canceled.load(Ordering::Acquire) { - return; + #[cfg(feature = "failpoints")] + { + let sleep = (|| { + fail::fail_point!("execute_scan_command_sleep_100", |_| { 100 }); + 0 + })(); + tokio::time::sleep(std::time::Duration::from_secs(sleep)).await; } - metrics::PENDING_INITIAL_SCAN_LEN - .with_label_values(&["executing"]) - .inc(); - cmd.exec_by_with_retry(init.clone(), &canceled); - metrics::PENDING_INITIAL_SCAN_LEN - .with_label_values(&["executing"]) - .dec(); + let init = init.clone(); + tokio::task::spawn(async move { + metrics::PENDING_INITIAL_SCAN_LEN + .with_label_values(&["executing"]) + .inc(); + cmd.exec_by_with_retry(init).await; + metrics::PENDING_INITIAL_SCAN_LEN + .with_label_values(&["executing"]) + .dec(); + }); } } /// spawn the executors in the scan pool. -/// we make workers thread instead of spawn scan task directly into the pool -/// because the [`InitialDataLoader`] isn't `Sync` hence we must use it very -/// carefully or rustc (along with tokio) would complain that we made a `!Send` -/// future. so we have moved the data loader to the synchronous context so its -/// reference won't be shared between threads any more. 
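The worker model changes here: instead of yatp callback-pool threads blocking on a crossbeam channel (with a hand-rolled `stopped` flag), a dedicated tokio runtime runs one async loop that receives commands from a bounded mpsc channel and spawns each as a task, so a slow scan no longer pins a worker thread and shutdown falls out of channel closure, as the new `spawn_executors` below shows. A sketch of the shape, with a hypothetical `Cmd` standing in for `ScanCmd`:

```rust
use tokio::sync::mpsc;

// Hypothetical stand-in for `ScanCmd`.
struct Cmd(u64);

async fn executor_loop(mut rx: mpsc::Receiver<Cmd>) {
    // `recv()` returns `None` once every sender is dropped, which doubles
    // as the shutdown signal; no explicit `stopped` flag is needed.
    while let Some(cmd) = rx.recv().await {
        // Spawn each command so one slow scan doesn't block the queue.
        tokio::spawn(async move {
            println!("scanning region {}", cmd.0);
        });
    }
}

fn main() {
    // A dedicated runtime, like `create_scan_pool(num_threads)`.
    let pool = tokio::runtime::Builder::new_multi_thread()
        .thread_name("scan-pool-sketch")
        .enable_time()
        .worker_threads(2)
        .build()
        .unwrap();

    // Bounded channel: `send(...).await` applies backpressure when it is
    // full, as `ScanPoolHandle::request` does.
    let (tx, rx) = mpsc::channel(16);
    pool.spawn(executor_loop(rx));

    pool.block_on(async move {
        for i in 0..4 {
            tx.send(Cmd(i)).await.expect("executor loop is gone");
        }
        // `tx` is dropped here, which ends the executor loop.
    });
    // Dropping the runtime shuts it down.
}
```

Keeping the sender in `ScanPoolHandle` (rather than letting every caller spawn into the runtime directly) is what lets the manager stay ignorant of the concrete `InitialScan` implementation.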
-fn spawn_executors(init: impl InitialScan + Send + 'static, number: usize) -> ScanPoolHandle { - let (tx, rx) = crossbeam::channel::bounded(MESSAGE_BUFFER_SIZE); +fn spawn_executors( + init: impl InitialScan + Send + Sync + 'static, + number: usize, +) -> ScanPoolHandle { + let (tx, rx) = tokio::sync::mpsc::channel(MESSAGE_BUFFER_SIZE); let pool = create_scan_pool(number); - let stopped = Arc::new(AtomicBool::new(false)); - for _ in 0..number { - let init = init.clone(); - let rx = rx.clone(); - let stopped = stopped.clone(); - pool.spawn(move |_: &mut YatpHandle<'_>| { - let _io_guard = file_system::WithIoType::new(file_system::IoType::Replication); - scan_executor_loop(init, rx, stopped); - }) - } - ScanPoolHandle { - tx, - _pool: pool, - stopped, - } + pool.spawn(async move { + scan_executor_loop(init, rx).await; + }); + ScanPoolHandle { tx, _pool: pool } } struct ScanPoolHandle { - tx: SyncSender, - stopped: Arc, + // Theoretically, we can get rid of the sender, and spawn a new task via initial loader in each + // thread. But that will make `SubscribeManager` holds a reference to the implementation of + // `InitialScan`, which will get the type information a mass. + tx: Sender, - // in fact, we won't use the pool any more. - // but we should hold the reference to the pool so it won't try to join the threads running. _pool: ScanPool, } -impl Drop for ScanPoolHandle { - fn drop(&mut self) { - self.stopped.store(true, Ordering::Release); - } -} - impl ScanPoolHandle { - fn request(&self, cmd: ScanCmd) -> std::result::Result<(), SendError> { - if self.stopped.load(Ordering::Acquire) { - warn!("scan pool is stopped, ignore the scan command"; "region" => %cmd.region.get_id()); - return Ok(()); - } + async fn request(&self, cmd: ScanCmd) -> std::result::Result<(), SendError> { metrics::PENDING_INITIAL_SCAN_LEN .with_label_values(&["queuing"]) .inc(); - self.tx.send(cmd) + self.tx.send(cmd).await } } @@ -322,7 +296,9 @@ pub struct RegionSubscriptionManager { messenger: Sender, scan_pool_handle: Arc, - scans: Arc, + scans: Arc, + + advance_ts_interval: Duration, } impl Clone for RegionSubscriptionManager @@ -343,16 +319,26 @@ where subs: self.subs.clone(), messenger: self.messenger.clone(), scan_pool_handle: self.scan_pool_handle.clone(), - scans: CallbackWaitGroup::new(), + scans: FutureWaitGroup::new(), + advance_ts_interval: self.advance_ts_interval, } } } -/// Create a yatp pool for doing initial scanning. +/// Create a pool for doing initial scanning. fn create_scan_pool(num_threads: usize) -> ScanPool { - yatp::Builder::new("log-backup-scan") - .max_thread_count(num_threads) - .build_callback_pool() + tokio::runtime::Builder::new_multi_thread() + .with_sys_and_custom_hooks( + move || { + file_system::set_io_type(file_system::IoType::Replication); + }, + || {}, + ) + .thread_name("log-backup-scan") + .enable_time() + .worker_threads(num_threads) + .build() + .unwrap() } impl RegionSubscriptionManager @@ -367,22 +353,25 @@ where /// /// a two-tuple, the first is the handle to the manager, the second is the /// operator loop future. 
- pub fn start( - initial_loader: InitialDataLoader, + pub fn start( + initial_loader: InitialDataLoader, + regions: R, observer: BackupStreamObserver, meta_cli: MetadataClient, pd_client: Arc, scan_pool_size: usize, - resolver: BackupStreamResolver, + resolver: BackupStreamResolver, + advance_ts_interval: Duration, ) -> (Self, future![()]) where E: KvEngine, - RT: CdcHandle + 'static, + HInit: CdcHandle + Sync + 'static, + HChkLd: CdcHandle + 'static, { let (tx, rx) = channel(MESSAGE_BUFFER_SIZE); let scan_pool_handle = spawn_executors(initial_loader.clone(), scan_pool_size); let op = Self { - regions: initial_loader.regions.clone(), + regions, meta_cli, pd_client, range_router: initial_loader.sink.clone(), @@ -391,7 +380,8 @@ where subs: initial_loader.tracing, messenger: tx, scan_pool_handle: Arc::new(scan_pool_handle), - scans: CallbackWaitGroup::new(), + scans: FutureWaitGroup::new(), + advance_ts_interval, }; let fut = op.clone().region_operator_loop(rx, resolver); (op, fut) @@ -408,8 +398,10 @@ where } /// wait initial scanning get finished. - pub fn wait(&self, timeout: Duration) -> future![bool] { - tokio::time::timeout(timeout, self.scans.wait()).map(|result| result.is_err()) + pub async fn wait(&self, timeout: Duration) -> bool { + tokio::time::timeout(timeout, self.scans.wait()) + .map(move |result| result.is_err()) + .await } /// the handler loop. @@ -481,13 +473,20 @@ where } } ObserveOp::ResolveRegions { callback, min_ts } => { + fail::fail_point!("subscription_manager_resolve_regions"); let now = Instant::now(); let timedout = self.wait(Duration::from_secs(5)).await; if timedout { warn!("waiting for initial scanning done timed out, forcing progress!"; "take" => ?now.saturating_elapsed(), "timedout" => %timedout); } - let regions = resolver.resolve(self.subs.current_regions(), min_ts).await; + let regions = resolver + .resolve( + self.subs.current_regions(), + min_ts, + Some(self.advance_ts_interval), + ) + .await; let cps = self.subs.resolve_with(min_ts, regions); let min_region = cps.iter().min_by_key(|rs| rs.checkpoint); // If there isn't any region observed, the `min_ts` can be used as resolved ts @@ -522,7 +521,8 @@ where region, self.get_last_checkpoint_of(&for_task, region).await?, handle.clone(), - ); + ) + .await; Result::Ok(()) } .await; @@ -567,7 +567,8 @@ where Err(Error::Other(box_err!("Nature is boring"))) }); let tso = self.get_last_checkpoint_of(&for_task, region).await?; - self.observe_over_with_initial_data_from_checkpoint(region, tso, handle.clone()); + self.observe_over_with_initial_data_from_checkpoint(region, tso, handle.clone()) + .await; } } Ok(()) @@ -702,13 +703,13 @@ where Ok(cp.ts) } - fn spawn_scan(&self, cmd: ScanCmd) { + async fn spawn_scan(&self, cmd: ScanCmd) { // we should not spawn initial scanning tasks to the tokio blocking pool // because it is also used for converting sync File I/O to async. (for now!) // In that condition, if we blocking for some resources(for example, the // `MemoryQuota`) at the block threads, we may meet some ghosty // deadlock. 
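
The comment above describes a classic executor hazard: if every thread of a pool blocks synchronously on a resource that only another task of the same pool can release, the pool deadlocks. Awaiting a bounded send parks only the task, not the thread. A contrived single-threaded demonstration of that difference:

use tokio::sync::mpsc;

#[tokio::main(flavor = "current_thread")]
async fn main() {
    let (tx, mut rx) = mpsc::channel::<u32>(1);
    let producer = tokio::spawn(async move {
        for i in 0..3 {
            // With a full buffer this await parks the *task*, not the
            // thread, so the consumer below still runs on the same worker.
            tx.send(i).await.unwrap();
        }
    });
    while let Some(v) = rx.recv().await {
        println!("got {v}");
    }
    producer.await.unwrap();
}
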
- let s = self.scan_pool_handle.request(cmd); + let s = self.scan_pool_handle.request(cmd).await; if let Err(err) = s { let region_id = err.0.region.get_id(); annotate!(err, "BUG: scan_pool closed") @@ -716,7 +717,7 @@ where } } - fn observe_over_with_initial_data_from_checkpoint( + async fn observe_over_with_initial_data_from_checkpoint( &self, region: &Region, last_checkpoint: TimeStamp, @@ -730,6 +731,7 @@ where last_checkpoint, _work: self.scans.clone().work(), }) + .await } fn find_task_by_region(&self, r: &Region) -> Option { @@ -748,8 +750,9 @@ mod test { #[derive(Clone, Copy)] struct NoopInitialScan; + #[async_trait::async_trait] impl InitialScan for NoopInitialScan { - fn do_initial_scan( + async fn do_initial_scan( &self, _region: &Region, _start_ts: txn_types::TimeStamp, @@ -769,7 +772,7 @@ mod test { use std::time::Duration; use super::ScanCmd; - use crate::{subscription_manager::spawn_executors, utils::CallbackWaitGroup}; + use crate::{subscription_manager::spawn_executors, utils::FutureWaitGroup}; fn should_finish_in(f: impl FnOnce() + Send + 'static, d: std::time::Duration) { let (tx, rx) = futures::channel::oneshot::channel(); @@ -786,18 +789,21 @@ mod test { } let pool = spawn_executors(NoopInitialScan, 1); - let wg = CallbackWaitGroup::new(); - fail::cfg("execute_scan_command", "sleep(100)").unwrap(); + let wg = FutureWaitGroup::new(); + fail::cfg("execute_scan_command_sleep_100", "return").unwrap(); for _ in 0..100 { let wg = wg.clone(); - pool.request(ScanCmd { - region: Default::default(), - handle: Default::default(), - last_checkpoint: Default::default(), - // Note: Maybe make here a Box or some other trait? - _work: wg.work(), - }) - .unwrap() + assert!( + pool._pool + .block_on(pool.request(ScanCmd { + region: Default::default(), + handle: Default::default(), + last_checkpoint: Default::default(), + // Note: Maybe make here a Box or some other trait? + _work: wg.work(), + })) + .is_ok() + ) } should_finish_in(move || drop(pool), Duration::from_secs(5)); diff --git a/components/backup-stream/src/subscription_track.rs b/components/backup-stream/src/subscription_track.rs index e92759bc2b2..c70ad9c8038 100644 --- a/components/backup-stream/src/subscription_track.rs +++ b/components/backup-stream/src/subscription_track.rs @@ -8,8 +8,8 @@ use dashmap::{ }; use kvproto::metapb::Region; use raftstore::coprocessor::*; -use resolved_ts::Resolver; -use tikv_util::{info, warn}; +use resolved_ts::{Resolver, TsSource, TxnLocks}; +use tikv_util::{info, memory::MemoryQuota, warn}; use txn_types::TimeStamp; use crate::{debug, metrics::TRACK_REGION, utils}; @@ -82,6 +82,7 @@ impl ActiveSubscription { self.handle.stop_observing(); } + #[cfg(test)] pub fn is_observing(&self) -> bool { self.handle.is_observing() } @@ -99,7 +100,7 @@ impl ActiveSubscription { pub enum CheckpointType { MinTs, StartTsOfInitialScan, - StartTsOfTxn(Option>), + StartTsOfTxn(Option<(TimeStamp, TxnLocks)>), } impl std::fmt::Debug for CheckpointType { @@ -109,10 +110,7 @@ impl std::fmt::Debug for CheckpointType { Self::StartTsOfInitialScan => write!(f, "StartTsOfInitialScan"), Self::StartTsOfTxn(arg0) => f .debug_tuple("StartTsOfTxn") - .field(&format_args!( - "{}", - utils::redact(&arg0.as_ref().map(|x| x.as_ref()).unwrap_or(&[])) - )) + .field(&format_args!("{:?}", arg0)) .finish(), } } @@ -322,6 +320,7 @@ impl SubscriptionTracer { } /// check whether the region_id should be observed by this observer. 
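
Because do_initial_scan is now awaited, the InitialScan trait itself has to become async, which is what the #[async_trait::async_trait] attribute on the test impl above provides. Here is the pattern in isolation, with the trait reduced to a stub and the error type simplified:

use async_trait::async_trait;

#[async_trait]
trait InitialScan: Clone + Send + Sync {
    async fn do_initial_scan(&self, region_id: u64) -> Result<(), String>;
}

#[derive(Clone, Copy)]
struct NoopInitialScan;

#[async_trait]
impl InitialScan for NoopInitialScan {
    async fn do_initial_scan(&self, _region_id: u64) -> Result<(), String> {
        // A real implementation would read the region snapshot here.
        Ok(())
    }
}

#[tokio::main(flavor = "current_thread")]
async fn main() {
    NoopInitialScan.do_initial_scan(1).await.unwrap();
}
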
+ #[cfg(test)] pub fn is_observing(&self, region_id: u64) -> bool { let sub = self.0.get_mut(®ion_id); match sub { @@ -401,7 +400,7 @@ impl<'a> SubscriptionRef<'a> { } } -/// This enhanced version of `Resolver` allow some unordered lock events. +/// This enhanced version of `Resolver` allow some unordered lock events. /// The name "2-phase" means this is used for 2 *concurrency* phases of /// observing a region: /// 1. Doing the initial scanning. @@ -466,9 +465,11 @@ impl std::fmt::Debug for FutureLock { impl TwoPhaseResolver { /// try to get one of the key of the oldest lock in the resolver. - pub fn sample_far_lock(&self) -> Option> { - let (_, keys) = self.resolver.locks().first_key_value()?; - keys.iter().next().cloned() + pub fn sample_far_lock(&self) -> Option<(TimeStamp, TxnLocks)> { + self.resolver + .locks() + .first_key_value() + .map(|(ts, txn_locks)| (*ts, txn_locks.clone())) } pub fn in_phase_one(&self) -> bool { @@ -479,7 +480,8 @@ impl TwoPhaseResolver { if !self.in_phase_one() { warn!("backup stream tracking lock as if in phase one"; "start_ts" => %start_ts, "key" => %utils::redact(&key)) } - self.resolver.track_lock(start_ts, key, None) + // TODO: handle memory quota exceed, for now, quota is set to usize::MAX. + self.resolver.track_lock(start_ts, key, None).unwrap(); } pub fn track_lock(&mut self, start_ts: TimeStamp, key: Vec) { @@ -487,7 +489,8 @@ impl TwoPhaseResolver { self.future_locks.push(FutureLock::Lock(key, start_ts)); return; } - self.resolver.track_lock(start_ts, key, None) + // TODO: handle memory quota exceed, for now, quota is set to usize::MAX. + self.resolver.track_lock(start_ts, key, None).unwrap(); } pub fn untrack_lock(&mut self, key: &[u8]) { @@ -501,7 +504,10 @@ impl TwoPhaseResolver { fn handle_future_lock(&mut self, lock: FutureLock) { match lock { - FutureLock::Lock(key, ts) => self.resolver.track_lock(ts, key, None), + FutureLock::Lock(key, ts) => { + // TODO: handle memory quota exceed, for now, quota is set to usize::MAX. + self.resolver.track_lock(ts, key, None).unwrap(); + } FutureLock::Unlock(key) => self.resolver.untrack_lock(&key, None), } } @@ -511,7 +517,7 @@ impl TwoPhaseResolver { return min_ts.min(stable_ts); } - self.resolver.resolve(min_ts, None) + self.resolver.resolve(min_ts, None, TsSource::BackupStream) } pub fn resolved_ts(&self) -> TimeStamp { @@ -523,8 +529,10 @@ impl TwoPhaseResolver { } pub fn new(region_id: u64, stable_ts: Option) -> Self { + // TODO: limit the memory usage of the resolver. + let memory_quota = Arc::new(MemoryQuota::new(std::usize::MAX)); Self { - resolver: Resolver::new(region_id), + resolver: Resolver::new(region_id, memory_quota), future_locks: Default::default(), stable_ts, } @@ -541,7 +549,7 @@ impl TwoPhaseResolver { // advance the internal resolver. // the start ts of initial scanning would be a safe ts for min ts // -- because is used to be a resolved ts. 
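
track_lock is now fallible because the underlying Resolver charges every tracked lock against a shared MemoryQuota; the patch sidesteps the error path for now by handing the resolver a usize::MAX quota and unwrapping, per the TODOs above. To illustrate the accounting such a quota performs, here is a toy model (deliberately not the tikv_util type):

use std::sync::atomic::{AtomicUsize, Ordering};

struct MemoryQuota {
    in_use: AtomicUsize,
    capacity: usize,
}

impl MemoryQuota {
    fn new(capacity: usize) -> Self {
        Self { in_use: AtomicUsize::new(0), capacity }
    }

    // Charge `bytes` against the quota; fail instead of overcommitting.
    fn alloc(&self, bytes: usize) -> Result<(), ()> {
        let mut used = self.in_use.load(Ordering::Relaxed);
        loop {
            if used + bytes > self.capacity {
                return Err(());
            }
            match self.in_use.compare_exchange_weak(
                used,
                used + bytes,
                Ordering::Relaxed,
                Ordering::Relaxed,
            ) {
                Ok(_) => return Ok(()),
                Err(cur) => used = cur,
            }
        }
    }

    fn free(&self, bytes: usize) {
        self.in_use.fetch_sub(bytes, Ordering::Relaxed);
    }
}

fn main() {
    let quota = MemoryQuota::new(1024);
    assert!(quota.alloc(512).is_ok());
    assert!(quota.alloc(1024).is_err()); // would exceed the cap
    quota.free(512);
    assert!(quota.alloc(1024).is_ok());
}
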
- self.resolver.resolve(ts, None); + self.resolver.resolve(ts, None, TsSource::BackupStream); } None => { warn!("BUG: a two-phase resolver is executing phase_one_done when not in phase one"; "resolver" => ?self) @@ -565,6 +573,7 @@ mod test { use kvproto::metapb::{Region, RegionEpoch}; use raftstore::coprocessor::ObserveHandle; + use resolved_ts::TxnLocks; use txn_types::TimeStamp; use super::{SubscriptionTracer, TwoPhaseResolver}; @@ -667,7 +676,13 @@ mod test { ( region(4, 8, 1), 128.into(), - StartTsOfTxn(Some(Arc::from(b"Alpi".as_slice()))) + StartTsOfTxn(Some(( + TimeStamp::new(128), + TxnLocks { + lock_count: 1, + sample_lock: Some(Arc::from(b"Alpi".as_slice())), + } + ))) ), ] ); diff --git a/components/backup-stream/src/tempfiles.rs b/components/backup-stream/src/tempfiles.rs index add1ee67c12..b8f9c9e1120 100644 --- a/components/backup-stream/src/tempfiles.rs +++ b/components/backup-stream/src/tempfiles.rs @@ -259,6 +259,11 @@ impl TempFilePool { &self.cfg } + #[cfg(test)] + pub fn mem_used(&self) -> usize { + self.current.load(Ordering::Acquire) + } + /// Create a file for writting. /// This function is synchronous so we can call it easier in the polling /// context. (Anyway, it is really hard to call an async function in the diff --git a/components/backup-stream/src/utils.rs b/components/backup-stream/src/utils.rs index 1b150eaa1f0..6cf5fd80d9b 100644 --- a/components/backup-stream/src/utils.rs +++ b/components/backup-stream/src/utils.rs @@ -5,27 +5,26 @@ use std::{ borrow::Borrow, cell::RefCell, collections::{hash_map::RandomState, BTreeMap, HashMap}, + future::Future, ops::{Bound, RangeBounds}, path::Path, sync::{ atomic::{AtomicUsize, Ordering}, Arc, }, - task::Context, + task::{Context, Waker}, time::Duration, }; use async_compression::{tokio::write::ZstdEncoder, Level}; use engine_rocks::ReadPerfInstant; use engine_traits::{CfName, CF_DEFAULT, CF_LOCK, CF_RAFT, CF_WRITE}; -use futures::{channel::mpsc, executor::block_on, ready, task::Poll, FutureExt, StreamExt}; +use futures::{ready, task::Poll}; use kvproto::{ brpb::CompressionType, metapb::Region, raft_cmdpb::{CmdType, Request}, }; -use raft::StateRole; -use raftstore::{coprocessor::RegionInfoProvider, RegionInfo}; use tikv::storage::CfStatistics; use tikv_util::{ box_err, @@ -33,20 +32,18 @@ use tikv_util::{ self_thread_inspector, IoStat, ThreadInspector, ThreadInspectorImpl as OsInspector, }, time::Instant, - warn, worker::Scheduler, Either, }; use tokio::{ fs::File, io::{AsyncRead, AsyncWrite, AsyncWriteExt, BufWriter}, - sync::{oneshot, Mutex, RwLock}, + sync::{Mutex, RwLock}, }; use txn_types::{Key, Lock, LockType}; use crate::{ errors::{Error, Result}, - metadata::store::BoxFuture, router::TaskSelector, Task, }; @@ -79,65 +76,6 @@ pub fn redact(key: &impl AsRef<[u8]>) -> log_wrappers::Value<'_> { log_wrappers::Value::key(key.as_ref()) } -/// RegionPager seeks regions with leader role in the range. -pub struct RegionPager
<P>
{ - regions: P, - start_key: Vec, - end_key: Vec, - reach_last_region: bool, -} - -impl RegionPager

{ - pub fn scan_from(regions: P, start_key: Vec, end_key: Vec) -> Self { - Self { - regions, - start_key, - end_key, - reach_last_region: false, - } - } - - pub fn next_page(&mut self, size: usize) -> Result> { - if self.start_key >= self.end_key || self.reach_last_region { - return Ok(vec![]); - } - - let (mut tx, rx) = mpsc::channel(size); - let end_key = self.end_key.clone(); - self.regions - .seek_region( - &self.start_key, - Box::new(move |i| { - let r = i - .filter(|r| r.role == StateRole::Leader) - .take(size) - .take_while(|r| r.region.start_key < end_key) - .try_for_each(|r| tx.try_send(r.clone())); - if let Err(_err) = r { - warn!("failed to scan region and send to initlizer") - } - }), - ) - .map_err(|err| { - Error::Other(box_err!( - "failed to seek region for start key {}: {}", - redact(&self.start_key), - err - )) - })?; - let collected_regions = block_on(rx.collect::>()); - self.start_key = collected_regions - .last() - .map(|region| region.region.end_key.to_owned()) - // no leader region found. - .unwrap_or_default(); - if self.start_key.is_empty() { - self.reach_last_region = true; - } - Ok(collected_regions) - } -} - /// StopWatch is a utility for record time cost in multi-stage tasks. /// NOTE: Maybe it should be generic over somewhat Clock type? pub struct StopWatch(Instant); @@ -440,47 +378,65 @@ pub fn should_track_lock(l: &Lock) -> bool { } } -pub struct CallbackWaitGroup { +pub struct FutureWaitGroup { running: AtomicUsize, - on_finish_all: std::sync::Mutex>>, + wakers: std::sync::Mutex>, +} + +pub struct Work(Arc); + +impl Drop for Work { + fn drop(&mut self) { + self.0.work_done(); + } +} + +pub struct WaitAll<'a>(&'a FutureWaitGroup); + +impl<'a> Future for WaitAll<'a> { + type Output = (); + + fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + // Fast path: nothing to wait. + let running = self.0.running.load(Ordering::SeqCst); + if running == 0 { + return Poll::Ready(()); + } + + // <1> + let mut callbacks = self.0.wakers.lock().unwrap(); + callbacks.push(cx.waker().clone()); + let running = self.0.running.load(Ordering::SeqCst); + // Unlikely path: if all background tasks finish at <1>, there will be a long + // period that nobody will wake the `wakers` even the condition is ready. + // We need to help ourselves here. + if running == 0 { + callbacks.drain(..).for_each(|w| w.wake()); + } + Poll::Pending + } } -impl CallbackWaitGroup { +impl FutureWaitGroup { pub fn new() -> Arc { Arc::new(Self { running: AtomicUsize::new(0), - on_finish_all: std::sync::Mutex::default(), + wakers: Default::default(), }) } fn work_done(&self) { let last = self.running.fetch_sub(1, Ordering::SeqCst); if last == 1 { - self.on_finish_all - .lock() - .unwrap() - .drain(..) - .for_each(|x| x()) + self.wakers.lock().unwrap().drain(..).for_each(|x| { + x.wake(); + }) } } /// wait until all running tasks done. - pub fn wait(&self) -> BoxFuture<()> { - // Fast path: no uploading. - if self.running.load(Ordering::SeqCst) == 0 { - return Box::pin(futures::future::ready(())); - } - - let (tx, rx) = oneshot::channel(); - self.on_finish_all.lock().unwrap().push(Box::new(move || { - // The waiter may timed out. - let _ = tx.send(()); - })); - // try to acquire the lock again. 
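
The rewritten wait group is purely drop-driven: each Work guard marks one running job, and WaitAll registers a waker and then re-reads the counter to close the race flagged at <1> above. A condensed, self-contained analogue of the same idea:

use std::{
    future::Future,
    pin::Pin,
    sync::{
        atomic::{AtomicUsize, Ordering},
        Arc, Mutex,
    },
    task::{Context, Poll, Waker},
};

#[derive(Default)]
struct WaitGroup {
    running: AtomicUsize,
    wakers: Mutex<Vec<Waker>>,
}

struct Work(Arc<WaitGroup>);

impl WaitGroup {
    // Like the patch's `work()`: the guard itself is the registration.
    fn work(self: Arc<Self>) -> Work {
        self.running.fetch_add(1, Ordering::SeqCst);
        Work(self)
    }
}

impl Drop for Work {
    fn drop(&mut self) {
        if self.0.running.fetch_sub(1, Ordering::SeqCst) == 1 {
            self.0.wakers.lock().unwrap().drain(..).for_each(Waker::wake);
        }
    }
}

struct WaitAll(Arc<WaitGroup>);

impl Future for WaitAll {
    type Output = ();

    fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<()> {
        // Fast path: nothing is running.
        if self.0.running.load(Ordering::SeqCst) == 0 {
            return Poll::Ready(());
        }
        let mut wakers = self.0.wakers.lock().unwrap();
        wakers.push(cx.waker().clone());
        // Re-check after registering: the last job may have finished in
        // between, in which case nobody else would ever wake us.
        if self.0.running.load(Ordering::SeqCst) == 0 {
            wakers.drain(..).for_each(Waker::wake);
        }
        Poll::Pending
    }
}

#[tokio::main(flavor = "current_thread")]
async fn main() {
    let wg = Arc::new(WaitGroup::default());
    for i in 0..4 {
        let work = wg.clone().work();
        tokio::spawn(async move {
            let _work = work; // dropped when the job body finishes
            println!("job {i} done");
        });
    }
    WaitAll(wg).await;
    println!("all jobs done");
}
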
- if self.running.load(Ordering::SeqCst) == 0 { - return Box::pin(futures::future::ready(())); - } - Box::pin(rx.map(|_| ())) + pub fn wait(&self) -> WaitAll<'_> { + WaitAll(self) } /// make a work, as long as the return value held, mark a work in the group @@ -491,14 +447,6 @@ impl CallbackWaitGroup { } } -pub struct Work(Arc); - -impl Drop for Work { - fn drop(&mut self) { - self.0.work_done(); - } -} - struct ReadThroughputRecorder { // The system tool set. ins: Option, @@ -874,7 +822,7 @@ mod test { use kvproto::metapb::{Region, RegionEpoch}; use tokio::io::{AsyncWriteExt, BufReader}; - use crate::utils::{is_in_range, CallbackWaitGroup, SegmentMap}; + use crate::utils::{is_in_range, FutureWaitGroup, SegmentMap}; #[test] fn test_redact() { @@ -983,8 +931,8 @@ mod test { } fn run_case(c: Case) { + let wg = FutureWaitGroup::new(); for i in 0..c.repeat { - let wg = CallbackWaitGroup::new(); let cnt = Arc::new(AtomicUsize::new(c.bg_task)); for _ in 0..c.bg_task { let cnt = cnt.clone(); @@ -995,7 +943,7 @@ mod test { }); } block_on(tokio::time::timeout(Duration::from_secs(20), wg.wait())).unwrap(); - assert_eq!(cnt.load(Ordering::SeqCst), 0, "{:?}@{}", c, i); + assert_eq!(cnt.load(Ordering::SeqCst), 0, "{:?}@{}", c, i,); } } @@ -1012,6 +960,10 @@ mod test { bg_task: 512, repeat: 1, }, + Case { + bg_task: 16, + repeat: 10000, + }, Case { bg_task: 2, repeat: 100000, diff --git a/components/backup-stream/tests/failpoints/mod.rs b/components/backup-stream/tests/failpoints/mod.rs index ff9b9f82ba1..53f5d5b0ddc 100644 --- a/components/backup-stream/tests/failpoints/mod.rs +++ b/components/backup-stream/tests/failpoints/mod.rs @@ -9,7 +9,13 @@ pub use suite::*; mod all { - use std::time::Duration; + use std::{ + sync::{ + atomic::{AtomicBool, Ordering}, + Arc, + }, + time::Duration, + }; use backup_stream::{ metadata::{ @@ -19,18 +25,48 @@ mod all { GetCheckpointResult, RegionCheckpointOperation, RegionSet, Task, }; use futures::executor::block_on; - use tikv_util::defer; + use tikv_util::{ + config::{ReadableDuration, ReadableSize}, + defer, + }; + use txn_types::Key; use super::{ make_record_key, make_split_key_at_record, mutation, run_async_test, SuiteBuilder, }; + use crate::{make_table_key, Suite}; + + #[test] + fn failed_register_task() { + let suite = SuiteBuilder::new_named("failed_register_task").build(); + fail::cfg("load_task::error_when_fetching_ranges", "return").unwrap(); + let cli = suite.get_meta_cli(); + block_on(cli.insert_task_with_range( + &suite.simple_task("failed_register_task"), + &[(&make_table_key(1, b""), &make_table_key(2, b""))], + )) + .unwrap(); + + for _ in 0..10 { + if block_on(cli.get_last_error_of("failed_register_task", 1)) + .unwrap() + .is_some() + { + return; + } + std::thread::sleep(Duration::from_millis(100)); + } + + suite.dump_slash_etc(); + panic!("No error uploaded when failed to comminate to PD."); + } #[test] fn basic() { let mut suite = SuiteBuilder::new_named("basic").build(); fail::cfg("try_start_observe", "1*return").unwrap(); - run_async_test(async { + let (round1, round2) = run_async_test(async { // write data before the task starting, for testing incremental scanning. 
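
The new failed_register_task test above relies on the fail crate: a named failpoint compiled into the production code path is armed from the test with an action string such as "return" or "1*return". A minimal sketch of that mechanism outside the test suite (it needs the crate's failpoints feature to do anything):

// Cargo.toml (assumed): fail = { version = "0.5", features = ["failpoints"] }

fn fetch_ranges() -> Result<Vec<(u64, u64)>, String> {
    // Expands to a hook the test can arm by name at runtime.
    fail::fail_point!("load_task::error_when_fetching_ranges", |_| {
        Err("injected fetch error".to_owned())
    });
    Ok(vec![(1, 2)])
}

fn main() {
    assert!(fetch_ranges().is_ok());
    // "1*return" fires the closure exactly once, then disarms itself.
    fail::cfg("load_task::error_when_fetching_ranges", "1*return").unwrap();
    assert!(fetch_ranges().is_err());
    assert!(fetch_ranges().is_ok());
    fail::remove("load_task::error_when_fetching_ranges");
}
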
let round1 = suite.write_records(0, 128, 1).await; suite.must_register_task(1, "test_basic"); @@ -38,13 +74,13 @@ mod all { let round2 = suite.write_records(256, 128, 1).await; suite.force_flush_files("test_basic"); suite.wait_for_flush(); - suite - .check_for_write_records( - suite.flushed_files.path(), - round1.union(&round2).map(Vec::as_slice), - ) - .await; + (round1, round2) }); + suite.check_for_write_records( + suite.flushed_files.path(), + round1.union(&round2).map(Vec::as_slice), + ); + suite.cluster.shutdown(); } #[test] @@ -97,10 +133,10 @@ mod all { let keys2 = run_async_test(suite.write_records(256, 128, 1)); suite.force_flush_files("region_failure"); suite.wait_for_flush(); - run_async_test(suite.check_for_write_records( + suite.check_for_write_records( suite.flushed_files.path(), keys.union(&keys2).map(|s| s.as_slice()), - )); + ); } #[test] fn initial_scan_failure() { @@ -121,10 +157,10 @@ mod all { let keys2 = run_async_test(suite.write_records(256, 128, 1)); suite.force_flush_files("initial_scan_failure"); suite.wait_for_flush(); - run_async_test(suite.check_for_write_records( + suite.check_for_write_records( suite.flushed_files.path(), keys.union(&keys2).map(|s| s.as_slice()), - )); + ); } #[test] fn failed_during_refresh_region() { @@ -147,10 +183,10 @@ mod all { let keys2 = run_async_test(suite.write_records(256, 128, 1)); suite.force_flush_files("fail_to_refresh_region"); suite.wait_for_flush(); - run_async_test(suite.check_for_write_records( + suite.check_for_write_records( suite.flushed_files.path(), keys.union(&keys2).map(|s| s.as_slice()), - )); + ); let leader = suite.cluster.leader_of_region(1).unwrap().store_id; let (tx, rx) = std::sync::mpsc::channel(); suite.endpoints[&leader] @@ -186,7 +222,8 @@ mod all { suite.must_split(&make_split_key_at_record(1, 42)); std::thread::sleep(Duration::from_secs(2)); - let error = run_async_test(suite.get_meta_cli().get_last_error("retry_abort", 1)).unwrap(); + let error = + run_async_test(suite.get_meta_cli().get_last_error_of("retry_abort", 1)).unwrap(); let error = error.expect("no error uploaded"); error .get_error_message() @@ -212,12 +249,7 @@ mod all { let items = run_async_test(suite.write_records(0, 128, 1)); suite.force_flush_files("retry_abort"); suite.wait_for_flush(); - run_async_test( - suite.check_for_write_records( - suite.flushed_files.path(), - items.iter().map(Vec::as_slice), - ), - ); + suite.check_for_write_records(suite.flushed_files.path(), items.iter().map(Vec::as_slice)); } #[test] fn failure_and_split() { @@ -240,12 +272,139 @@ mod all { let round2 = run_async_test(suite.write_records(256, 128, 1)); suite.force_flush_files("failure_and_split"); suite.wait_for_flush(); - run_async_test(suite.check_for_write_records( + suite.check_for_write_records( suite.flushed_files.path(), round1.union(&round2).map(Vec::as_slice), - )); + ); let cp = suite.global_checkpoint(); assert!(cp > 512, "it is {}", cp); suite.cluster.shutdown(); } + + #[test] + fn memory_quota() { + let mut suite = SuiteBuilder::new_named("memory_quota") + .cfg(|cfg| cfg.initial_scan_pending_memory_quota = ReadableSize::kb(2)) + .build(); + let keys = run_async_test(suite.write_records(0, 128, 1)); + let failed = Arc::new(AtomicBool::new(false)); + fail::cfg("router_on_event_delay_ms", "6*return(1000)").unwrap(); + fail::cfg_callback("scan_and_async_send::about_to_consume", { + let failed = failed.clone(); + move || { + let v = backup_stream::metrics::HEAP_MEMORY.get(); + // Not greater than max key length * concurrent initial scan number. 
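+                    // (4096 matches the length of PROMISED_LONG_VALUE; the
+                    // factor of 6 matches the six delayed events injected by
+                    // the `router_on_event_delay_ms` failpoint above.)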
+ if v > 4096 * 6 { + println!("[[ FAILED ]] The memory usage is {v} which exceeds the quota"); + failed.store(true, Ordering::SeqCst); + } + } + }) + .unwrap(); + suite.must_register_task(1, "memory_quota"); + suite.force_flush_files("memory_quota"); + suite.wait_for_flush(); + suite.check_for_write_records( + suite.flushed_files.path(), + keys.iter().map(|v| v.as_slice()), + ); + assert!(!failed.load(Ordering::SeqCst)); + } + + #[test] + fn failed_to_get_task_when_pausing() { + let suite = SuiteBuilder::new_named("resume_error").nodes(1).build(); + suite.must_register_task(1, "resume_error"); + let mcli = suite.get_meta_cli(); + run_async_test(mcli.pause("resume_error")).unwrap(); + suite.sync(); + fail::cfg("failed_to_get_task", "1*return").unwrap(); + run_async_test(mcli.resume("resume_error")).unwrap(); + suite.sync(); + // Make sure our suite doesn't panic. + suite.sync(); + } + + #[test] + fn resolve_during_flushing() { + let mut suite = SuiteBuilder::new_named("resolve_during_flushing") + .cfg(|cfg| { + cfg.min_ts_interval = ReadableDuration::days(1); + cfg.initial_scan_concurrency = 1; + }) + .nodes(2) + .build(); + suite.must_register_task(1, "resolve_during_flushing"); + let key = make_record_key(1, 1); + + let start_ts = suite.tso(); + suite.must_kv_prewrite( + 1, + vec![mutation( + key.clone(), + Suite::PROMISED_SHORT_VALUE.to_owned(), + )], + key.clone(), + start_ts, + ); + fail::cfg("after_moving_to_flushing_files", "pause").unwrap(); + suite.force_flush_files("resolve_during_flushing"); + let commit_ts = suite.tso(); + suite.just_commit_a_key(key.clone(), start_ts, commit_ts); + suite.run(|| Task::RegionCheckpointsOp(RegionCheckpointOperation::PrepareMinTsForResolve)); + // Wait until the resolve done. Sadly for now we don't have good solutions :( + std::thread::sleep(Duration::from_secs(2)); + fail::remove("after_moving_to_flushing_files"); + suite.wait_for_flush(); + assert_eq!(suite.global_checkpoint(), start_ts.into_inner()); + // transfer the leader, make sure everything has been flushed. 
+ suite.must_shuffle_leader(1); + suite.wait_with(|cfg| cfg.initial_scan_semaphore.available_permits() > 0); + suite.force_flush_files("resolve_during_flushing"); + suite.wait_for_flush(); + let enc_key = Key::from_raw(&key).append_ts(commit_ts); + suite.check_for_write_records( + suite.flushed_files.path(), + std::iter::once(enc_key.as_encoded().as_slice()), + ); + } + + #[test] + fn commit_during_flushing() { + let mut suite = SuiteBuilder::new_named("commit_during_flushing") + .nodes(1) + .build(); + suite.must_register_task(1, "commit_during_flushing"); + let key = make_record_key(1, 1); + let start_ts = suite.tso(); + suite.must_kv_prewrite( + 1, + vec![mutation( + key.clone(), + Suite::PROMISED_SHORT_VALUE.to_owned(), + )], + key.clone(), + start_ts, + ); + fail::cfg("subscription_manager_resolve_regions", "pause").unwrap(); + let commit_ts = suite.tso(); + suite.force_flush_files("commit_during_flushing"); + suite.sync(); + suite.sync(); + fail::cfg("log_backup_batch_delay", "return(2000)").unwrap(); + suite.just_commit_a_key(key.clone(), start_ts, commit_ts); + fail::remove("subscription_manager_resolve_regions"); + suite.wait_for_flush(); + let enc_key = Key::from_raw(&key).append_ts(commit_ts); + assert!( + suite.global_checkpoint() > commit_ts.into_inner(), + "{} {:?}", + suite.global_checkpoint(), + commit_ts + ); + suite.check_for_write_records( + suite.flushed_files.path(), + std::iter::once(enc_key.as_encoded().as_slice()), + ) + } } diff --git a/components/backup-stream/tests/integration/mod.rs b/components/backup-stream/tests/integration/mod.rs index a209572c6d8..04fee6b2c09 100644 --- a/components/backup-stream/tests/integration/mod.rs +++ b/components/backup-stream/tests/integration/mod.rs @@ -16,6 +16,7 @@ mod all { use futures::{Stream, StreamExt}; use pd_client::PdClient; use test_raftstore::IsolationFilterFactory; + use tikv::config::BackupStreamConfig; use tikv_util::{box_err, defer, info, HandyRwLock}; use tokio::time::timeout; use txn_types::{Key, TimeStamp}; @@ -27,20 +28,19 @@ mod all { #[test] fn with_split() { let mut suite = SuiteBuilder::new_named("with_split").build(); - run_async_test(async { + let (round1, round2) = run_async_test(async { let round1 = suite.write_records(0, 128, 1).await; suite.must_split(&make_split_key_at_record(1, 42)); suite.must_register_task(1, "test_with_split"); let round2 = suite.write_records(256, 128, 1).await; - suite.force_flush_files("test_with_split"); - suite.wait_for_flush(); - suite - .check_for_write_records( - suite.flushed_files.path(), - round1.union(&round2).map(Vec::as_slice), - ) - .await; + (round1, round2) }); + suite.force_flush_files("test_with_split"); + suite.wait_for_flush(); + suite.check_for_write_records( + suite.flushed_files.path(), + round1.union(&round2).map(Vec::as_slice), + ); suite.cluster.shutdown(); } @@ -62,7 +62,7 @@ mod all { #[test] fn with_split_txn() { let mut suite = SuiteBuilder::new_named("split_txn").build(); - run_async_test(async { + let (commit_ts, start_ts, keys) = run_async_test(async { let start_ts = suite.cluster.pd_client.get_tso().await.unwrap(); let keys = (1..1960).map(|i| make_record_key(1, i)).collect::>(); suite.must_kv_prewrite( @@ -75,26 +75,25 @@ mod all { start_ts, ); let commit_ts = suite.cluster.pd_client.get_tso().await.unwrap(); - suite.commit_keys(keys[1913..].to_vec(), start_ts, commit_ts); - suite.must_register_task(1, "test_split_txn"); - suite.commit_keys(keys[..1913].to_vec(), start_ts, commit_ts); - suite.force_flush_files("test_split_txn"); - 
suite.wait_for_flush(); - let keys_encoded = keys - .iter() - .map(|v| { - Key::from_raw(v.as_slice()) - .append_ts(commit_ts) - .into_encoded() - }) - .collect::>(); - suite - .check_for_write_records( - suite.flushed_files.path(), - keys_encoded.iter().map(Vec::as_slice), - ) - .await; + (commit_ts, start_ts, keys) }); + suite.commit_keys(keys[1913..].to_vec(), start_ts, commit_ts); + suite.must_register_task(1, "test_split_txn"); + suite.commit_keys(keys[..1913].to_vec(), start_ts, commit_ts); + suite.force_flush_files("test_split_txn"); + suite.wait_for_flush(); + let keys_encoded = keys + .iter() + .map(|v| { + Key::from_raw(v.as_slice()) + .append_ts(commit_ts) + .into_encoded() + }) + .collect::>(); + suite.check_for_write_records( + suite.flushed_files.path(), + keys_encoded.iter().map(Vec::as_slice), + ); suite.cluster.shutdown(); } @@ -110,10 +109,10 @@ mod all { let round2 = run_async_test(suite.write_records(256, 128, 1)); suite.force_flush_files("test_leader_down"); suite.wait_for_flush(); - run_async_test(suite.check_for_write_records( + suite.check_for_write_records( suite.flushed_files.path(), round1.union(&round2).map(Vec::as_slice), - )); + ); suite.cluster.shutdown(); } @@ -161,7 +160,7 @@ mod all { let err = run_async_test( suite .get_meta_cli() - .get_last_error("test_fatal_error", *victim), + .get_last_error_of("test_fatal_error", *victim), ) .unwrap() .unwrap(); @@ -345,10 +344,10 @@ mod all { } assert_eq!(items.last().unwrap().end_key, Vec::::default()); - run_async_test(suite.check_for_write_records( + suite.check_for_write_records( suite.flushed_files.path(), round1.union(&round2).map(|x| x.as_slice()), - )); + ); } #[test] @@ -372,18 +371,18 @@ mod all { .unwrap(); suite.sync(); std::thread::sleep(Duration::from_secs(2)); - run_async_test(suite.check_for_write_records( + suite.check_for_write_records( suite.flushed_files.path(), round1.iter().map(|x| x.as_slice()), - )); + ); assert!(suite.global_checkpoint() > 256); suite.force_flush_files("r"); suite.wait_for_flush(); assert!(suite.global_checkpoint() > 512); - run_async_test(suite.check_for_write_records( + suite.check_for_write_records( suite.flushed_files.path(), round1.union(&round2).map(|x| x.as_slice()), - )); + ); } #[test] @@ -425,9 +424,30 @@ mod all { ts, cps ); - run_async_test(suite.check_for_write_records( + suite.check_for_write_records( suite.flushed_files.path(), round1.iter().map(|k| k.as_slice()), - )) + ) + } + + #[test] + fn update_config() { + let suite = SuiteBuilder::new_named("network_partition") + .nodes(1) + .build(); + let mut basic_config = BackupStreamConfig::default(); + basic_config.initial_scan_concurrency = 4; + suite.run(|| Task::ChangeConfig(basic_config.clone())); + suite.wait_with(|e| { + assert_eq!(e.initial_scan_semaphore.available_permits(), 4,); + true + }); + + basic_config.initial_scan_concurrency = 16; + suite.run(|| Task::ChangeConfig(basic_config.clone())); + suite.wait_with(|e| { + assert_eq!(e.initial_scan_semaphore.available_permits(), 16,); + true + }); } } diff --git a/components/backup-stream/tests/suite.rs b/components/backup-stream/tests/suite.rs index e1df628d76b..af2ca08a92f 100644 --- a/components/backup-stream/tests/suite.rs +++ b/components/backup-stream/tests/suite.rs @@ -2,7 +2,8 @@ use std::{ collections::{HashMap, HashSet}, - path::Path, + fmt::Display, + path::{Path, PathBuf}, sync::Arc, time::Duration, }; @@ -30,38 +31,41 @@ use kvproto::{ tikvpb::*, }; use pd_client::PdClient; -use protobuf::parse_from_bytes; -use raftstore::{ - 
router::{CdcRaftRouter, ServerRaftStoreRouter}, - RegionInfoAccessor, -}; +use raftstore::{router::CdcRaftRouter, RegionInfoAccessor}; use resolved_ts::LeadershipResolver; use tempdir::TempDir; use test_pd_client::TestPdClient; -use test_raftstore::{new_server_cluster, Cluster, ServerCluster, SimulateTransport}; +use test_raftstore::{new_server_cluster, Cluster, ServerCluster}; use test_util::retry; -use tikv::config::BackupStreamConfig; +use tikv::config::{BackupStreamConfig, ResolvedTsConfig}; use tikv_util::{ codec::{ number::NumberEncoder, stream_event::{EventIterator, Iterator}, }, - info, + debug, info, worker::LazyWorker, HandyRwLock, }; use txn_types::{Key, TimeStamp, WriteRef}; use walkdir::WalkDir; +#[derive(Debug)] +pub struct FileSegments { + path: PathBuf, + segments: Vec<(usize, usize)>, +} + +#[derive(Default, Debug)] +pub struct LogFiles { + default_cf: Vec, + write_cf: Vec, +} + pub type TestEndpoint = Endpoint< ErrorStore, RegionInfoAccessor, engine_test::kv::KvTestEngine, - CdcRaftRouter< - SimulateTransport< - ServerRaftStoreRouter, - >, - >, TestPdClient, >; @@ -259,6 +263,9 @@ pub struct Suite { } impl Suite { + pub const PROMISED_SHORT_VALUE: &'static [u8] = b"hello, world"; + pub const PROMISED_LONG_VALUE: &'static [u8] = &[0xbb; 4096]; + pub fn simple_task(&self, name: &str) -> StreamTask { let mut task = StreamTask::default(); task.info.set_name(name.to_owned()); @@ -343,7 +350,6 @@ impl Suite { let (_, port) = server.bind_addrs().next().unwrap(); let addr = format!("127.0.0.1:{}", port); let channel = ChannelBuilder::new(self.env.clone()).connect(&addr); - println!("connecting channel to {} for store {}", addr, id); let client = LogBackupClient::new(channel); self.servers.push(server); client @@ -376,6 +382,7 @@ impl Suite { id, self.meta_store.clone(), cfg, + ResolvedTsConfig::default(), worker.scheduler(), ob, regions, @@ -391,6 +398,11 @@ impl Suite { MetadataClient::new(self.meta_store.clone(), 0) } + #[allow(dead_code)] + pub fn dump_slash_etc(&self) { + self.meta_store.inner.blocking_lock().dump(); + } + pub fn must_split(&mut self, key: &[u8]) { let region = self.cluster.get_region(key); self.cluster.must_split(®ion, key); @@ -461,7 +473,12 @@ impl Suite { for ts in (from..(from + n)).map(|x| x * 2) { let ts = ts as u64; let key = make_record_key(for_table, ts); - let muts = vec![mutation(key.clone(), b"hello, world".to_vec())]; + let value = if ts % 4 == 0 { + Self::PROMISED_SHORT_VALUE.to_vec() + } else { + Self::PROMISED_LONG_VALUE.to_vec() + }; + let muts = vec![mutation(key.clone(), value)]; let enc_key = Key::from_raw(&key).into_encoded(); let region = self.cluster.get_region_id(&enc_key); let start_ts = self.cluster.pd_client.get_tso().await.unwrap(); @@ -518,45 +535,52 @@ impl Suite { } } - pub fn load_metadata_for_write_records( - &self, - path: &Path, - ) -> HashMap> { - let mut meta_map: HashMap> = HashMap::new(); - for entry in WalkDir::new(path) { - let entry = entry.unwrap(); - if entry.file_type().is_file() - && entry - .file_name() - .to_str() - .map_or(false, |s| s.ends_with(".meta")) - { - let content = std::fs::read(entry.path()).unwrap(); - let meta = parse_from_bytes::(content.as_ref()).unwrap(); - for g in meta.file_groups.into_iter() { - let path = g.path.split('/').last().unwrap(); - for f in g.data_files_info.into_iter() { - let file_info = meta_map.get_mut(path); - if let Some(v) = file_info { - v.push(( - f.range_offset as usize, - (f.range_offset + f.range_length) as usize, - )); + pub fn get_files_to_check(&self, path: &Path) 
-> std::io::Result { + let mut res = LogFiles::default(); + for entry in WalkDir::new(path.join("v1/backupmeta")) { + let entry = entry?; + if entry.file_name().to_str().unwrap().ends_with(".meta") { + let content = std::fs::read(entry.path())?; + let meta = protobuf::parse_from_bytes::(&content)?; + for fg in meta.get_file_groups() { + let mut default_segs = vec![]; + let mut write_segs = vec![]; + for file in fg.get_data_files_info() { + let v = if file.cf == "default" || file.cf.is_empty() { + Some(&mut default_segs) + } else if file.cf == "write" { + Some(&mut write_segs) } else { - let v = vec![( - f.range_offset as usize, - (f.range_offset + f.range_length) as usize, - )]; - meta_map.insert(String::from(path), v); - } + None + }; + v.into_iter().for_each(|v| { + v.push(( + file.get_range_offset() as usize, + (file.get_range_offset() + file.get_range_length()) as usize, + )) + }); + } + let p = path.join(fg.get_path()); + if !default_segs.is_empty() { + res.default_cf.push(FileSegments { + path: p.clone(), + segments: default_segs, + }) + } + if !write_segs.is_empty() { + res.write_cf.push(FileSegments { + path: p, + segments: write_segs, + }) } } } } - meta_map + Ok(res) } - pub async fn check_for_write_records<'a>( + #[track_caller] + pub fn check_for_write_records<'a>( &self, path: &Path, key_set: impl std::iter::Iterator, @@ -565,41 +589,68 @@ impl Suite { let n = remain_keys.len(); let mut extra_key = 0; let mut extra_len = 0; - let meta_map = self.load_metadata_for_write_records(path); - for entry in WalkDir::new(path) { - let entry = entry.unwrap(); - println!("checking: {:?}", entry); - if entry.file_type().is_file() - && entry - .file_name() - .to_str() - .map_or(false, |s| s.ends_with(".log")) - { - let buf = std::fs::read(entry.path()).unwrap(); - let file_infos = meta_map.get(entry.file_name().to_str().unwrap()).unwrap(); - for &file_info in file_infos { - let mut decoder = ZstdDecoder::new(Vec::new()); - let pbuf: &[u8] = &buf[file_info.0..file_info.1]; - decoder.write_all(pbuf).await.unwrap(); - decoder.flush().await.unwrap(); - decoder.close().await.unwrap(); - let content = decoder.into_inner(); - - let mut iter = EventIterator::new(&content); - loop { - if !iter.valid() { - break; - } - iter.next().unwrap(); - if !remain_keys.remove(iter.key()) { - extra_key += 1; - extra_len += iter.key().len() + iter.value().len(); - } + let files = self.get_files_to_check(path).unwrap_or_default(); + let mut default_keys = HashSet::new(); + let content_of = |buf: &[u8], range: (usize, usize)| { + let mut decoder = ZstdDecoder::new(Vec::new()); + let pbuf: &[u8] = &buf[range.0..range.1]; + run_async_test(async { + decoder.write_all(pbuf).await.unwrap(); + decoder.flush().await.unwrap(); + decoder.close().await.unwrap(); + }); + decoder.into_inner() + }; + for entry in files.write_cf { + debug!("checking write: {:?}", entry); + + let buf = std::fs::read(&entry.path).unwrap(); + for &file_info in entry.segments.iter() { + let data = content_of(&buf, file_info); + let mut iter = EventIterator::new(&data); + loop { + if !iter.valid() { + break; + } + iter.next().unwrap(); + if !remain_keys.remove(iter.key()) { + extra_key += 1; + extra_len += iter.key().len() + iter.value().len(); + } + + let value = iter.value(); + let wf = WriteRef::parse(value).unwrap(); + if wf.short_value.is_none() { + let mut key = Key::from_encoded_slice(iter.key()).truncate_ts().unwrap(); + key.append_ts_inplace(wf.start_ts); + + default_keys.insert(key.into_encoded()); + } else { + assert_eq!(wf.short_value, 
Some(Self::PROMISED_SHORT_VALUE)); + } + } + } + } + + for entry in files.default_cf { + debug!("checking default: {:?}", entry); - let value = iter.value(); - let wf = WriteRef::parse(value).unwrap(); - assert_eq!(wf.short_value, Some(b"hello, world" as &[u8])); + let buf = std::fs::read(&entry.path).unwrap(); + for &file_info in entry.segments.iter() { + let data = content_of(&buf, file_info); + let mut iter = EventIterator::new(&data); + loop { + if !iter.valid() { + break; + } + iter.next().unwrap(); + if !default_keys.remove(iter.key()) { + extra_key += 1; + extra_len += iter.key().len() + iter.value().len(); } + + let value = iter.value(); + assert_eq!(value, Self::PROMISED_LONG_VALUE); } } } @@ -613,17 +664,19 @@ impl Suite { extra_len ) } - if !remain_keys.is_empty() { - panic!( - "not all keys are recorded: it remains {:?} (total = {})", - remain_keys - .iter() - .take(3) - .map(|v| hex::encode(v)) - .collect::>(), - remain_keys.len() - ); - } + assert_empty(&remain_keys, "not all keys are recorded"); + assert_empty(&default_keys, "some keys don't have default entry"); + } +} + +#[track_caller] +fn assert_empty(v: &HashSet>, msg: impl Display) { + if !v.is_empty() { + panic!( + "{msg}: it remains {:?}... (total = {})", + v.iter().take(3).map(|v| hex::encode(v)).collect::>(), + v.len() + ); } } diff --git a/components/backup/src/disk_snap.rs b/components/backup/src/disk_snap.rs new file mode 100644 index 00000000000..94d956cc11c --- /dev/null +++ b/components/backup/src/disk_snap.rs @@ -0,0 +1,372 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. +//! This module contains things about disk snapshot. + +use std::{ + future::Pending, + sync::{ + atomic::{AtomicU64, Ordering}, + Arc, + }, + task::Poll, + time::Duration, +}; + +use futures::future; +use futures_util::{ + future::{BoxFuture, FutureExt}, + sink::SinkExt, + stream::{AbortHandle, Abortable, StreamExt}, +}; +use grpcio::{RpcStatus, RpcStatusCode, WriteFlags}; +use kvproto::{ + brpb::{ + PrepareSnapshotBackupEventType as PEvnT, PrepareSnapshotBackupRequest as PReq, + PrepareSnapshotBackupRequestType as PReqT, PrepareSnapshotBackupResponse as PResp, + }, + errorpb::{self, StaleCommand}, + metapb::Region, +}; +use raftstore::store::{ + snapshot_backup::{ + AbortReason, PrepareDiskSnapObserver, SnapshotBrHandle, SnapshotBrWaitApplyRequest, + }, + SnapshotBrWaitApplySyncer, +}; +use tikv_util::{sys::thread::ThreadBuildWrapper, warn, Either}; +use tokio::{ + runtime::{Handle, Runtime}, + sync::oneshot, +}; +use tokio_stream::Stream; + +const DEFAULT_RT_THREADS: usize = 2; + +type Result = std::result::Result; + +enum Error { + Uninitialized, + LeaseExpired, + /// Wait apply has been aborted. + /// When the `reason` is `None`, implies the request itself has been + /// canceled (seldom) due to message lost or something. + WaitApplyAborted(Option), + RaftStore(raftstore::Error), +} + +enum HandleErr { + AbortStream(RpcStatus), + SendErrResp(errorpb::Error), +} + +pub struct ResultSink(grpcio::DuplexSink); + +impl From> for ResultSink { + fn from(value: grpcio::DuplexSink) -> Self { + Self(value) + } +} + +impl ResultSink { + async fn send( + mut self, + result: Result, + error_extra_info: impl FnOnce(&mut PResp), + ) -> grpcio::Result { + match result { + // Note: should we batch here? 
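+            // Region-level failures are answered inline with an error
+            // response so the client can retry that region, while
+            // stream-level failures (`HandleErr` above) abort the whole
+            // duplex sink with an RPC status.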
+ Ok(item) => self.0.send((item, WriteFlags::default())).await?, + Err(err) => match err.into() { + HandleErr::AbortStream(status) => { + self.0.fail(status.clone()).await?; + return Err(grpcio::Error::RpcFinished(Some(status))); + } + HandleErr::SendErrResp(err) => { + let mut resp = PResp::new(); + error_extra_info(&mut resp); + resp.set_error(err); + self.0.send((resp, WriteFlags::default())).await?; + } + }, + } + Ok(self) + } +} + +impl From for HandleErr { + fn from(value: Error) -> Self { + match value { + Error::Uninitialized => HandleErr::AbortStream(RpcStatus::with_message( + grpcio::RpcStatusCode::UNAVAILABLE, + "coprocessor not initialized".to_owned(), + )), + Error::RaftStore(r) => HandleErr::SendErrResp(errorpb::Error::from(r)), + Error::WaitApplyAborted(reason) => HandleErr::SendErrResp({ + let mut err = errorpb::Error::new(); + err.set_message(format!("wait apply has been aborted, perhaps epoch not match or leadership changed, note = {:?}", reason)); + match reason { + Some(AbortReason::EpochNotMatch(enm)) => err.set_epoch_not_match(enm), + Some(AbortReason::StaleCommand { .. }) => { + err.set_stale_command(StaleCommand::new()) + } + _ => {} + } + err + }), + Error::LeaseExpired => HandleErr::AbortStream(RpcStatus::with_message( + grpcio::RpcStatusCode::FAILED_PRECONDITION, + "the lease has expired, you may not send `wait_apply` because it is no meaning" + .to_string(), + )), + } + } +} + +#[derive(Clone)] +pub struct Env { + pub(crate) handle: SR, + rejector: Arc, + active_stream: Arc, + // Left: a shared tokio runtime. + // Right: a hosted runtime(usually for test cases). + runtime: Either>, +} + +impl Env { + pub fn new( + handle: SR, + rejector: Arc, + runtime: Option, + ) -> Self { + let runtime = match runtime { + None => Either::Right(Self::default_runtime()), + Some(rt) => Either::Left(rt), + }; + Self { + handle, + rejector, + active_stream: Arc::new(AtomicU64::new(0)), + runtime, + } + } + + pub fn active_stream(&self) -> u64 { + self.active_stream.load(Ordering::SeqCst) + } + + pub fn get_async_runtime(&self) -> &Handle { + match &self.runtime { + Either::Left(h) => h, + Either::Right(rt) => rt.handle(), + } + } + + fn check_initialized(&self) -> Result<()> { + if !self.rejector.initialized() { + return Err(Error::Uninitialized); + } + Ok(()) + } + + fn check_rejected(&self) -> Result<()> { + self.check_initialized()?; + if self.rejector.allowed() { + return Err(Error::LeaseExpired); + } + Ok(()) + } + + fn update_lease(&self, lease_dur: Duration) -> Result { + self.check_initialized()?; + let mut event = PResp::new(); + event.set_ty(PEvnT::UpdateLeaseResult); + event.set_last_lease_is_valid(self.rejector.update_lease(lease_dur)); + Ok(event) + } + + fn reset(&self) -> PResp { + let rejected = !self.rejector.allowed(); + self.rejector.reset(); + let mut event = PResp::new(); + event.set_ty(PEvnT::UpdateLeaseResult); + event.set_last_lease_is_valid(rejected); + event + } + + fn default_runtime() -> Arc { + let rt = tokio::runtime::Builder::new_multi_thread() + .worker_threads(DEFAULT_RT_THREADS) + .enable_all() + .with_sys_hooks() + .thread_name("snap_br_backup_prepare") + .build() + .unwrap(); + Arc::new(rt) + } +} + +pub struct StreamHandleLoop { + pending_regions: Vec)>>, + env: Env, + aborted: Abortable>, +} + +impl Drop for StreamHandleLoop { + fn drop(&mut self) { + self.env.active_stream.fetch_sub(1, Ordering::SeqCst); + } +} + +enum StreamHandleEvent { + Req(PReq), + WaitApplyDone(Region, Result<()>), + ConnectionGone(Option), + Abort, +} + +impl 
StreamHandleLoop { + pub fn new(env: Env) -> (Self, AbortHandle) { + let (aborted, handle) = futures_util::future::abortable(std::future::pending()); + env.active_stream.fetch_add(1, Ordering::SeqCst); + let this = Self { + env, + aborted, + pending_regions: vec![], + }; + (this, handle) + } + + fn async_wait_apply(&mut self, region: &Region) -> BoxFuture<'static, (Region, Result<()>)> { + if let Err(err) = self.env.check_rejected() { + return Box::pin(future::ready((region.clone(), Err(err)))); + } + + let (tx, rx) = oneshot::channel(); + let syncer = SnapshotBrWaitApplySyncer::new(region.id, tx); + let handle = self.env.handle.clone(); + let region = region.clone(); + let epoch = region.get_region_epoch().clone(); + let id = region.get_id(); + let send_res = handle + .send_wait_apply(id, SnapshotBrWaitApplyRequest::strict(syncer, epoch)) + .map_err(Error::RaftStore); + Box::pin( + async move { + send_res?; + rx.await + .map_err(|_| Error::WaitApplyAborted(None)) + .and_then(|report| match report.aborted { + Some(reason) => Err(Error::WaitApplyAborted(Some(reason))), + None => Ok(()), + }) + } + .map(move |res| (region, res)), + ) + } + + async fn next_event( + &mut self, + input: &mut (impl Stream> + Unpin), + ) -> StreamHandleEvent { + let pending_regions = &mut self.pending_regions; + let wait_applies = future::poll_fn(|cx| { + let selected = pending_regions.iter_mut().enumerate().find_map(|(i, fut)| { + match fut.poll_unpin(cx) { + Poll::Ready(r) => Some((i, r)), + Poll::Pending => None, + } + }); + match selected { + Some((i, region)) => { + // We have polled the future (and make sure it has ready) before, it is + // safe to drop this future directly. + let _ = pending_regions.swap_remove(i); + region.into() + } + None => Poll::Pending, + } + }); + + tokio::select! { + wres = wait_applies => { + StreamHandleEvent::WaitApplyDone(wres.0, wres.1) + } + req = input.next() => { + match req { + Some(Ok(req)) => StreamHandleEvent::Req(req), + Some(Err(err)) => StreamHandleEvent::ConnectionGone(Some(err)), + None => StreamHandleEvent::ConnectionGone(None) + } + } + _ = &mut self.aborted => { + StreamHandleEvent::Abort + } + } + } + + pub async fn run( + mut self, + mut input: impl Stream> + Unpin, + mut sink: ResultSink, + ) -> grpcio::Result<()> { + loop { + match self.next_event(&mut input).await { + StreamHandleEvent::Req(req) => match req.get_ty() { + PReqT::UpdateLease => { + let lease_dur = Duration::from_secs(req.get_lease_in_seconds()); + sink = sink + .send(self.env.update_lease(lease_dur), |resp| { + resp.set_ty(PEvnT::UpdateLeaseResult); + }) + .await?; + } + PReqT::WaitApply => { + let regions = req.get_regions(); + for region in regions { + let res = self.async_wait_apply(region); + self.pending_regions.push(res); + } + } + PReqT::Finish => { + sink.send(Ok(self.env.reset()), |_| {}) + .await? 
+ .0 + .close() + .await?; + return Ok(()); + } + }, + StreamHandleEvent::WaitApplyDone(region, res) => { + let resp = res.map(|_| { + let mut resp = PResp::new(); + resp.set_region(region.clone()); + resp.set_ty(PEvnT::WaitApplyDone); + resp + }); + sink = sink + .send(resp, |resp| { + resp.set_ty(PEvnT::WaitApplyDone); + resp.set_region(region); + }) + .await?; + } + StreamHandleEvent::ConnectionGone(err) => { + warn!("the client has gone, aborting loop"; "err" => ?err); + return match err { + None => Ok(()), + Some(err) => Err(err), + }; + } + StreamHandleEvent::Abort => { + warn!("Aborted disk snapshot prepare loop by the server."); + return sink + .0 + .fail(RpcStatus::with_message( + RpcStatusCode::CANCELLED, + "the loop has been aborted by server".to_string(), + )) + .await; + } + } + } + } +} diff --git a/components/backup/src/endpoint.rs b/components/backup/src/endpoint.rs index a4efc162092..4ceb538bb56 100644 --- a/components/backup/src/endpoint.rs +++ b/components/backup/src/endpoint.rs @@ -44,7 +44,7 @@ use tikv_util::{ warn, worker::Runnable, }; -use tokio::runtime::Runtime; +use tokio::runtime::{Handle, Runtime}; use txn_types::{Key, Lock, TimeStamp}; use crate::{ @@ -780,9 +780,13 @@ impl Progress { /// Forward the progress by `ranges` BackupRanges /// /// The size of the returned BackupRanges should <= `ranges` - fn forward(&mut self, limit: usize, replica_read: bool) -> Vec { + /// + /// Notice: Returning an empty BackupRanges means that no leader region + /// corresponding to the current range is sought. The caller should + /// call `forward` again to seek regions for the next range. + fn forward(&mut self, limit: usize, replica_read: bool) -> Option> { if self.finished { - return Vec::new(); + return None; } let store_id = self.store_id; let (tx, rx) = mpsc::channel(); @@ -810,7 +814,13 @@ impl Progress { break; } } - let peer = find_peer(region, store_id).unwrap().to_owned(); + let peer = if let Some(peer) = find_peer(region, store_id) { + peer.to_owned() + } else { + // skip the region at this time, and would retry to backup the region in + // finegrained step. + continue; + }; // Raft peer role has to match the replica read flag. if replica_read || info.role == StateRole::Leader { let ekey = get_min_end_key(end_key.as_ref(), region); @@ -852,7 +862,7 @@ impl Progress { } else { self.try_next(); } - branges + Some(branges) } } @@ -928,7 +938,7 @@ impl Endpoint { let sst_max_size = self.config_manager.0.read().unwrap().sst_max_size.0; let limit = self.softlimit.limit(); let resource_limiter = self.resource_ctl.as_ref().and_then(|r| { - r.get_resource_limiter(&request.resource_group_name, &request.source_tag) + r.get_background_resource_limiter(&request.resource_group_name, &request.source_tag) }); self.pool.borrow_mut().spawn(async move { @@ -958,11 +968,10 @@ impl Endpoint { // (See https://tokio.rs/tokio/tutorial/shared-state) // Use &mut and mark the type for making rust-analyzer happy. let progress: &mut Progress<_> = &mut prs.lock().unwrap(); - let batch = progress.forward(batch_size, request.replica_read); - if batch.is_empty() { - return; + match progress.forward(batch_size, request.replica_read) { + Some(batch) => (batch, progress.codec.is_raw_kv, progress.cf), + None => return, } - (batch, progress.codec.is_raw_kv, progress.cf) }; for brange in batch { @@ -1155,6 +1164,13 @@ impl Endpoint { )); } } + + /// Get the internal handle of the io thread pool used by the backup + /// endpoint. 
This is mainly shared for disk snapshot backup (so they + /// don't need to spawn on the gRPC pool.) + pub fn io_pool_handle(&self) -> &Handle { + self.io_pool.handle() + } } impl Runnable for Endpoint { @@ -1547,7 +1563,7 @@ pub mod tests { let mut ranges = Vec::with_capacity(expect.len()); while ranges.len() != expect.len() { let n = (rand::random::() % 3) + 1; - let mut r = prs.forward(n, false); + let mut r = prs.forward(n, false).unwrap(); // The returned backup ranges should <= n assert!(r.len() <= n); @@ -1792,23 +1808,18 @@ pub mod tests { ); let mut ranges = Vec::with_capacity(expect.len()); - while ranges.len() != expect.len() { + loop { let n = (rand::random::() % 3) + 1; - let mut r = prs.forward(n, false); + let mut r = match prs.forward(n, false) { + None => break, + Some(r) => r, + }; // The returned backup ranges should <= n assert!(r.len() <= n); - if r.is_empty() { - // if return a empty vec then the progress is finished - assert_eq!( - ranges.len(), - expect.len(), - "got {:?}, expect {:?}", - ranges, - expect - ); + if !r.is_empty() { + ranges.append(&mut r); } - ranges.append(&mut r); } for (a, b) in ranges.into_iter().zip(expect) { @@ -1949,6 +1960,74 @@ pub mod tests { } } + fn fake_empty_marker() -> Vec { + vec![super::BackupRange { + start_key: None, + end_key: None, + region: Region::new(), + peer: Peer::new(), + codec: KeyValueCodec::new(false, ApiVersion::V1, ApiVersion::V1), + cf: "", + uses_replica_read: false, + }] + } + + #[test] + fn test_seek_ranges_2() { + let (_tmp, endpoint) = new_endpoint(); + + endpoint.region_info.set_regions(vec![ + (b"2".to_vec(), b"4".to_vec(), 1), + (b"6".to_vec(), b"8".to_vec(), 2), + ]); + let sub_ranges: Vec<(&[u8], &[u8])> = vec![(b"1", b"11"), (b"3", b"7"), (b"8", b"9")]; + let expect: Vec<(&[u8], &[u8])> = vec![(b"", b""), (b"3", b"4"), (b"6", b"7"), (b"", b"")]; + + let mut ranges = Vec::with_capacity(sub_ranges.len()); + for &(start_key, end_key) in &sub_ranges { + let start_key = (!start_key.is_empty()).then_some(Key::from_raw(start_key)); + let end_key = (!end_key.is_empty()).then_some(Key::from_raw(end_key)); + ranges.push((start_key, end_key)); + } + let mut prs = Progress::new_with_ranges( + endpoint.store_id, + ranges, + endpoint.region_info, + KeyValueCodec::new(false, ApiVersion::V1, ApiVersion::V1), + engine_traits::CF_DEFAULT, + ); + + let mut ranges = Vec::with_capacity(expect.len()); + loop { + let n = (rand::random::() % 2) + 1; + let mut r = match prs.forward(n, false) { + None => break, + Some(r) => r, + }; + // The returned backup ranges should <= n + assert!(r.len() <= n); + + if !r.is_empty() { + ranges.append(&mut r); + } else { + // append the empty marker + ranges.append(&mut fake_empty_marker()); + } + } + + assert!(ranges.len() == expect.len()); + for (a, b) in ranges.into_iter().zip(expect) { + assert_eq!( + a.start_key.map_or_else(Vec::new, |k| k.into_raw().unwrap()), + b.0 + ); + assert_eq!( + a.end_key.map_or_else(Vec::new, |k| k.into_raw().unwrap()), + b.1 + ); + } + } + #[test] fn test_handle_backup_task() { let limiter = Arc::new(IoRateLimiter::new_for_test()); diff --git a/components/backup/src/lib.rs b/components/backup/src/lib.rs index bf333424603..30345665369 100644 --- a/components/backup/src/lib.rs +++ b/components/backup/src/lib.rs @@ -5,6 +5,7 @@ #[allow(unused_extern_crates)] extern crate tikv_alloc; +pub mod disk_snap; mod endpoint; mod errors; mod metrics; diff --git a/components/backup/src/service.rs b/components/backup/src/service.rs index 237234c061e..bb419e9702a 100644 --- 
a/components/backup/src/service.rs +++ b/components/backup/src/service.rs @@ -1,88 +1,85 @@ // Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. -use std::sync::atomic::*; +use std::sync::{atomic::*, Arc, Mutex}; -use engine_traits::{KvEngine, RaftEngine}; use futures::{channel::mpsc, FutureExt, SinkExt, StreamExt, TryFutureExt}; +use futures_util::stream::AbortHandle; use grpcio::{self, *}; use kvproto::brpb::*; -use raftstore::store::{ - fsm::store::RaftRouter, - msg::{PeerMsg, SignificantMsg}, -}; -use tikv_util::{error, info, worker::*}; +use raftstore::store::snapshot_backup::SnapshotBrHandle; +use tikv_util::{error, info, warn, worker::*}; use super::Task; +use crate::disk_snap::{self, StreamHandleLoop}; /// Service handles the RPC messages for the `Backup` service. #[derive(Clone)] -pub struct Service { +pub struct Service { scheduler: Scheduler, - router: Option>, + snap_br_env: disk_snap::Env, + abort_last_req: Arc>>, } -impl Service +impl Service where - EK: KvEngine, - ER: RaftEngine, + H: SnapshotBrHandle, { - // Create a new backup service without router, this used for raftstore v2. - // because we don't have RaftStoreRouter any more. - pub fn new(scheduler: Scheduler) -> Self { + /// Create a new backup service. + pub fn new(scheduler: Scheduler, env: disk_snap::Env) -> Self { Service { scheduler, - router: None, - } - } - - // Create a new backup service with router, this used for raftstore v1. - pub fn with_router(scheduler: Scheduler, router: RaftRouter) -> Self { - Service { - scheduler, - router: Some(router), + snap_br_env: env, + abort_last_req: Arc::default(), } } } -impl Backup for Service +impl Backup for Service where - EK: KvEngine, - ER: RaftEngine, + H: SnapshotBrHandle + 'static, { + /// Check a region whether there is pending admin requests(including pending + /// merging). + /// + /// In older versions of disk snapshot backup, this will be called after we + /// paused all scheduler. + /// + /// This is kept for compatibility with previous versions. fn check_pending_admin_op( &mut self, ctx: RpcContext<'_>, _req: CheckAdminRequest, mut sink: ServerStreamingSink, ) { - let (tx, rx) = mpsc::unbounded(); - match &self.router { - Some(router) => { - router.broadcast_normal(|| { - PeerMsg::SignificantMsg(SignificantMsg::CheckPendingAdmin(tx.clone())) - }); - let send_task = async move { - let mut s = rx.map(|resp| Ok((resp, WriteFlags::default()))); - sink.send_all(&mut s).await?; - sink.close().await?; - Ok(()) - } - .map(|res: Result<()>| match res { - Ok(_) => { - info!("check admin closed"); - } - Err(e) => { - error!("check admin canceled"; "error" => ?e); - } - }); - ctx.spawn(send_task); + let handle = self.snap_br_env.handle.clone(); + let tokio_handle = self.snap_br_env.get_async_runtime().clone(); + let peer = ctx.peer(); + let task = async move { + let (tx, rx) = mpsc::unbounded(); + if let Err(err) = handle.broadcast_check_pending_admin(tx) { + return sink + .fail(RpcStatus::with_message( + RpcStatusCode::INTERNAL, + format!("{err}"), + )) + .await; } - None => { - // check pending admin reqeust is used for EBS Backup. - // for raftstore v2. we don't need it for now. 
so just return unimplemented - unimplemented_call!(ctx, sink) + sink.send_all(&mut rx.map(|resp| Ok((resp, WriteFlags::default())))) + .await?; + sink.close().await?; + Ok(()) + }; + + tokio_handle.spawn(async move { + match task.await { + Err(err) => { + warn!("check admin canceled"; "peer" => %peer, "err" => %err); + } + Ok(()) => { + info!("check admin closed"; "peer" => %peer); + } } - } + }); } fn backup( @@ -137,25 +134,90 @@ where ctx.spawn(send_task); } + + /// The new method for preparing a disk snapshot backup. + /// Generally there are some steps for the client to do: + /// 1. Establish a `prepare_snapshot_backup` connection. + /// 2. Send an initial `UpdateLease`. And we should update the lease + /// periodically. + /// 3. Send `WaitApply` to each leader peer in this store. + /// 4. Once `WaitApply` has finished for all regions, we can take the disk + /// snapshot. + /// 5. Once all snapshots have been taken, send `Finalize` to stop. + fn prepare_snapshot_backup( + &mut self, + ctx: grpcio::RpcContext<'_>, + stream: grpcio::RequestStream<PrepareSnapshotBackupRequest>, + sink: grpcio::DuplexSink<PrepareSnapshotBackupResponse>, + ) { + let (l, new_cancel) = StreamHandleLoop::new(self.snap_br_env.clone()); + let peer = ctx.peer(); + // Note: should we disconnect here once there is more than one stream...? + // Generally, once two streams enter here, one of them may exit. + info!("A new prepare snapshot backup stream created!"; + "peer" => %peer, + "stream_count" => %self.snap_br_env.active_stream(), + ); + let abort_last_req = self.abort_last_req.clone(); + self.snap_br_env.get_async_runtime().spawn(async move { + { + let mut lock = abort_last_req.lock().unwrap(); + if let Some(cancel) = &*lock { + cancel.abort(); + } + *lock = Some(new_cancel); + } + let res = l.run(stream, sink.into()).await; + info!("stream closed; either everything is done or an unretryable problem happened"; + "result" => ?res, "peer" => %peer); + }); + } } #[cfg(test)] mod tests { use std::{sync::Arc, time::Duration}; - use engine_rocks::RocksEngine; use external_storage_export::make_local_backend; use tikv::storage::txn::tests::{must_commit, must_prewrite_put}; use tikv_util::worker::{dummy_scheduler, ReceiverWrapper}; use txn_types::TimeStamp; use super::*; - use crate::endpoint::tests::*; + use crate::{disk_snap::Env, endpoint::tests::*}; + + #[derive(Clone)] + struct PanicHandle; + + impl SnapshotBrHandle for PanicHandle { + fn send_wait_apply( + &self, + _region: u64, + _req: raftstore::store::snapshot_backup::SnapshotBrWaitApplyRequest, + ) -> raftstore::Result<()> { + panic!("this case shouldn't call this!") + } + + fn broadcast_wait_apply( + &self, + _req: raftstore::store::snapshot_backup::SnapshotBrWaitApplyRequest, + ) -> raftstore::Result<()> { + panic!("this case shouldn't call this!") + } + + fn broadcast_check_pending_admin( + &self, + _tx: mpsc::UnboundedSender<CheckAdminResponse>, + ) -> raftstore::Result<()> { + panic!("this case shouldn't call this!") + } + } fn new_rpc_suite() -> (Server, BackupClient, ReceiverWrapper<Task>) { let env = Arc::new(EnvBuilder::new().build()); let (scheduler, rx) = dummy_scheduler(); - let backup_service = super::Service::::new(scheduler); + let backup_service = + super::Service::new(scheduler, Env::new(PanicHandle, Default::default(), None)); let builder = ServerBuilder::new(env.clone()).register_service(create_backup(backup_service)); let mut server = builder.bind("127.0.0.1", 0).build().unwrap(); diff --git a/components/batch-system/Cargo.toml b/components/batch-system/Cargo.toml index ac69d544a21..bd1ae6c56b4 100644 ---
a/components/batch-system/Cargo.toml +++ b/components/batch-system/Cargo.toml @@ -10,6 +10,7 @@ test-runner = ["derive_more"] [dependencies] collections = { workspace = true } crossbeam = "0.8" +dashmap = "5.2" derive_more = { version = "0.99", optional = true } fail = "0.5" file_system = { workspace = true } diff --git a/components/batch-system/src/OWNERS b/components/batch-system/src/OWNERS new file mode 100644 index 00000000000..70df3de925d --- /dev/null +++ b/components/batch-system/src/OWNERS @@ -0,0 +1,7 @@ +# See the OWNERS docs at https://go.k8s.io/owners +options: + no_parent_owners: true +filters: + "(OWNERS|config\\.rs)$": + approvers: + - sig-critical-approvers-config-components diff --git a/components/batch-system/src/router.rs b/components/batch-system/src/router.rs index 119b7875506..4f886fe3b3d 100644 --- a/components/batch-system/src/router.rs +++ b/components/batch-system/src/router.rs @@ -1,21 +1,17 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. // #[PerformanceCriticalPath] -use std::{ - cell::Cell, - mem, - sync::{ - atomic::{AtomicBool, AtomicUsize, Ordering}, - Arc, Mutex, - }, +use std::sync::{ + atomic::{AtomicBool, AtomicUsize, Ordering}, + Arc, }; -use collections::HashMap; use crossbeam::channel::{SendError, TrySendError}; -use tikv_util::{debug, info, lru::LruCache, time::Instant, Either}; +use dashmap::DashMap; +use tikv_util::{debug, info, time::Instant, Either}; use crate::{ - fsm::{Fsm, FsmScheduler, FsmState}, + fsm::{Fsm, FsmScheduler}, mailbox::{BasicMailbox, Mailbox}, metrics::*, }; @@ -27,18 +23,14 @@ pub struct RouterTrace { pub leak: usize, } -struct NormalMailMap<N: Fsm> { - map: HashMap<u64, BasicMailbox<N>>, - // Count of Mailboxes that is stored in `map`. - alive_cnt: Arc<AtomicUsize>, -} - enum CheckDoResult<T> { NotExist, Invalid, Valid(T), } +const ROUTER_SHRINK_SIZE: usize = 1000; + /// Router routes messages to its target FSM's mailbox. /// /// In our abstract model, every batch system has two different kind of @@ -54,8 +46,7 @@ enum CheckDoResult { /// Normal FSM and control FSM can have different scheduler, but this is not /// required. pub struct Router<N: Fsm, C: Fsm, Ns, Cs> { - normals: Arc<Mutex<NormalMailMap<N>>>, - caches: Cell<LruCache<u64, BasicMailbox<N>>>, + normals: Arc<DashMap<u64, BasicMailbox<N>>>, pub(super) control_box: BasicMailbox<C>, // TODO: These two schedulers should be unified as single one. However // it's not possible to write FsmScheduler + FsmScheduler @@ -85,11 +76,7 @@ where state_cnt: Arc<AtomicUsize>, ) -> Router<N, C, Ns, Cs> { Router { - normals: Arc::new(Mutex::new(NormalMailMap { - map: HashMap::default(), - alive_cnt: Arc::default(), - })), - caches: Cell::new(LruCache::with_capacity_and_sample(1024, 7)), + normals: Arc::new(DashMap::default()), control_box, normal_scheduler, control_scheduler, @@ -106,72 +93,32 @@ where /// A helper function that tries to unify a common access pattern to /// mailbox. /// - /// Generally, when sending a message to a mailbox, cache should be - /// check first, if not found, lock should be acquired. - /// /// Returning None means there is no mailbox inside the normal registry. /// Some(None) means there is an expected mailbox inside the normal registry /// but it returns None after applying the given function. Some(Some) means - /// the given function returns Some and cache is updated if it's invalid. + /// the given function returns Some.
#[inline] fn check_do<F, R>(&self, addr: u64, mut f: F) -> CheckDoResult<R> where F: FnMut(&BasicMailbox<N>) -> Option<R>, { - let caches = unsafe { &mut *self.caches.as_ptr() }; - let mut connected = true; - if let Some(mailbox) = caches.get(&addr) { - match f(mailbox) { - Some(r) => return CheckDoResult::Valid(r), - None => { - connected = false; - } - } - } - - let (cnt, mailbox) = { - let mut boxes = self.normals.lock().unwrap(); - let cnt = boxes.map.len(); - let b = match boxes.map.get_mut(&addr) { - Some(mailbox) => mailbox.clone(), - None => { - drop(boxes); - if !connected { - caches.remove(&addr); - } - return CheckDoResult::NotExist; - } - }; - (cnt, b) - }; - if cnt > caches.capacity() || cnt < caches.capacity() / 2 { - caches.resize(cnt); - } - - let res = f(&mailbox); - match res { - Some(r) => { - caches.insert(addr, mailbox); - CheckDoResult::Valid(r) - } + let mailbox = match self.normals.get_mut(&addr) { + Some(mailbox) => mailbox, None => { - if !connected { - caches.remove(&addr); - } - CheckDoResult::Invalid + return CheckDoResult::NotExist; } + }; + match f(&mailbox) { + Some(r) => CheckDoResult::Valid(r), + None => CheckDoResult::Invalid, } } /// Register a mailbox with given address. pub fn register(&self, addr: u64, mailbox: BasicMailbox<N>) { - let mut normals = self.normals.lock().unwrap(); - if let Some(mailbox) = normals.map.insert(addr, mailbox) { + if let Some(mailbox) = self.normals.insert(addr, mailbox) { mailbox.close(); } - normals - .alive_cnt - .store(normals.map.len(), Ordering::Relaxed); } /// Same as sending a message and then registering the mailbox. @@ -183,32 +130,22 @@ where mailbox: BasicMailbox<N>, msg: N::Message, ) -> Result<(), (BasicMailbox<N>, N::Message)> { - let mut normals = self.normals.lock().unwrap(); - // Send has to be done within lock, otherwise the message may be handled - // before the mailbox is register. + if let Some(mailbox) = self.normals.insert(addr, mailbox.clone()) { + mailbox.close(); + } if let Err(SendError(m)) = mailbox.force_send(msg, &self.normal_scheduler) { + self.normals.remove(&addr); return Err((mailbox, m)); } - if let Some(mailbox) = normals.map.insert(addr, mailbox) { - mailbox.close(); - } - normals - .alive_cnt - .store(normals.map.len(), Ordering::Relaxed); Ok(()) } pub fn register_all(&self, mailboxes: Vec<(u64, BasicMailbox<N>)>) { - let mut normals = self.normals.lock().unwrap(); - normals.map.reserve(mailboxes.len()); for (addr, mailbox) in mailboxes { - if let Some(m) = normals.map.insert(addr, mailbox) { + if let Some(m) = self.normals.insert(addr, mailbox) { m.close(); } } - normals - .alive_cnt - .store(normals.map.len(), Ordering::Relaxed); } /// Get the mailbox of specified address. @@ -280,13 +217,11 @@ where pub fn force_send(&self, addr: u64, msg: N::Message) -> Result<(), SendError<N::Message>> { match self.send(addr, msg) { Ok(()) => Ok(()), - Err(TrySendError::Full(m)) => { - let caches = unsafe { &mut *self.caches.as_ptr() }; - caches - .get(&addr) - .unwrap() - .force_send(m, &self.normal_scheduler) - } + Err(TrySendError::Full(m)) => self + .normals + .get(&addr) + .unwrap() + .force_send(m, &self.normal_scheduler), Err(TrySendError::Disconnected(m)) => { if self.is_shutdown() { Ok(()) @@ -321,10 +256,9 @@ where /// Try to notify all normal FSMs of a message.
pub fn broadcast_normal(&self, mut msg_gen: impl FnMut() -> N::Message) { let timer = Instant::now_coarse(); - let mailboxes = self.normals.lock().unwrap(); - for mailbox in mailboxes.map.values() { + self.normals.iter().for_each(|mailbox| { let _ = mailbox.force_send(msg_gen(), &self.normal_scheduler); - } + }); BROADCAST_NORMAL_DURATION.observe(timer.saturating_elapsed_secs()); } @@ -332,12 +266,13 @@ where pub fn broadcast_shutdown(&self) { info!("broadcasting shutdown"); self.shutdown.store(true, Ordering::SeqCst); - unsafe { &mut *self.caches.as_ptr() }.clear(); - let mut mailboxes = self.normals.lock().unwrap(); - for (addr, mailbox) in mailboxes.map.drain() { + for e in self.normals.iter() { + let addr = e.key(); + let mailbox = e.value(); debug!("[region {}] shutdown mailbox", addr); mailbox.close(); } + self.normals.clear(); self.control_box.close(); self.normal_scheduler.shutdown(); self.control_scheduler.shutdown(); @@ -346,51 +281,32 @@ where /// Close the mailbox of address. pub fn close(&self, addr: u64) { info!("shutdown mailbox"; "region_id" => addr); - unsafe { &mut *self.caches.as_ptr() }.remove(&addr); - let mut mailboxes = self.normals.lock().unwrap(); - if let Some(mb) = mailboxes.map.remove(&addr) { + if let Some((_, mb)) = self.normals.remove(&addr) { mb.close(); } - mailboxes - .alive_cnt - .store(mailboxes.map.len(), Ordering::Relaxed); - } - - pub fn clear_cache(&self) { - unsafe { &mut *self.caches.as_ptr() }.clear(); + if self.normals.capacity() - self.normals.len() > ROUTER_SHRINK_SIZE { + self.normals.shrink_to_fit(); + } } pub fn state_cnt(&self) -> &Arc { &self.state_cnt } - pub fn alive_cnt(&self) -> Arc { - self.normals.lock().unwrap().alive_cnt.clone() + pub fn alive_cnt(&self) -> usize { + self.normals.len() } pub fn trace(&self) -> RouterTrace { - let alive = self.normals.lock().unwrap().alive_cnt.clone(); + let alive = self.alive_cnt(); let total = self.state_cnt.load(Ordering::Relaxed); - let alive = alive.load(Ordering::Relaxed); // 1 represents the control fsm. let leak = if total > alive + 1 { total - alive - 1 } else { 0 }; - let mailbox_unit = mem::size_of::<(u64, BasicMailbox)>(); - let state_unit = mem::size_of::>(); - // Every message in crossbeam sender needs 8 bytes to store state. - let message_unit = mem::size_of::() + 8; - // crossbeam unbounded channel sender has a list of blocks. Every block has 31 - // unit and every sender has at least one sender. - let sender_block_unit = 31; - RouterTrace { - alive: (mailbox_unit * 8 / 7 // hashmap uses 7/8 of allocated memory. - + state_unit + message_unit * sender_block_unit) - * alive, - leak: (state_unit + message_unit * sender_block_unit) * leak, - } + RouterTrace { alive, leak } } } @@ -398,7 +314,6 @@ impl Clone for Router { fn clone(&self) -> Router { Router { normals: self.normals.clone(), - caches: Cell::new(LruCache::with_capacity_and_sample(1024, 7)), control_box: self.control_box.clone(), // These two schedulers should be unified as single one. However // it's not possible to write FsmScheduler + FsmScheduler diff --git a/components/batch-system/tests/cases/router.rs b/components/batch-system/tests/cases/router.rs index d746dfad5cb..66d0770d544 100644 --- a/components/batch-system/tests/cases/router.rs +++ b/components/batch-system/tests/cases/router.rs @@ -143,25 +143,19 @@ fn test_router_trace() { router.close(addr); }; - let router_clone = router.clone(); + let mut mailboxes = vec![]; for i in 0..10 { register_runner(i); - // Read mailbox to cache. 
- router_clone.mailbox(i).unwrap(); + mailboxes.push(router.mailbox(i).unwrap()); } - assert_eq!(router.alive_cnt().load(Ordering::Relaxed), 10); + assert_eq!(router.alive_cnt(), 10); assert_eq!(router.state_cnt().load(Ordering::Relaxed), 11); - // Routers closed but exist in the cache. for i in 0..10 { close_runner(i); } - assert_eq!(router.alive_cnt().load(Ordering::Relaxed), 0); + assert_eq!(router.alive_cnt(), 0); assert_eq!(router.state_cnt().load(Ordering::Relaxed), 11); - for i in 0..1024 { - register_runner(i); - // Read mailbox to cache, closed routers should be evicted. - router_clone.mailbox(i).unwrap(); - } - assert_eq!(router.alive_cnt().load(Ordering::Relaxed), 1024); - assert_eq!(router.state_cnt().load(Ordering::Relaxed), 1025); + drop(mailboxes); + assert_eq!(router.alive_cnt(), 0); + assert_eq!(router.state_cnt().load(Ordering::Relaxed), 1); } diff --git a/components/cdc/src/OWNERS b/components/cdc/src/OWNERS new file mode 100644 index 00000000000..774ac39e9f3 --- /dev/null +++ b/components/cdc/src/OWNERS @@ -0,0 +1,7 @@ +# See the OWNERS docs at https://go.k8s.io/owners +options: + no_parent_owners: true +filters: + "(OWNERS|config\\.rs)$": + approvers: + - sig-critical-approvers-config-cdc diff --git a/components/cdc/src/channel.rs b/components/cdc/src/channel.rs index b11799d87c1..ad9ebdd0d44 100644 --- a/components/cdc/src/channel.rs +++ b/components/cdc/src/channel.rs @@ -3,7 +3,7 @@ use std::{ fmt, sync::{ - atomic::{AtomicUsize, Ordering}, + atomic::{AtomicBool, Ordering}, Arc, }, time::Duration, @@ -20,9 +20,15 @@ use futures::{ use grpcio::WriteFlags; use kvproto::cdcpb::{ChangeDataEvent, Event, ResolvedTs}; use protobuf::Message; -use tikv_util::{future::block_on_timeout, impl_display_as_debug, time::Instant, warn}; +use tikv_util::{ + future::block_on_timeout, + impl_display_as_debug, info, + memory::{MemoryQuota, MemoryQuotaExceeded}, + time::Instant, + warn, +}; -use crate::metrics::*; +use crate::{metrics::*, service::ConnId}; /// The maximum bytes of events can be batched into one `CdcEvent::Event`, 32KB. 
pub const CDC_EVENT_MAX_BYTES: usize = 32 * 1024; @@ -57,6 +63,9 @@ pub enum CdcEvent { impl CdcEvent { pub fn size(&self) -> u32 { + fail::fail_point!("cdc_event_size", |size| size + .map(|s| s.parse::<u32>().unwrap()) + .unwrap_or(0)); match self { CdcEvent::ResolvedTs(ref r) => { // For region id, it is unlikely to exceed 100,000,000 which is @@ -185,71 +194,7 @@ impl EventBatcher { } } -#[derive(Clone)] -pub struct MemoryQuota { - capacity: Arc<AtomicUsize>, - in_use: Arc<AtomicUsize>, -} - -impl MemoryQuota { - pub fn new(capacity: usize) -> MemoryQuota { - MemoryQuota { - capacity: Arc::new(AtomicUsize::new(capacity)), - in_use: Arc::new(AtomicUsize::new(0)), - } - } - - pub fn in_use(&self) -> usize { - self.in_use.load(Ordering::Relaxed) - } - - pub(crate) fn capacity(&self) -> usize { - self.capacity.load(Ordering::Acquire) - } - - pub(crate) fn set_capacity(&self, capacity: usize) { - self.capacity.store(capacity, Ordering::Release) - } - - fn alloc(&self, bytes: usize) -> bool { - let mut in_use_bytes = self.in_use.load(Ordering::Relaxed); - let capacity = self.capacity.load(Ordering::Acquire); - loop { - if in_use_bytes + bytes > capacity { - return false; - } - let new_in_use_bytes = in_use_bytes + bytes; - match self.in_use.compare_exchange_weak( - in_use_bytes, - new_in_use_bytes, - Ordering::Acquire, - Ordering::Relaxed, - ) { - Ok(_) => return true, - Err(current) => in_use_bytes = current, - } - } - } - - fn free(&self, bytes: usize) { - let mut in_use_bytes = self.in_use.load(Ordering::Relaxed); - loop { - // Saturating at the numeric bounds instead of overflowing. - let new_in_use_bytes = in_use_bytes - std::cmp::min(bytes, in_use_bytes); - match self.in_use.compare_exchange_weak( - in_use_bytes, - new_in_use_bytes, - Ordering::Acquire, - Ordering::Relaxed, - ) { - Ok(_) => return, - Err(current) => in_use_bytes = current, - } - } - } -} - -pub fn channel(buffer: usize, memory_quota: MemoryQuota) -> (Sink, Drain) { +pub fn channel(conn_id: ConnId, buffer: usize, memory_quota: Arc<MemoryQuota>) -> (Sink, Drain) { let (unbounded_sender, unbounded_receiver) = unbounded(); let (bounded_sender, bounded_receiver) = bounded(buffer); ( @@ -262,6 +207,7 @@ pub fn channel(buffer: usize, memory_quota: MemoryQuota) -> (Sink, Drain) { unbounded_receiver, bounded_receiver, memory_quota, + conn_id, }, ) } @@ -297,24 +243,73 @@ macro_rules! impl_from_future_send_error { impl_from_future_send_error! { FuturesSendError, - TrySendError<(CdcEvent, usize)>, + TrySendError<ObservedEvent>, + TrySendError<ScanedEvent>, +} + +impl From<MemoryQuotaExceeded> for SendError { + fn from(_: MemoryQuotaExceeded) -> Self { + SendError::Congested + } +} + +pub struct ObservedEvent { + pub created: Instant, + pub event: CdcEvent, + pub size: usize, +} + +pub struct ScanedEvent { + pub created: Instant, + pub event: CdcEvent, + pub size: usize, + // Incremental scan can be canceled by region errors. We must check it when draining + // an event instead of emitting it to `Sink`.
+ pub truncated: Arc<AtomicBool>, +} + +impl ObservedEvent { + fn new(created: Instant, event: CdcEvent, size: usize) -> Self { + ObservedEvent { + created, + event, + size, + } + } +} + +impl ScanedEvent { + fn new(created: Instant, event: CdcEvent, size: usize, truncated: Arc<AtomicBool>) -> Self { + ScanedEvent { + created, + event, + size, + truncated, + } + } } #[derive(Clone)] pub struct Sink { - unbounded_sender: UnboundedSender<(CdcEvent, usize)>, - bounded_sender: Sender<(CdcEvent, usize)>, - memory_quota: MemoryQuota, + unbounded_sender: UnboundedSender<ObservedEvent>, + bounded_sender: Sender<ScanedEvent>, + memory_quota: Arc<MemoryQuota>, } impl Sink { - pub fn unbounded_send(&self, event: CdcEvent, force: bool) -> Result<(), SendError> { + /// Only observed events can be sent by `unbounded_send`. + pub fn unbounded_send(&self, observed_event: CdcEvent, force: bool) -> Result<(), SendError> { // Try its best to send error events. - let bytes = if !force { event.size() as usize } else { 0 }; - if bytes != 0 && !self.memory_quota.alloc(bytes) { - return Err(SendError::Congested); + let bytes = if !force { + observed_event.size() as usize + } else { + 0 + }; + if bytes != 0 { + self.memory_quota.alloc(bytes)?; } - match self.unbounded_sender.unbounded_send((event, bytes)) { + let ob_event = ObservedEvent::new(Instant::now_coarse(), observed_event, bytes); + match self.unbounded_sender.unbounded_send(ob_event) { Ok(_) => Ok(()), Err(e) => { // Free quota if send fails. @@ -324,19 +319,25 @@ impl Sink { } } - pub async fn send_all(&mut self, events: Vec<CdcEvent>) -> Result<(), SendError> { + /// Only scanned events can be sent by `send_all`. + pub async fn send_all( + &mut self, + scaned_events: Vec<CdcEvent>, + truncated: Arc<AtomicBool>, + ) -> Result<(), SendError> { // Allocate quota in advance. let mut total_bytes = 0; - for event in &events { + for event in &scaned_events { let bytes = event.size(); total_bytes += bytes; } - if !self.memory_quota.alloc(total_bytes as _) { - return Err(SendError::Congested); - } - for event in events { + self.memory_quota.alloc(total_bytes as _)?; + + let now = Instant::now_coarse(); + for event in scaned_events { let bytes = event.size() as usize; - if let Err(e) = self.bounded_sender.feed((event, bytes)).await { + let sc_event = ScanedEvent::new(now, event, bytes, truncated.clone()); + if let Err(e) = self.bounded_sender.feed(sc_event).await { // Free quota if send fails. self.memory_quota.free(total_bytes as _); return Err(SendError::from(e)); @@ -352,24 +353,33 @@ } pub struct Drain { - unbounded_receiver: UnboundedReceiver<(CdcEvent, usize)>, - bounded_receiver: Receiver<(CdcEvent, usize)>, - memory_quota: MemoryQuota, + unbounded_receiver: UnboundedReceiver<ObservedEvent>, + bounded_receiver: Receiver<ScanedEvent>, + memory_quota: Arc<MemoryQuota>, + conn_id: ConnId, } impl<'a> Drain { pub fn drain(&'a mut self) -> impl Stream<Item = (CdcEvent, usize)> + 'a { - stream::select(&mut self.bounded_receiver, &mut self.unbounded_receiver).map( - |(mut event, size)| { - if let CdcEvent::Barrier(ref mut barrier) = event { - if let Some(barrier) = barrier.take() { - // Unset barrier when it is received.
- barrier(()); - } + let observed = (&mut self.unbounded_receiver).map(|x| (x.created, x.event, x.size)); + let scaned = (&mut self.bounded_receiver).filter_map(|x| { + if x.truncated.load(Ordering::Acquire) { + self.memory_quota.free(x.size as _); + return futures::future::ready(None); + } + futures::future::ready(Some((x.created, x.event, x.size))) + }); + + stream::select(scaned, observed).map(|(start, mut event, size)| { + CDC_EVENTS_PENDING_DURATION.observe(start.saturating_elapsed_secs() * 1000.0); + if let CdcEvent::Barrier(ref mut barrier) = event { + if let Some(barrier) = barrier.take() { + // Unset barrier when it is received. + barrier(()); } - (event, size) - }, - ) + } + (event, size) + }) } // Forwards contents to the sink, simulates StreamExt::forward. @@ -413,14 +423,17 @@ impl Drop for Drain { self.bounded_receiver.close(); self.unbounded_receiver.close(); let start = Instant::now(); - let mut drain = Box::pin(async { + let mut total_bytes = 0; + let mut drain = Box::pin(async move { + let conn_id = self.conn_id; let memory_quota = self.memory_quota.clone(); - let mut total_bytes = 0; let mut drain = self.drain(); while let Some((_, bytes)) = drain.next().await { total_bytes += bytes; } memory_quota.free(total_bytes); + info!("drop Drain finished, free memory"; "conn_id" => ?conn_id, + "freed_bytes" => total_bytes, "inuse_bytes" => memory_quota.in_use()); }); block_on(&mut drain); let takes = start.saturating_elapsed(); @@ -440,7 +453,11 @@ where #[cfg(test)] mod tests { - use std::{assert_matches::assert_matches, sync::mpsc, time::Duration}; + use std::{ + assert_matches::assert_matches, + sync::{mpsc, Arc}, + time::Duration, + }; use futures::executor::block_on; use kvproto::cdcpb::{ @@ -451,20 +468,52 @@ mod tests { type Send = Box Result<(), SendError>>; fn new_test_channel(buffer: usize, capacity: usize, force_send: bool) -> (Send, Drain) { - let memory_quota = MemoryQuota::new(capacity); - let (mut tx, rx) = channel(buffer, memory_quota); + let memory_quota = Arc::new(MemoryQuota::new(capacity)); + let (mut tx, rx) = channel(ConnId::default(), buffer, memory_quota); let mut flag = true; let send = move |event| { flag = !flag; if flag { tx.unbounded_send(event, force_send) } else { - block_on(tx.send_all(vec![event])) + block_on(tx.send_all(vec![event], Arc::new(Default::default()))) } }; (Box::new(send), rx) } + #[test] + fn test_scanned_event() { + let mut e = Event::default(); + e.region_id = 233; + { + let memory_quota = Arc::new(MemoryQuota::new(1024)); + let (mut tx, mut rx) = channel(ConnId::default(), 10, memory_quota); + + let truncated = Arc::new(AtomicBool::new(false)); + let event = CdcEvent::Event(e.clone()); + let size = event.size() as usize; + let _ = block_on(tx.send_all(vec![event], truncated)); + + let memory_quota = rx.memory_quota.clone(); + let mut drain = rx.drain(); + assert_matches!(block_on(drain.next()), Some((CdcEvent::Event(_), _))); + assert_eq!(memory_quota.in_use(), size); + } + { + let memory_quota = Arc::new(MemoryQuota::new(1024)); + let (mut tx, mut rx) = channel(ConnId::default(), 10, memory_quota); + + let truncated = Arc::new(AtomicBool::new(true)); + let _ = block_on(tx.send_all(vec![CdcEvent::Event(e)], truncated)); + + let memory_quota = rx.memory_quota.clone(); + let mut drain = rx.drain(); + recv_timeout(&mut drain, Duration::from_millis(100)).unwrap_err(); + assert_eq!(memory_quota.in_use(), 0); + } + } + #[test] fn test_barrier() { let force_send = false; @@ -599,8 +648,8 @@ mod tests { // 1KB let max_pending_bytes = 
1024; let buffer = max_pending_bytes / event.size(); - let memory_quota = MemoryQuota::new(max_pending_bytes as _); - let (tx, _rx) = channel(buffer as _, memory_quota); + let memory_quota = Arc::new(MemoryQuota::new(max_pending_bytes as _)); + let (tx, _rx) = channel(ConnId::default(), buffer as _, memory_quota); for _ in 0..buffer { tx.unbounded_send(CdcEvent::Event(e.clone()), false) .unwrap(); @@ -636,9 +685,9 @@ mod tests { } } let memory_quota = rx.memory_quota.clone(); - assert_eq!(memory_quota.alloc(event.size() as _), false,); + memory_quota.alloc(event.size() as _).unwrap_err(); drop(rx); - assert_eq!(memory_quota.alloc(1024), true); + memory_quota.alloc(1024).unwrap(); } // Make sure memory quota is freed when tx is dropped before rx. { @@ -653,10 +702,10 @@ mod tests { } } let memory_quota = rx.memory_quota.clone(); - assert_eq!(memory_quota.alloc(event.size() as _), false,); + memory_quota.alloc(event.size() as _).unwrap_err(); drop(send); drop(rx); - assert_eq!(memory_quota.alloc(1024), true); + memory_quota.alloc(1024).unwrap(); } // Make sure sending message to a closed channel does not leak memory quota. { @@ -668,7 +717,7 @@ mod tests { send(CdcEvent::Event(e.clone())).unwrap_err(); } assert_eq!(memory_quota.in_use(), 0); - assert_eq!(memory_quota.alloc(1024), true); + memory_quota.alloc(1024).unwrap(); // Freeing bytes should not cause overflow. memory_quota.free(1024); diff --git a/components/cdc/src/delegate.rs b/components/cdc/src/delegate.rs index 4c8b2226f49..70d0d605c6d 100--- a/components/cdc/src/delegate.rs +++ b/components/cdc/src/delegate.rs @@ -4,7 +4,7 @@ use std::{ mem, string::String, sync::{ - atomic::{AtomicUsize, Ordering}, + atomic::{AtomicBool, AtomicUsize, Ordering}, Arc, }, }; @@ -28,9 +28,13 @@ use raftstore::{ store::util::compare_region_epoch, Error as RaftStoreError, }; -use resolved_ts::Resolver; +use resolved_ts::{Resolver, TsSource, ON_DROP_WARN_HEAP_SIZE}; use tikv::storage::{txn::TxnEntry, Statistics}; -use tikv_util::{debug, info, warn}; +use tikv_util::{ + debug, info, + memory::{HeapSize, MemoryQuota}, + warn, +}; use txn_types::{Key, Lock, LockType, TimeStamp, WriteBatchFlags, WriteRef, WriteType}; use crate::{ @@ -132,6 +136,10 @@ pub struct Downstream { kv_api: ChangeDataRequestKvApi, filter_loop: bool, pub(crate) observed_range: ObservedRange, + + // When meeting region errors like split or merge, we can cancel incremental scan draining + // via `scan_truncated`. + pub(crate) scan_truncated: Arc<AtomicBool>, } impl Downstream { @@ -159,10 +167,14 @@ impl Downstream { kv_api, filter_loop, observed_range, + + scan_truncated: Arc::new(AtomicBool::new(false)), } } - /// Sink events to the downstream. + // NOTE: it's not allowed to sink `EventError` directly by this function, + // because the sink can also be used by an incremental scan. We must ensure + // no more events can be pushed to the sink after an `EventError` is sent. pub fn sink_event(&self, mut event: Event, force: bool) -> Result<()> { event.set_request_id(self.req_id); if self.sink.is_none() { @@ -187,7 +199,14 @@ impl Downstream { } } + /// EventErrors must be sent by this function. And we must ensure no more + /// events or ResolvedTs will be sent to the downstream after + /// `sink_error_event` is called.
pub fn sink_error_event(&self, region_id: u64, err_event: EventError) -> Result<()> { + info!("cdc downstream meets region error"; + "conn_id" => ?self.conn_id, "downstream_id" => ?self.id, "req_id" => self.req_id); + + self.scan_truncated.store(true, Ordering::Release); let mut change_data_event = Event::default(); change_data_event.event = Some(Event_oneof_event::Error(err_event)); change_data_event.region_id = region_id; @@ -196,12 +215,6 @@ impl Downstream { self.sink_event(change_data_event, force_send) } - pub fn sink_region_not_found(&self, region_id: u64) -> Result<()> { - let mut err_event = EventError::default(); - err_event.mut_region_not_found().region_id = region_id; - self.sink_error_event(region_id, err_event) - } - pub fn set_sink(&mut self, sink: Sink) { self.sink = Some(sink); } @@ -226,16 +239,73 @@ impl Downstream { } } -#[derive(Default)] struct Pending { - pub downstreams: Vec<Downstream>, - pub locks: Vec<PendingLock>, - pub pending_bytes: usize, + downstreams: Vec<Downstream>, + locks: Vec<PendingLock>, + pending_bytes: usize, + memory_quota: Arc<MemoryQuota>, } + +impl Pending { + fn new(memory_quota: Arc<MemoryQuota>) -> Pending { + Pending { + downstreams: vec![], + locks: vec![], + pending_bytes: 0, + memory_quota, + } + } + + fn push_pending_lock(&mut self, lock: PendingLock) -> Result<()> { + let bytes = lock.approximate_heap_size(); + self.memory_quota.alloc(bytes)?; + self.locks.push(lock); + self.pending_bytes += bytes; + CDC_PENDING_BYTES_GAUGE.add(bytes as i64); + Ok(()) + } + + fn on_region_ready(&mut self, resolver: &mut Resolver) -> Result<()> { + fail::fail_point!("cdc_pending_on_region_ready", |_| Err( + Error::MemoryQuotaExceeded(tikv_util::memory::MemoryQuotaExceeded) + )); + // Must take locks, otherwise it may double free memory quota on drop. + for lock in mem::take(&mut self.locks) { + self.memory_quota.free(lock.approximate_heap_size()); + match lock { + PendingLock::Track { key, start_ts } => { + resolver.track_lock(start_ts, key, None)?; + } + PendingLock::Untrack { key } => resolver.untrack_lock(&key, None), + } + } + Ok(()) + } } impl Drop for Pending { fn drop(&mut self) { CDC_PENDING_BYTES_GAUGE.sub(self.pending_bytes as i64); + let locks = mem::take(&mut self.locks); + if locks.is_empty() { + return; + } + + // Free memory quota used by pending locks and unlocks. + let mut bytes = 0; + let num_locks = locks.len(); + for lock in locks { + bytes += lock.approximate_heap_size(); + } + if bytes > ON_DROP_WARN_HEAP_SIZE { + warn!("cdc drop huge Pending"; + "bytes" => bytes, + "num_locks" => num_locks, + "memory_quota_in_use" => self.memory_quota.in_use(), + "memory_quota_capacity" => self.memory_quota.capacity(), + ); + } + self.memory_quota.free(bytes); } } @@ -244,6 +314,16 @@ enum PendingLock { Untrack { key: Vec<u8> }, } +impl HeapSize for PendingLock { + fn approximate_heap_size(&self) -> usize { + match self { + PendingLock::Track { key, .. } | PendingLock::Untrack { key } => { + key.approximate_heap_size() + } + } + } +} + /// A CDC delegate of a raftstore region peer. /// /// It converts raft commands into CDC events and broadcasts them to downstreams. @@ -265,14 +345,18 @@ pub struct Delegate { impl Delegate { /// Create a Delegate for the given region.
- pub fn new(region_id: u64, txn_extra_op: Arc<AtomicCell<TxnExtraOp>>) -> Delegate { + pub fn new( + region_id: u64, + txn_extra_op: Arc<AtomicCell<TxnExtraOp>>, + memory_quota: Arc<MemoryQuota>, + ) -> Delegate { Delegate { region_id, handle: ObserveHandle::new(), resolver: None, region: None, resolved_downstreams: Vec::new(), - pending: Some(Pending::default()), + pending: Some(Pending::new(memory_quota)), txn_extra_op, failed: false, } @@ -350,10 +434,15 @@ impl Delegate { downstream.state.store(DownstreamState::Stopped); let error_event = error.clone(); if let Err(err) = downstream.sink_error_event(region_id, error_event) { - warn!("cdc broadcast error failed"; + warn!("cdc send region error failed"; "region_id" => region_id, "error" => ?err, "origin_error" => ?error, "downstream_id" => ?downstream.id, "downstream" => ?downstream.peer, "request_id" => downstream.req_id, "conn_id" => ?downstream.conn_id); + } else { + info!("cdc send region error success"; + "region_id" => region_id, "origin_error" => ?error, + "downstream_id" => ?downstream.id, "downstream" => ?downstream.peer, + "request_id" => downstream.req_id, "conn_id" => ?downstream.conn_id); } Ok(()) }; @@ -373,9 +462,9 @@ impl Delegate { self.txn_extra_op.as_ref() } - fn broadcast<F>(&self, send: F) -> Result<()> + fn broadcast<F>(&self, mut send: F) -> Result<()> where - F: Fn(&Downstream) -> Result<()>, + F: FnMut(&Downstream) -> Result<()>, { let downstreams = self.downstreams(); assert!( @@ -395,7 +484,7 @@ impl Delegate { &mut self, mut resolver: Resolver, region: Region, - ) -> Vec<(&Downstream, Error)> { + ) -> Result<Vec<(&Downstream, Error)>> { assert!( self.resolver.is_none(), "region {} resolver should not be ready", @@ -408,26 +497,24 @@ impl Delegate { } // Mark the delegate as initialized. - let mut pending = self.pending.take().unwrap(); - self.region = Some(region); info!("cdc region is ready"; "region_id" => self.region_id); + // Downstreams in pending must be moved to resolved_downstreams + // immediately, and we must not return in the middle; otherwise the + // delegate loses downstreams. + let mut pending = self.pending.take().unwrap(); + self.resolved_downstreams = mem::take(&mut pending.downstreams); - for lock in mem::take(&mut pending.locks) { - match lock { - PendingLock::Track { key, start_ts } => resolver.track_lock(start_ts, key, None), - PendingLock::Untrack { key } => resolver.untrack_lock(&key, None), - } - } + pending.on_region_ready(&mut resolver)?; self.resolver = Some(resolver); + self.region = Some(region); - self.resolved_downstreams = mem::take(&mut pending.downstreams); let mut failed_downstreams = Vec::new(); for downstream in self.downstreams() { if let Err(e) = self.check_epoch_on_ready(downstream) { failed_downstreams.push((downstream, e)); } } - failed_downstreams + Ok(failed_downstreams) } /// Try to advance and broadcast resolved ts.
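Editor's note: the reordering inside `on_region_ready` above is subtle, so here is a minimal, self-contained sketch (simplified stand-in types, not TiKV code) of the invariant the new comment describes: ownership of the pending downstreams must transfer before the fallible lock replay, so an early `?` return cannot lose them.

```rust
// Sketch only: `Pending`/`Delegate` are simplified stand-ins.
struct Pending {
    downstreams: Vec<String>,
    locks: Vec<u64>,
}

struct Delegate {
    pending: Option<Pending>,
    resolved_downstreams: Vec<String>,
}

impl Delegate {
    fn on_region_ready(&mut self) -> Result<(), String> {
        let mut pending = self.pending.take().unwrap();
        // 1) Infallible step first: move the downstreams out of `pending`.
        self.resolved_downstreams = std::mem::take(&mut pending.downstreams);
        // 2) Fallible step second: replaying locks may fail (e.g. memory
        //    quota exceeded), but an early return no longer loses downstreams.
        for lock in std::mem::take(&mut pending.locks) {
            replay_lock(lock)?;
        }
        Ok(())
    }
}

fn replay_lock(_lock: u64) -> Result<(), String> {
    Ok(())
}
```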
@@ -439,7 +526,7 @@ impl Delegate { } debug!("cdc try to advance ts"; "region_id" => self.region_id, "min_ts" => min_ts); let resolver = self.resolver.as_mut().unwrap(); - let resolved_ts = resolver.resolve(min_ts, None); + let resolved_ts = resolver.resolve(min_ts, None, TsSource::Cdc); debug!("cdc resolved ts updated"; "region_id" => self.region_id, "resolved_ts" => resolved_ts); Some(resolved_ts) @@ -491,7 +578,6 @@ impl Delegate { request_id: u64, entries: Vec>, filter_loop: bool, - observed_range: &ObservedRange, ) -> Result> { let entries_len = entries.len(); let mut rows = vec![Vec::with_capacity(entries_len)]; @@ -509,25 +595,19 @@ impl Delegate { lock, old_value, })) => { - if !observed_range.contains_encoded_key(&lock.0) { - continue; - } let l = Lock::parse(&lock.1).unwrap(); if decode_lock(lock.0, l, &mut row, &mut _has_value) { continue; } decode_default(default.1, &mut row, &mut _has_value); row.old_value = old_value.finalized().unwrap_or_default(); - row_size = row.key.len() + row.value.len(); + row_size = row.key.len() + row.value.len() + row.old_value.len(); } Some(KvEntry::TxnEntry(TxnEntry::Commit { default, write, old_value, })) => { - if !observed_range.contains_encoded_key(&write.0) { - continue; - } if decode_write(write.0, &write.1, &mut row, &mut _has_value, false) { continue; } @@ -546,7 +626,7 @@ impl Delegate { } set_event_row_type(&mut row, EventLogType::Committed); row.old_value = old_value.finalized().unwrap_or_default(); - row_size = row.key.len() + row.value.len(); + row_size = row.key.len() + row.value.len() + row.old_value.len(); } None => { // This type means scan has finished. @@ -597,26 +677,21 @@ impl Delegate { is_one_pc: bool, ) -> Result<()> { debug_assert_eq!(self.txn_extra_op.load(), TxnExtraOp::ReadOldValue); - let mut read_old_value = |row: &mut EventRow, read_old_ts| -> Result<()> { + let read_old_value = |row: &mut EventRow, read_old_ts| -> Result<()> { let key = Key::from_raw(&row.key).append_ts(row.start_ts.into()); let old_value = old_value_cb(key, read_old_ts, old_value_cache, statistics)?; row.old_value = old_value.unwrap_or_default(); Ok(()) }; - // map[key] -> (event, has_value). - let mut txn_rows: HashMap, (EventRow, bool)> = HashMap::default(); + // map[key] -> (event, has_value, old_value_ts). 
+ let mut txn_rows: HashMap, (EventRow, bool, Option)> = + HashMap::default(); let mut raw_rows: Vec = Vec::new(); for mut req in requests { - match req.get_cmd_type() { + let res = match req.get_cmd_type() { CmdType::Put => { - self.sink_put( - req.take_put(), - is_one_pc, - &mut txn_rows, - &mut raw_rows, - &mut read_old_value, - )?; + self.sink_put(req.take_put(), is_one_pc, &mut txn_rows, &mut raw_rows) } CmdType::Delete => self.sink_delete(req.take_delete()), _ => { @@ -625,31 +700,94 @@ impl Delegate { "region_id" => self.region_id, "command" => ?req, ); + Ok(()) } + }; + if res.is_err() { + self.mark_failed(); + return res; } } - let mut rows = Vec::with_capacity(txn_rows.len()); - for (_, (v, has_value)) in txn_rows { - if v.r_type == EventLogType::Prewrite && v.op_type == EventRowOpType::Put && !has_value + self.sink_downstream_tidb(txn_rows.into_values(), read_old_value)?; + self.sink_downstream_raw(raw_rows, index)?; + Ok(()) + } + + fn sink_downstream_tidb( + &mut self, + entries: impl Iterator)>, + mut read_old_value: impl FnMut(&mut EventRow, TimeStamp) -> Result<()>, + ) -> Result<()> { + let mut entries = entries + .filter(|x| !TxnSource::is_lossy_ddl_reorg_source_set(x.0.txn_source)) + // It's possible that a prewrite command only contains lock but without + // default. It's not documented by classic Percolator but introduced with + // Large-Transaction. Those prewrites are not complete, we must skip them. + .filter(|x| !(x.0.r_type == EventLogType::Prewrite && x.0.op_type == EventRowOpType::Put && !x.1)) + .map(|x| (x.0, x.2)) + .collect::>(); + + let downstreams = self.downstreams(); + assert!( + !downstreams.is_empty(), + "region {} miss downstream", + self.region_id + ); + + let region_id = self.region_id; + let send = move |downstream: &Downstream| { + // No ready downstream or a downstream that does not match the kv_api type, will + // be ignored. There will be one region that contains both Txn & Raw entries. + // The judgement here is for sending entries to downstreams with correct kv_api. + if !downstream.state.load().ready_for_change_events() + || downstream.kv_api != ChangeDataRequestKvApi::TiDb { - // It's possible that a prewrite command only contains lock but without - // default. It's not documented by classic Percolator but introduced with - // Large-Transaction. Those prewrites are not complete, we must skip them. - continue; + return Ok(()); + } + + let mut d_entries = Vec::with_capacity(entries.len()); + for (r, old_value_ts) in &mut entries { + if !downstream.observed_range.contains_raw_key(&r.key) + || downstream.filter_loop && TxnSource::is_cdc_write_source_set(r.txn_source) + { + continue; + } + if let Some(ts) = old_value_ts { + read_old_value(r, *ts)?; + *old_value_ts = None; + } + d_entries.push(r.clone()); + } + + if d_entries.is_empty() { + return Ok(()); + } + + let event = Event { + region_id, + request_id: downstream.get_req_id(), + event: Some(Event_oneof_event::Entries(EventEntries { + entries: d_entries.into(), + ..Default::default() + })), + ..Default::default() + }; + + // Do not force send for real time change data events. 
+ let force_send = false; + downstream.sink_event(event, force_send) + }; + match self.broadcast(send) { + Ok(()) => Ok(()), + Err(e) => { + self.mark_failed(); + Err(e) } - rows.push(v); } - self.sink_downstream(rows, index, ChangeDataRequestKvApi::TiDb)?; - self.sink_downstream(raw_rows, index, ChangeDataRequestKvApi::RawKv) } - fn sink_downstream( - &mut self, - entries: Vec, - index: u64, - kv_api: ChangeDataRequestKvApi, - ) -> Result<()> { + fn sink_downstream_raw(&mut self, entries: Vec, index: u64) -> Result<()> { if entries.is_empty() { return Ok(()); } @@ -692,7 +830,9 @@ impl Delegate { // No ready downstream or a downstream that does not match the kv_api type, will // be ignored. There will be one region that contains both Txn & Raw entries. // The judgement here is for sending entries to downstreams with correct kv_api. - if !downstream.state.load().ready_for_change_events() || downstream.kv_api != kv_api { + if !downstream.state.load().ready_for_change_events() + || downstream.kv_api != ChangeDataRequestKvApi::RawKv + { return Ok(()); } if downstream.filter_loop && filtered_entries.is_none() { @@ -739,15 +879,14 @@ impl Delegate { &mut self, put: PutRequest, is_one_pc: bool, - txn_rows: &mut HashMap, (EventRow, bool)>, + txn_rows: &mut HashMap, (EventRow, bool, Option)>, raw_rows: &mut Vec, - read_old_value: impl FnMut(&mut EventRow, TimeStamp) -> Result<()>, ) -> Result<()> { let key_mode = ApiV2::parse_key_mode(put.get_key()); if key_mode == KeyMode::Raw { self.sink_raw_put(put, raw_rows) } else { - self.sink_txn_put(put, is_one_pc, txn_rows, read_old_value) + self.sink_txn_put(put, is_one_pc, txn_rows) } } @@ -762,21 +901,19 @@ impl Delegate { &mut self, mut put: PutRequest, is_one_pc: bool, - rows: &mut HashMap, (EventRow, bool)>, - mut read_old_value: impl FnMut(&mut EventRow, TimeStamp) -> Result<()>, + rows: &mut HashMap, (EventRow, bool, Option)>, ) -> Result<()> { match put.cf.as_str() { "write" => { - let (mut row, mut has_value) = (EventRow::default(), false); + let (mut row, mut has_value, mut old_value_ts) = (EventRow::default(), false, None); if decode_write(put.take_key(), &put.value, &mut row, &mut has_value, true) { return Ok(()); } let commit_ts = if is_one_pc { set_event_row_type(&mut row, EventLogType::Committed); - let commit_ts = TimeStamp::from(row.commit_ts); - read_old_value(&mut row, commit_ts.prev())?; - Some(commit_ts) + old_value_ts = Some(TimeStamp::from(row.commit_ts)); + Some(TimeStamp::from(row.commit_ts)) } else { // 2PC if row.commit_ts == 0 { @@ -802,9 +939,12 @@ impl Delegate { let o = o.into_mut(); mem::swap(&mut o.0.value, &mut row.value); o.0 = row; + if old_value_ts.is_some() { + o.2 = old_value_ts; + } } HashMapEntry::Vacant(v) => { - v.insert((row, has_value)); + v.insert((row, has_value, old_value_ts)); } } } @@ -816,23 +956,20 @@ impl Delegate { return Ok(()); } - let read_old_ts = std::cmp::max(for_update_ts, row.start_ts.into()); - read_old_value(&mut row, read_old_ts)?; + let old_value_ts = Some(std::cmp::max(for_update_ts, row.start_ts.into())); // In order to compute resolved ts, we must track inflight txns. 
match self.resolver { Some(ref mut resolver) => { - resolver.track_lock(row.start_ts.into(), row.key.clone(), None) + resolver.track_lock(row.start_ts.into(), row.key.clone(), None)?; } None => { assert!(self.pending.is_some(), "region resolver not ready"); let pending = self.pending.as_mut().unwrap(); - pending.locks.push(PendingLock::Track { + pending.push_pending_lock(PendingLock::Track { key: row.key.clone(), start_ts: row.start_ts.into(), - }); - pending.pending_bytes += row.key.len(); - CDC_PENDING_BYTES_GAUGE.add(row.key.len() as i64); + })?; } } @@ -841,8 +978,9 @@ impl Delegate { assert!(!has_value); has_value = true; mem::swap(&mut occupied.0.value, &mut row.value); + occupied.2 = old_value_ts; } - *occupied = (row, has_value); + *occupied = (row, has_value, old_value_ts); } "" | "default" => { let key = Key::from_encoded(put.take_key()).truncate_ts().unwrap(); @@ -854,7 +992,7 @@ impl Delegate { Ok(()) } - fn sink_delete(&mut self, mut delete: DeleteRequest) { + fn sink_delete(&mut self, mut delete: DeleteRequest) -> Result<()> { match delete.cf.as_str() { "lock" => { let raw_key = Key::from_encoded(delete.take_key()).into_raw().unwrap(); @@ -862,11 +1000,8 @@ impl Delegate { Some(ref mut resolver) => resolver.untrack_lock(&raw_key, None), None => { assert!(self.pending.is_some(), "region resolver not ready"); - let key_len = raw_key.len(); let pending = self.pending.as_mut().unwrap(); - pending.locks.push(PendingLock::Untrack { key: raw_key }); - pending.pending_bytes += key_len; - CDC_PENDING_BYTES_GAUGE.add(key_len as i64); + pending.push_pending_lock(PendingLock::Untrack { key: raw_key })?; } } } @@ -875,6 +1010,7 @@ impl Delegate { panic!("invalid cf {}", other); } } + Ok(()) } fn sink_admin(&mut self, request: AdminRequest, mut response: AdminResponse) -> Result<()> { @@ -945,7 +1081,7 @@ impl Delegate { } fn stop_observing(&self) { - info!("stop observing"; "region_id" => self.region_id, "failed" => self.failed); + info!("cdc stop observing"; "region_id" => self.region_id, "failed" => self.failed); // Stop observe further events. self.handle.stop_observing(); // To inform transaction layer no more old values are required for the region. 
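Editor's note: a condensed sketch of the accounting rule behind `push_pending_lock` and `Drop for Pending` above. `Quota` is a hypothetical stand-in for `tikv_util::memory::MemoryQuota`; the point is that the push path and the drop path must charge and refund the same byte count, or quota leaks.

```rust
use std::sync::{
    atomic::{AtomicUsize, Ordering},
    Arc,
};

// Hypothetical stand-in for tikv_util::memory::MemoryQuota.
struct Quota(AtomicUsize);

impl Quota {
    fn alloc(&self, bytes: usize) {
        self.0.fetch_add(bytes, Ordering::Relaxed);
    }
    fn free(&self, bytes: usize) {
        self.0.fetch_sub(bytes, Ordering::Relaxed);
    }
}

struct PendingLocks {
    locks: Vec<Vec<u8>>,
    quota: Arc<Quota>,
}

impl PendingLocks {
    fn push(&mut self, key: Vec<u8>) {
        // Charge quota up front, using the same size function as `drop`.
        self.quota.alloc(key.len());
        self.locks.push(key);
    }
}

impl Drop for PendingLocks {
    fn drop(&mut self) {
        // Refund exactly what is still tracked; any mismatch leaks quota.
        let bytes: usize = self.locks.iter().map(|k| k.len()).sum();
        self.quota.free(bytes);
    }
}
```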
@@ -1142,6 +1278,10 @@ impl ObservedRange { entries.retain(|e| self.is_key_in_range(&self.start_key_raw, &self.end_key_raw, &e.key)); entries } + + fn contains_raw_key(&self, key: &[u8]) -> bool { + self.is_key_in_range(&self.start_key_raw, &self.end_key_raw, key) + } } #[cfg(test)] @@ -1151,9 +1291,10 @@ mod tests { use api_version::RawValue; use futures::{executor::block_on, stream::StreamExt}; use kvproto::{errorpb::Error as ErrorHeader, metapb::Region}; + use tikv_util::memory::MemoryQuota; use super::*; - use crate::channel::{channel, recv_timeout, MemoryQuota}; + use crate::channel::{channel, recv_timeout}; #[test] fn test_error() { @@ -1165,8 +1306,8 @@ mod tests { region.mut_region_epoch().set_conf_ver(2); let region_epoch = region.get_region_epoch().clone(); - let quota = crate::channel::MemoryQuota::new(usize::MAX); - let (sink, mut drain) = crate::channel::channel(1, quota); + let quota = Arc::new(MemoryQuota::new(usize::MAX)); + let (sink, mut drain) = channel(ConnId::default(), 1, quota); let rx = drain.drain(); let request_id = 123; let mut downstream = Downstream::new( @@ -1179,11 +1320,18 @@ mod tests { ObservedRange::default(), ); downstream.set_sink(sink); - let mut delegate = Delegate::new(region_id, Default::default()); + let memory_quota = Arc::new(MemoryQuota::new(usize::MAX)); + let mut delegate = Delegate::new(region_id, Default::default(), memory_quota); delegate.subscribe(downstream).unwrap(); assert!(delegate.handle.is_observing()); - let resolver = Resolver::new(region_id); - assert!(delegate.on_region_ready(resolver, region).is_empty()); + let memory_quota = Arc::new(MemoryQuota::new(std::usize::MAX)); + let resolver = Resolver::new(region_id, memory_quota); + assert!( + delegate + .on_region_ready(resolver, region) + .unwrap() + .is_empty() + ); assert!(delegate.downstreams()[0].observed_range.all_key_covered); let rx_wrap = Cell::new(Some(rx)); @@ -1307,8 +1455,9 @@ mod tests { }; // Create a new delegate. 
+ let memory_quota = Arc::new(MemoryQuota::new(usize::MAX)); let txn_extra_op = Arc::new(AtomicCell::new(TxnExtraOp::Noop)); - let mut delegate = Delegate::new(1, txn_extra_op.clone()); + let mut delegate = Delegate::new(1, txn_extra_op.clone(), memory_quota); assert_eq!(txn_extra_op.load(), TxnExtraOp::Noop); assert!(delegate.handle.is_observing()); @@ -1333,7 +1482,10 @@ mod tests { region.mut_region_epoch().set_conf_ver(1); region.mut_region_epoch().set_version(1); { - let failures = delegate.on_region_ready(Resolver::new(1), region); + let memory_quota = Arc::new(MemoryQuota::new(std::usize::MAX)); + let failures = delegate + .on_region_ready(Resolver::new(1, memory_quota), region) + .unwrap(); assert_eq!(failures.len(), 1); let id = failures[0].0.id; delegate.unsubscribe(id, None); @@ -1424,8 +1576,9 @@ mod tests { Key::from_raw(b"d").into_encoded(), ) .unwrap(); + let memory_quota = Arc::new(MemoryQuota::new(usize::MAX)); let txn_extra_op = Arc::new(AtomicCell::new(TxnExtraOp::Noop)); - let mut delegate = Delegate::new(1, txn_extra_op); + let mut delegate = Delegate::new(1, txn_extra_op, memory_quota); assert!(delegate.handle.is_observing()); let mut map = HashMap::default(); @@ -1438,41 +1591,35 @@ mod tests { put.key.clone(), 1.into(), 10, - None, + Some(b"value".to_vec()), TimeStamp::zero(), 0, TimeStamp::zero(), false, ) .to_bytes(); - delegate - .sink_txn_put( - put, - false, - &mut map, - |_: &mut EventRow, _: TimeStamp| Ok(()), - ) - .unwrap(); + delegate.sink_txn_put(put, false, &mut map).unwrap(); } assert_eq!(map.len(), 5); - let (sink, mut drain) = channel(1, MemoryQuota::new(1024)); + let conn_id = ConnId::default(); + let (sink, mut drain) = channel(conn_id, 1, Arc::new(MemoryQuota::new(1024))); let downstream = Downstream { id: DownstreamId::new(), req_id: 1, - conn_id: ConnId::new(), + conn_id, peer: String::new(), region_epoch: RegionEpoch::default(), sink: Some(sink), state: Arc::new(AtomicCell::new(DownstreamState::Normal)), + scan_truncated: Arc::new(Default::default()), kv_api: ChangeDataRequestKvApi::TiDb, filter_loop: false, observed_range, }; delegate.add_downstream(downstream); - let entries = map.values().map(|(r, _)| r).cloned().collect(); delegate - .sink_downstream(entries, 1, ChangeDataRequestKvApi::TiDb) + .sink_downstream_tidb(map.into_values(), |_, _| Ok(())) .unwrap(); let (mut tx, mut rx) = futures::channel::mpsc::unbounded(); @@ -1493,8 +1640,9 @@ mod tests { Key::from_raw(b"f").into_encoded(), ) .unwrap(); + let memory_quota = Arc::new(MemoryQuota::new(usize::MAX)); let txn_extra_op = Arc::new(AtomicCell::new(TxnExtraOp::Noop)); - let mut delegate = Delegate::new(1, txn_extra_op); + let mut delegate = Delegate::new(1, txn_extra_op, memory_quota); assert!(delegate.handle.is_observing()); let mut map = HashMap::default(); @@ -1507,7 +1655,7 @@ mod tests { put.key.clone(), 1.into(), 10, - None, + Some(b"value".to_vec()), TimeStamp::zero(), 0, TimeStamp::zero(), @@ -1518,34 +1666,28 @@ mod tests { lock = lock.set_txn_source(txn_source.into()); } put.value = lock.to_bytes(); - delegate - .sink_txn_put( - put, - false, - &mut map, - |_: &mut EventRow, _: TimeStamp| Ok(()), - ) - .unwrap(); + delegate.sink_txn_put(put, false, &mut map).unwrap(); } assert_eq!(map.len(), 5); - let (sink, mut drain) = channel(1, MemoryQuota::new(1024)); + let conn_id = ConnId::default(); + let (sink, mut drain) = channel(conn_id, 1, Arc::new(MemoryQuota::new(1024))); let downstream = Downstream { id: DownstreamId::new(), req_id: 1, - conn_id: ConnId::new(), + conn_id, peer: 
String::new(), region_epoch: RegionEpoch::default(), sink: Some(sink), state: Arc::new(AtomicCell::new(DownstreamState::Normal)), + scan_truncated: Arc::new(Default::default()), kv_api: ChangeDataRequestKvApi::TiDb, filter_loop, observed_range, }; delegate.add_downstream(downstream); - let entries = map.values().map(|(r, _)| r).cloned().collect(); delegate - .sink_downstream(entries, 1, ChangeDataRequestKvApi::TiDb) + .sink_downstream_tidb(map.into_values(), |_, _| Ok(())) .unwrap(); let (mut tx, mut rx) = futures::channel::mpsc::unbounded(); diff --git a/components/cdc/src/endpoint.rs b/components/cdc/src/endpoint.rs index 23a3e410467..1b9f5bb1a52 100644 --- a/components/cdc/src/endpoint.rs +++ b/components/cdc/src/endpoint.rs @@ -36,11 +36,12 @@ use raftstore::{ use resolved_ts::{resolve_by_raft, LeadershipResolver, Resolver}; use security::SecurityManager; use tikv::{ - config::CdcConfig, + config::{CdcConfig, ResolvedTsConfig}, storage::{kv::LocalTablets, Statistics}, }; use tikv_util::{ debug, defer, error, impl_display_as_debug, info, + memory::MemoryQuota, mpsc::bounded, slow_log, sys::thread::ThreadBuildWrapper, @@ -56,9 +57,9 @@ use tokio::{ use txn_types::{TimeStamp, TxnExtra, TxnExtraScheduler}; use crate::{ - channel::{CdcEvent, MemoryQuota, SendError}, + channel::{CdcEvent, SendError}, delegate::{on_init_downstream, Delegate, Downstream, DownstreamId, DownstreamState}, - initializer::Initializer, + initializer::{InitializeStats, Initializer}, metrics::*, old_value::{OldValueCache, OldValueCallback}, service::{validate_kv_api, Conn, ConnId, FeatureGate}, @@ -79,6 +80,11 @@ pub enum Deregister { conn_id: ConnId, request_id: u64, }, + Region { + conn_id: ConnId, + request_id: u64, + region_id: u64, + }, Downstream { conn_id: ConnId, request_id: u64, @@ -111,6 +117,16 @@ impl fmt::Debug for Deregister { .field("conn_id", conn_id) .field("request_id", request_id) .finish(), + Deregister::Region { + ref conn_id, + ref request_id, + ref region_id, + } => de + .field("deregister", &"region") + .field("conn_id", conn_id) + .field("request_id", request_id) + .field("region_id", region_id) + .finish(), Deregister::Downstream { ref conn_id, ref request_id, @@ -144,6 +160,7 @@ type InitCallback = Box; pub enum Validate { Region(u64, Box) + Send>), OldValueCache(Box), + InitializeStats(Box), } pub enum Task { @@ -271,6 +288,7 @@ impl fmt::Debug for Task { Task::Validate(validate) => match validate { Validate::Region(region_id, _) => de.field("region_id", ®ion_id).finish(), Validate::OldValueCache(_) => de.finish(), + Validate::InitializeStats(_) => de.finish(), }, Task::ChangeConfig(change) => de .field("type", &"change_config") @@ -362,15 +380,17 @@ pub struct Endpoint { raftstore_v2: bool, config: CdcConfig, + resolved_ts_config: ResolvedTsConfig, api_version: ApiVersion, // Incremental scan workers: Runtime, scan_concurrency_semaphore: Arc, scan_speed_limiter: Limiter, + fetch_speed_limiter: Limiter, max_scan_batch_bytes: usize, max_scan_batch_size: usize, - sink_memory_quota: MemoryQuota, + sink_memory_quota: Arc, old_value_cache: OldValueCache, resolved_region_heap: RefCell, @@ -384,12 +404,16 @@ pub struct Endpoint { resolved_region_count: usize, unresolved_region_count: usize, warn_resolved_ts_repeat_count: usize, + + // Validate statistics of the next incremental scan. Only for tests. 
+ validate_next_initialize_stats: Option>, } impl, E: KvEngine, S: StoreRegionMeta> Endpoint { pub fn new( cluster_id: u64, config: &CdcConfig, + resolved_ts_config: &ResolvedTsConfig, raftstore_v2: bool, api_version: ApiVersion, pd_client: Arc, @@ -401,7 +425,7 @@ impl, E: KvEngine, S: StoreRegionMeta> Endpoint, security_mgr: Arc, - sink_memory_quota: MemoryQuota, + sink_memory_quota: Arc, causal_ts_provider: Option>, ) -> Endpoint { let workers = Builder::new_multi_thread() @@ -423,11 +447,16 @@ impl, E: KvEngine, S: StoreRegionMeta> Endpoint 0 { + let scan_speed_limiter = Limiter::new(if config.incremental_scan_speed_limit.0 > 0 { config.incremental_scan_speed_limit.0 as f64 } else { f64::INFINITY }); + let fetch_speed_limiter = Limiter::new(if config.incremental_fetch_speed_limit.0 > 0 { + config.incremental_fetch_speed_limit.0 as f64 + } else { + f64::INFINITY + }); CDC_SINK_CAP.set(sink_memory_quota.capacity() as i64); // For scan efficiency, the scan batch bytes should be around 1MB. @@ -453,10 +482,12 @@ impl, E: KvEngine, S: StoreRegionMeta> Endpoint, E: KvEngine, S: StoreRegionMeta> Endpoint, E: KvEngine, S: StoreRegionMeta> Endpoint 0 { + self.config.incremental_fetch_speed_limit.0 as f64 + } else { + f64::INFINITY + }; + + self.fetch_speed_limiter.set_speed_limit(new_speed_limit); + } } pub fn set_max_scan_batch_size(&mut self, max_scan_batch_size: usize) { @@ -582,8 +624,20 @@ impl, E: KvEngine, S: StoreRegionMeta> Endpoint { let conn = self.connections.get_mut(&conn_id).unwrap(); - for (region, downstream) in conn.unsubscribe_request(request_id) { - self.deregister_downstream(region, downstream, None); + for (region_id, downstream) in conn.unsubscribe_request(request_id) { + let err = Some(Error::Other("region not found".into())); + self.deregister_downstream(region_id, downstream, err); + } + } + Deregister::Region { + conn_id, + request_id, + region_id, + } => { + let conn = self.connections.get_mut(&conn_id).unwrap(); + if let Some(downstream) = conn.unsubscribe(request_id, region_id) { + let err = Some(Error::Other("region not found".into())); + self.deregister_downstream(region_id, downstream, err); } } Deregister::Downstream { @@ -647,8 +701,20 @@ impl, E: KvEngine, S: StoreRegionMeta> Endpoint conn, + None => { + info!("cdc register region on an deregistered connection, ignore"; + "region_id" => region_id, + "conn_id" => ?conn_id, + "req_id" => request_id, + "downstream_id" => ?downstream_id); + return; + } + }; downstream.set_sink(conn.get_sink().clone()); // Check if the cluster id matches if supported. 
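Editor's note: both `incremental_scan_speed_limit` and the new `incremental_fetch_speed_limit` above follow the same convention: a configured byte rate of 0 means "no limit", which the endpoint encodes as an infinite speed for the limiter. A tiny sketch of that conversion (a plain function, not the `tikv_util` Limiter API):

```rust
/// Convert a configured byte rate into a limiter speed, where 0 disables
/// throttling. Mirrors the `if .. > 0 { .. as f64 } else { f64::INFINITY }`
/// pattern used by the endpoint above.
fn limiter_speed(bytes_per_sec: u64) -> f64 {
    if bytes_per_sec > 0 {
        bytes_per_sec as f64
    } else {
        f64::INFINITY
    }
}

fn main() {
    assert_eq!(limiter_speed(512 << 20), (512u64 << 20) as f64);
    assert!(limiter_speed(0).is_infinite()); // 0 => no limit
}
```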
@@ -681,7 +747,9 @@ impl, E: KvEngine, S: StoreRegionMeta> Endpoint reader.txn_extra_op.clone(), None => { error!("cdc register for a not found region"; "region_id" => region_id); - let _ = downstream.sink_region_not_found(region_id); + let mut err_event = EventError::default(); + err_event.mut_region_not_found().region_id = region_id; + let _ = downstream.sink_error_event(region_id, err_event); return; } }; @@ -708,7 +776,11 @@ impl, E: KvEngine, S: StoreRegionMeta> Endpoint e.into_mut(), HashMapEntry::Vacant(e) => { is_new_delegate = true; - e.insert(Delegate::new(region_id, txn_extra_op)) + e.insert(Delegate::new( + region_id, + txn_extra_op, + self.sink_memory_quota.clone(), + )) } }; @@ -750,6 +822,7 @@ impl, E: KvEngine, S: StoreRegionMeta> Endpoint, E: KvEngine, S: StoreRegionMeta> Endpoint, E: KvEngine, S: StoreRegionMeta> Endpoint { + Ok(stats) => { CDC_SCAN_TASKS.with_label_values(&["finish"]).inc(); + if let Some(validate) = validate_initialize_stats { + validate(stats); + } } Err(e) => { CDC_SCAN_TASKS.with_label_values(&["abort"]).inc(); @@ -830,33 +910,43 @@ impl, E: KvEngine, S: StoreRegionMeta> Endpoint { + debug!("cdc region not found on region ready (finish building resolver)"; + "region_id" => region.get_id()); + } + Some(delegate) => { + if delegate.handle.id != observe_id { + debug!("cdc stale region ready"; + "region_id" => region.get_id(), + "observe_id" => ?observe_id, + "current_id" => ?delegate.handle.id); + return; + } + match delegate.on_region_ready(resolver, region) { + Ok(fails) => { + let mut deregisters = Vec::new(); + for (downstream, e) in fails { + deregisters.push(Deregister::Downstream { + conn_id: downstream.get_conn_id(), + request_id: downstream.get_req_id(), + region_id, + downstream_id: downstream.get_id(), + err: Some(e), + }); + } + // Deregister downstreams if there is any downstream fails to subscribe. + for deregister in deregisters { + self.on_deregister(deregister); + } + } + Err(e) => self.on_deregister(Deregister::Delegate { region_id, - downstream_id: downstream.get_id(), - err: Some(e), - }); + observe_id, + err: e, + }), } - } else { - debug!("cdc stale region ready"; - "region_id" => region.get_id(), - "observe_id" => ?observe_id, - "current_id" => ?delegate.handle.id); } - } else { - debug!("cdc region not found on region ready (finish building resolver)"; - "region_id" => region.get_id()); - } - - // Deregister downstreams if there is any downstream fails to subscribe. - for deregister in failed_downstreams { - self.on_deregister(deregister); } } @@ -1060,6 +1150,7 @@ impl, E: KvEngine, S: StoreRegionMeta> Endpoint, E: KvEngine, S: StoreRegionMeta> Endpoint (), // Must schedule `RegisterMinTsEvent` event otherwise resolved ts can not // advance normally. 
- Err(err) => panic!("failed to regiester min ts event, error: {:?}", err), + Err(err) => panic!("failed to register min ts event, error: {:?}", err), } } else { // During shutdown, tso runtime drops future immediately, @@ -1114,7 +1205,9 @@ impl, E: KvEngine, S: StoreRegionMeta> Endpoint, E: KvEngine, S: StoreRegionMeta + Send> Runnable Validate::OldValueCache(validate) => { validate(&self.old_value_cache); } + Validate::InitializeStats(validate) => { + self.validate_next_initialize_stats = Some(validate); + } }, Task::ChangeConfig(change) => self.on_change_cfg(change), } @@ -1247,13 +1343,12 @@ impl, E: KvEngine, S: StoreRegionMeta + Send> Runnable for Endpoint { fn on_timeout(&mut self) { - CDC_ENDPOINT_PENDING_TASKS.set(self.scheduler.pending_tasks() as _); - // Reclaim resolved_region_heap memory. self.resolved_region_heap .borrow_mut() .reset_and_shrink_to(self.capture_regions.len()); + CDC_ENDPOINT_PENDING_TASKS.set(self.scheduler.pending_tasks() as _); CDC_CAPTURED_REGION_COUNT.set(self.capture_regions.len() as i64); CDC_REGION_RESOLVE_STATUS_GAUGE_VEC .with_label_values(&["unresolved"]) @@ -1261,6 +1356,7 @@ impl, E: KvEngine, S: StoreRegionMeta + Send> Runnable CDC_REGION_RESOLVE_STATUS_GAUGE_VEC .with_label_values(&["resolved"]) .set(self.resolved_region_count as _); + if self.min_resolved_ts != TimeStamp::max() { CDC_MIN_RESOLVED_TS_REGION.set(self.min_ts_region_id as i64); CDC_MIN_RESOLVED_TS.set(self.min_resolved_ts.physical() as i64); @@ -1438,6 +1534,7 @@ mod tests { let ep = Endpoint::new( DEFAULT_CLUSTER_ID, cfg, + &ResolvedTsConfig::default(), false, api_version, pd_client, @@ -1455,7 +1552,7 @@ mod tests { ConcurrencyManager::new(1.into()), env, security_mgr, - MemoryQuota::new(usize::MAX), + Arc::new(MemoryQuota::new(usize::MAX)), causal_ts_provider, ); @@ -1476,11 +1573,11 @@ mod tests { let mut suite = mock_endpoint(&cfg, None, ApiVersion::V1); suite.add_region(1, 100); - let quota = crate::channel::MemoryQuota::new(usize::MAX); - let (tx, mut rx) = channel::channel(1, quota); + let quota = Arc::new(MemoryQuota::new(usize::MAX)); + let (tx, mut rx) = channel::channel(ConnId::default(), 1, quota); let mut rx = rx.drain(); - let conn = Conn::new(tx, String::new()); + let conn = Conn::new(ConnId::default(), tx, String::new()); let conn_id = conn.get_id(); suite.run(Task::OpenConn { conn }); suite.run(set_conn_verion_task( @@ -1728,19 +1825,46 @@ mod tests { < f64::EPSILON ); } + + // Modify incremental_fetch_speed_limit. + { + let mut updated_cfg = cfg.clone(); + { + updated_cfg.incremental_fetch_speed_limit = ReadableSize::mb(2048); + } + let diff = cfg.diff(&updated_cfg); + + assert_eq!( + ep.config.incremental_fetch_speed_limit, + ReadableSize::mb(512) + ); + assert!( + (ep.fetch_speed_limiter.speed_limit() - ReadableSize::mb(512).0 as f64).abs() + < f64::EPSILON + ); + ep.run(Task::ChangeConfig(diff)); + assert_eq!( + ep.config.incremental_fetch_speed_limit, + ReadableSize::mb(2048) + ); + assert!( + (ep.fetch_speed_limiter.speed_limit() - ReadableSize::mb(2048).0 as f64).abs() + < f64::EPSILON + ); + } } #[test] fn test_raftstore_is_busy() { - let quota = crate::channel::MemoryQuota::new(usize::MAX); - let (tx, _rx) = channel::channel(1, quota); + let quota = Arc::new(MemoryQuota::new(usize::MAX)); + let (tx, _rx) = channel::channel(ConnId::default(), 1, quota); let mut suite = mock_endpoint(&CdcConfig::default(), None, ApiVersion::V1); // Fill the channel. 
suite.add_region(1 /* region id */, 1 /* cap */); suite.fill_raft_rx(1); - let conn = Conn::new(tx, String::new()); + let conn = Conn::new(ConnId::default(), tx, String::new()); let conn_id = conn.get_id(); suite.run(Task::OpenConn { conn }); suite.run(set_conn_verion_task(conn_id, semver::Version::new(0, 0, 0))); @@ -1785,11 +1909,11 @@ mod tests { }; let mut suite = mock_endpoint(&cfg, None, ApiVersion::V1); suite.add_region(1, 100); - let quota = crate::channel::MemoryQuota::new(usize::MAX); - let (tx, mut rx) = channel::channel(1, quota); + let quota = Arc::new(MemoryQuota::new(usize::MAX)); + let (tx, mut rx) = channel::channel(ConnId::default(), 1, quota); let mut rx = rx.drain(); - let conn = Conn::new(tx, String::new()); + let conn = Conn::new(ConnId::default(), tx, String::new()); let conn_id = conn.get_id(); suite.run(Task::OpenConn { conn }); @@ -1966,12 +2090,12 @@ mod tests { let mut suite = mock_endpoint(&cfg, None, ApiVersion::V1); suite.add_region(1, 100); - let quota = crate::channel::MemoryQuota::new(usize::MAX); - let (tx, mut rx) = channel::channel(1, quota); + let quota = Arc::new(MemoryQuota::new(usize::MAX)); + let (tx, mut rx) = channel::channel(ConnId::default(), 1, quota); let mut rx = rx.drain(); let mut region = Region::default(); region.set_id(1); - let conn = Conn::new(tx, String::new()); + let conn = Conn::new(ConnId::default(), tx, String::new()); let conn_id = conn.get_id(); suite.run(Task::OpenConn { conn }); @@ -1999,7 +2123,8 @@ mod tests { downstream, conn_id, }); - let resolver = Resolver::new(1); + let memory_quota = Arc::new(MemoryQuota::new(std::usize::MAX)); + let resolver = Resolver::new(1, memory_quota); let observe_id = suite.endpoint.capture_regions[&1].handle.id; suite.on_region_ready(observe_id, resolver, region.clone()); suite.run(Task::MinTs { @@ -2035,7 +2160,8 @@ mod tests { downstream, conn_id, }); - let resolver = Resolver::new(2); + let memory_quota = Arc::new(MemoryQuota::new(std::usize::MAX)); + let resolver = Resolver::new(2, memory_quota); region.set_id(2); let observe_id = suite.endpoint.capture_regions[&2].handle.id; suite.on_region_ready(observe_id, resolver, region); @@ -2056,12 +2182,12 @@ mod tests { } // Register region 3 to another conn which is not support batch resolved ts. 
- let quota = crate::channel::MemoryQuota::new(usize::MAX); - let (tx, mut rx2) = channel::channel(1, quota); + let quota = Arc::new(MemoryQuota::new(usize::MAX)); + let (tx, mut rx2) = channel::channel(ConnId::default(), 1, quota); let mut rx2 = rx2.drain(); let mut region = Region::default(); region.set_id(3); - let conn = Conn::new(tx, String::new()); + let conn = Conn::new(ConnId::default(), tx, String::new()); let conn_id = conn.get_id(); suite.run(Task::OpenConn { conn }); suite.run(set_conn_verion_task(conn_id, semver::Version::new(4, 0, 5))); @@ -2084,7 +2210,8 @@ mod tests { downstream, conn_id, }); - let resolver = Resolver::new(3); + let memory_quota = Arc::new(MemoryQuota::new(std::usize::MAX)); + let resolver = Resolver::new(3, memory_quota); region.set_id(3); let observe_id = suite.endpoint.capture_regions[&3].handle.id; suite.on_region_ready(observe_id, resolver, region); @@ -2127,11 +2254,11 @@ mod tests { fn test_deregister() { let mut suite = mock_endpoint(&CdcConfig::default(), None, ApiVersion::V1); suite.add_region(1, 100); - let quota = crate::channel::MemoryQuota::new(usize::MAX); - let (tx, mut rx) = channel::channel(1, quota); + let quota = Arc::new(MemoryQuota::new(usize::MAX)); + let (tx, mut rx) = channel::channel(ConnId::default(), 1, quota); let mut rx = rx.drain(); - let conn = Conn::new(tx, String::new()); + let conn = Conn::new(ConnId::default(), tx, String::new()); let conn_id = conn.get_id(); suite.run(Task::OpenConn { conn }); suite.run(set_conn_verion_task(conn_id, semver::Version::new(0, 0, 0))); @@ -2279,12 +2406,12 @@ mod tests { // Open two connections a and b, registers region 1, 2 to conn a and // region 3 to conn b. let mut conn_rxs = vec![]; - let quota = channel::MemoryQuota::new(usize::MAX); + let quota = Arc::new(MemoryQuota::new(usize::MAX)); for region_ids in vec![vec![1, 2], vec![3]] { - let (tx, rx) = channel::channel(1, quota.clone()); + let conn_id = ConnId::default(); + let (tx, rx) = channel::channel(conn_id, 1, quota.clone()); conn_rxs.push(rx); - let conn = Conn::new(tx, String::new()); - let conn_id = conn.get_id(); + let conn = Conn::new(conn_id, tx, String::new()); suite.run(Task::OpenConn { conn }); let version = FeatureGate::batch_resolved_ts(); suite.run(set_conn_verion_task(conn_id, version)); @@ -2311,7 +2438,8 @@ mod tests { downstream, conn_id, }); - let resolver = Resolver::new(region_id); + let memory_quota = Arc::new(MemoryQuota::new(std::usize::MAX)); + let resolver = Resolver::new(region_id, memory_quota); let observe_id = suite.endpoint.capture_regions[®ion_id].handle.id; let mut region = Region::default(); region.set_id(region_id); @@ -2392,11 +2520,11 @@ mod tests { fn test_deregister_conn_then_delegate() { let mut suite = mock_endpoint(&CdcConfig::default(), None, ApiVersion::V1); suite.add_region(1, 100); - let quota = crate::channel::MemoryQuota::new(usize::MAX); + let quota = Arc::new(MemoryQuota::new(usize::MAX)); // Open conn a - let (tx1, _rx1) = channel::channel(1, quota.clone()); - let conn_a = Conn::new(tx1, String::new()); + let (tx1, _rx1) = channel::channel(ConnId::default(), 1, quota.clone()); + let conn_a = Conn::new(ConnId::default(), tx1, String::new()); let conn_id_a = conn_a.get_id(); suite.run(Task::OpenConn { conn: conn_a }); suite.run(set_conn_verion_task( @@ -2405,9 +2533,9 @@ mod tests { )); // Open conn b - let (tx2, mut rx2) = channel::channel(1, quota); + let (tx2, mut rx2) = channel::channel(ConnId::default(), 1, quota); let mut rx2 = rx2.drain(); - let conn_b = Conn::new(tx2, 
String::new()); + let conn_b = Conn::new(ConnId::default(), tx2, String::new()); let conn_id_b = conn_b.get_id(); suite.run(Task::OpenConn { conn: conn_b }); suite.run(set_conn_verion_task( @@ -2470,10 +2598,11 @@ mod tests { let mut region = Region::default(); region.id = 1; region.set_region_epoch(region_epoch_2); + let memory_quota = Arc::new(MemoryQuota::new(std::usize::MAX)); suite.run(Task::ResolverReady { observe_id, region: region.clone(), - resolver: Resolver::new(1), + resolver: Resolver::new(1, memory_quota), }); // Deregister deletgate due to epoch not match for conn b. @@ -2557,11 +2686,11 @@ mod tests { ..Default::default() }; let mut suite = mock_endpoint(&cfg, None, ApiVersion::V1); - let quota = crate::channel::MemoryQuota::new(usize::MAX); - let (tx, mut rx) = channel::channel(1, quota); + let quota = Arc::new(MemoryQuota::new(usize::MAX)); + let (tx, mut rx) = channel::channel(ConnId::default(), 1, quota); let mut rx = rx.drain(); - let conn = Conn::new(tx, String::new()); + let conn = Conn::new(ConnId::default(), tx, String::new()); let conn_id = conn.get_id(); suite.run(Task::OpenConn { conn }); // Enable batch resolved ts in the test. @@ -2596,8 +2725,11 @@ mod tests { conn_id, }); - let mut resolver = Resolver::new(id); - resolver.track_lock(TimeStamp::compose(0, id), vec![], None); + let memory_quota = Arc::new(MemoryQuota::new(std::usize::MAX)); + let mut resolver = Resolver::new(id, memory_quota); + resolver + .track_lock(TimeStamp::compose(0, id), vec![], None) + .unwrap(); let mut region = Region::default(); region.id = id; region.set_region_epoch(region_epoch); @@ -2605,7 +2737,8 @@ mod tests { .capture_regions .get_mut(&id) .unwrap() - .on_region_ready(resolver, region); + .on_region_ready(resolver, region) + .unwrap(); assert!(failed.is_empty()); } suite @@ -2646,11 +2779,11 @@ mod tests { }; let mut suite = mock_endpoint(&cfg, None, ApiVersion::V1); suite.add_region(1, 100); - let quota = crate::channel::MemoryQuota::new(usize::MAX); - let (tx, mut rx) = channel::channel(1, quota); + let quota = Arc::new(MemoryQuota::new(usize::MAX)); + let (tx, mut rx) = channel::channel(ConnId::default(), 1, quota); let mut rx = rx.drain(); - let conn = Conn::new(tx, String::new()); + let conn = Conn::new(ConnId::default(), tx, String::new()); let conn_id = conn.get_id(); suite.run(Task::OpenConn { conn }); @@ -2834,5 +2967,106 @@ mod tests { })); assert_eq!(suite.connections[&conn_id].downstreams_count(), 0); assert_eq!(suite.capture_regions.len(), 0); + for _ in 0..2 { + let cdc_event = channel::recv_timeout(&mut rx, Duration::from_millis(500)) + .unwrap() + .unwrap(); + let check = matches!(cdc_event.0, CdcEvent::Event(e) if { + matches!(e.event, Some(Event_oneof_event::Error(ref err)) if { + err.has_region_not_found() + }) + }); + assert!(check); + } + + // Resubscribe the region. + suite.add_region(2, 100); + for i in 1..=2 { + req.set_request_id(1); + req.set_region_id(i); + let downstream = Downstream::new( + "".to_string(), + region_epoch.clone(), + 1, + conn_id, + ChangeDataRequestKvApi::TiDb, + false, + ObservedRange::default(), + ); + suite.run(Task::Register { + request: req.clone(), + downstream, + conn_id, + }); + assert_eq!(suite.connections[&conn_id].downstreams_count(), i as usize); + } + + // Deregister regions one by one in the request. 
+ suite.run(Task::Deregister(Deregister::Region { + conn_id, + request_id: 1, + region_id: 1, + })); + assert_eq!(suite.connections[&conn_id].downstreams_count(), 1); + assert_eq!(suite.capture_regions.len(), 1); + + suite.run(Task::Deregister(Deregister::Region { + conn_id, + request_id: 1, + region_id: 2, + })); + assert_eq!(suite.connections[&conn_id].downstreams_count(), 0); + assert_eq!(suite.capture_regions.len(), 0); + + for _ in 0..2 { + let cdc_event = channel::recv_timeout(&mut rx, Duration::from_millis(500)) + .unwrap() + .unwrap(); + let check = matches!(cdc_event.0, CdcEvent::Event(e) if { + matches!(e.event, Some(Event_oneof_event::Error(ref err)) if { + err.has_region_not_found() + }) + }); + assert!(check); + } + } + + #[test] + fn test_register_after_connection_deregistered() { + let cfg = CdcConfig { + min_ts_interval: ReadableDuration(Duration::from_secs(60)), + ..Default::default() + }; + let mut suite = mock_endpoint(&cfg, None, ApiVersion::V1); + suite.add_region(1, 100); + let quota = Arc::new(MemoryQuota::new(usize::MAX)); + let (tx, _rx) = channel::channel(ConnId::default(), 1, quota); + + let conn = Conn::new(ConnId::default(), tx, String::new()); + let conn_id = conn.get_id(); + suite.run(Task::OpenConn { conn }); + + suite.run(Task::Deregister(Deregister::Conn(conn_id))); + + let mut req = ChangeDataRequest::default(); + + req.set_region_id(1); + req.set_request_id(1); + let region_epoch = req.get_region_epoch().clone(); + let downstream = Downstream::new( + "".to_string(), + region_epoch, + 1, + conn_id, + ChangeDataRequestKvApi::TiDb, + false, + ObservedRange::default(), + ); + suite.run(Task::Register { + request: req, + downstream, + conn_id, + }); + assert!(suite.connections.is_empty()); } } diff --git a/components/cdc/src/errors.rs b/components/cdc/src/errors.rs index c9a61c73dc4..e7bd7605e7d 100644 --- a/components/cdc/src/errors.rs +++ b/components/cdc/src/errors.rs @@ -10,6 +10,7 @@ use tikv::storage::{ mvcc::{Error as MvccError, ErrorInner as MvccErrorInner}, txn::{Error as TxnError, ErrorInner as TxnErrorInner}, }; +use tikv_util::memory::MemoryQuotaExceeded; use txn_types::Error as TxnTypesError; use crate::channel::SendError; @@ -35,6 +36,8 @@ pub enum Error { EngineTraits(#[from] EngineTraitsError), #[error("Sink send error {0:?}")] Sink(#[from] SendError), + #[error("Memory quota exceeded")] + MemoryQuotaExceeded(#[from] MemoryQuotaExceeded), } macro_rules! impl_from { diff --git a/components/cdc/src/initializer.rs b/components/cdc/src/initializer.rs index 2c0884bb303..7222f51ecf1 100644 --- a/components/cdc/src/initializer.rs +++ b/components/cdc/src/initializer.rs @@ -1,5 +1,8 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. -use std::sync::Arc; +use std::{ + sync::{atomic::AtomicBool, Arc}, + time::Duration, +}; use api_version::ApiV2; use crossbeam::atomic::AtomicCell; @@ -23,7 +26,7 @@ use raftstore::{ msg::{Callback, ReadResponse}, }, }; -use resolved_ts::Resolver; +use resolved_ts::{Resolver, TsSource}; use tikv::storage::{ kv::Snapshot, mvcc::{DeltaScanner, ScannerBuilder}, @@ -35,9 +38,10 @@ use tikv_kv::Iterator; use tikv_util::{ box_err, codec::number, - debug, error, info, + debug, defer, error, info, + memory::MemoryQuota, sys::inspector::{self_thread_inspector, ThreadInspector}, - time::{Instant, Limiter}, + time::{duration_to_sec, Instant, Limiter}, warn, worker::Scheduler, Either, @@ -55,15 +59,6 @@ use crate::{ Error, Result, Task, }; -struct ScanStat { - // Fetched bytes to the scanner. 
- emit: usize, - // Bytes from the device, `None` if not possible to get it. - disk_read: Option, - // Perf delta for RocksDB. - perf_delta: ReadPerfContext, -} - pub(crate) enum KvEntry { TxnEntry(TxnEntry), RawKvEntry(KvPair), @@ -85,11 +80,14 @@ pub(crate) struct Initializer { pub(crate) observe_id: ObserveId, pub(crate) downstream_id: DownstreamId, pub(crate) downstream_state: Arc>, + pub(crate) scan_truncated: Arc, pub(crate) conn_id: ConnId, pub(crate) request_id: u64, pub(crate) checkpoint_ts: TimeStamp, - pub(crate) speed_limiter: Limiter, + pub(crate) scan_speed_limiter: Limiter, + pub(crate) fetch_speed_limiter: Limiter, + pub(crate) max_scan_batch_bytes: usize, pub(crate) max_scan_batch_size: usize, @@ -107,29 +105,11 @@ impl Initializer { change_observer: ChangeObserver, cdc_handle: T, concurrency_semaphore: Arc, - ) -> Result<()> { + memory_quota: Arc, + ) -> Result { fail_point!("cdc_before_initialize"); let _permit = concurrency_semaphore.acquire().await; - // When downstream_state is Stopped, it means the corresponding delegate - // is stopped. The initialization can be safely canceled. - // - // Acquiring a permit may take some time, it is possible that - // initialization can be canceled. - if self.downstream_state.load() == DownstreamState::Stopped { - info!("cdc async incremental scan canceled"; - "region_id" => self.region_id, - "downstream_id" => ?self.downstream_id, - "observe_id" => ?self.observe_id, - "conn_id" => ?self.conn_id); - return Err(box_err!("scan canceled")); - } - - CDC_SCAN_TASKS.with_label_values(&["ongoing"]).inc(); - tikv_util::defer!({ - CDC_SCAN_TASKS.with_label_values(&["ongoing"]).dec(); - }); - // To avoid holding too many snapshots and holding them too long, // we need to acquire scan concurrency permit before taking snapshot. 
let sched = self.sched.clone(); @@ -172,7 +152,7 @@ impl Initializer { } match fut.await { - Ok(resp) => self.on_change_cmd_response(resp).await, + Ok(resp) => self.on_change_cmd_response(resp, memory_quota).await, Err(e) => Err(Error::Other(box_err!(e))), } } @@ -180,11 +160,13 @@ impl Initializer { pub(crate) async fn on_change_cmd_response( &mut self, mut resp: ReadResponse, - ) -> Result<()> { + memory_quota: Arc, + ) -> Result { if let Some(region_snapshot) = resp.snapshot { - assert_eq!(self.region_id, region_snapshot.get_region().get_id()); let region = region_snapshot.get_region().clone(); - self.async_incremental_scan(region_snapshot, region).await + assert_eq!(self.region_id, region.get_id()); + self.async_incremental_scan(region_snapshot, region, memory_quota) + .await } else { assert!( resp.response.get_header().has_error(), @@ -200,11 +182,29 @@ impl Initializer { &mut self, snap: S, region: Region, - ) -> Result<()> { - let downstream_id = self.downstream_id; + memory_quota: Arc, + ) -> Result { + CDC_SCAN_TASKS.with_label_values(&["ongoing"]).inc(); + defer!(CDC_SCAN_TASKS.with_label_values(&["ongoing"]).dec()); + let region_id = region.get_id(); + let downstream_id = self.downstream_id; let observe_id = self.observe_id; + let conn_id = self.conn_id; let kv_api = self.kv_api; + let on_cancel = || -> Result { + info!("cdc async incremental scan canceled"; + "region_id" => region_id, + "downstream_id" => ?downstream_id, + "observe_id" => ?observe_id, + "conn_id" => ?conn_id); + Err(box_err!("scan canceled")) + }; + + if self.downstream_state.load() == DownstreamState::Stopped { + return on_cancel(); + } + self.observed_range.update_region_key_range(®ion); debug!("cdc async incremental scan"; "region_id" => region_id, @@ -215,7 +215,7 @@ impl Initializer { "end_key" => log_wrappers::Value::key(snap.upper_bound().unwrap_or_default())); let mut resolver = if self.build_resolver { - Some(Resolver::new(region_id)) + Some(Resolver::new(region_id, memory_quota)) } else { None }; @@ -253,9 +253,9 @@ impl Initializer { }; fail_point!("cdc_incremental_scan_start"); - let conn_id = self.conn_id; let mut done = false; let start = Instant::now_coarse(); + let mut sink_time = Duration::default(); let curr_state = self.downstream_state.load(); assert!(matches!( @@ -263,15 +263,7 @@ impl Initializer { DownstreamState::Initializing | DownstreamState::Stopped )); - let on_cancel = || -> Result<()> { - info!("cdc async incremental scan canceled"; - "region_id" => region_id, - "downstream_id" => ?downstream_id, - "observe_id" => ?observe_id, - "conn_id" => ?conn_id); - Err(box_err!("scan canceled")) - }; - + let mut stats = InitializeStats::default(); while !done { // When downstream_state is Stopped, it means the corresponding // delegate is stopped. The initialization can be safely canceled. @@ -280,14 +272,18 @@ impl Initializer { } let cursors = old_value_cursors.as_mut(); let resolver = resolver.as_mut(); - let entries = self.scan_batch(&mut scanner, cursors, resolver).await?; + let entries = self + .scan_batch(&mut scanner, cursors, resolver, &mut stats) + .await?; if let Some(None) = entries.last() { // If the last element is None, it means scanning is finished. 
done = true; } debug!("cdc scan entries"; "len" => entries.len(), "region_id" => region_id); fail_point!("before_schedule_incremental_scan"); + let start_sink = Instant::now_coarse(); self.sink_scan_events(entries, done).await?; + sink_time += start_sink.saturating_elapsed(); } fail_point!("before_post_incremental_scan"); @@ -307,7 +303,8 @@ impl Initializer { } CDC_SCAN_DURATION_HISTOGRAM.observe(takes.as_secs_f64()); - Ok(()) + CDC_SCAN_SINK_DURATION_HISTOGRAM.observe(duration_to_sec(sink_time)); + Ok(stats) } // It's extracted from `Initializer::scan_batch` to avoid becoming an @@ -318,7 +315,7 @@ impl Initializer { scanner: &mut Scanner, mut old_value_cursors: Option<&mut OldValueCursors>, entries: &mut Vec>, - ) -> Result { + ) -> Result { let mut read_old_value = |v: &mut OldValue, stats: &mut Statistics| -> Result<()> { let (wc, dc) = match old_value_cursors { Some(ref mut x) => (&mut x.write, &mut x.default), @@ -333,21 +330,28 @@ impl Initializer { Ok(()) }; + let mut stats = InitializeStats::default(); + // This code block shouldn't be switched to other threads. let mut total_bytes = 0; let mut total_size = 0; let perf_instant = ReadPerfInstant::new(); let inspector = self_thread_inspector().ok(); let old_io_stat = inspector.as_ref().and_then(|x| x.io_stat().unwrap_or(None)); - let mut stats = Statistics::default(); while total_bytes <= self.max_scan_batch_bytes && total_size < self.max_scan_batch_size { total_size += 1; match scanner { Scanner::TxnKvScanner(scanner) => match scanner.next_entry()? { Some(mut entry) => { - read_old_value(entry.old_value(), &mut stats)?; - total_bytes += entry.size(); - entries.push(Some(KvEntry::TxnEntry(entry))); + let key = match entry { + TxnEntry::Prewrite { ref lock, .. } => &lock.0, + TxnEntry::Commit { ref write, .. 
} => &write.0, + }; + if self.observed_range.contains_encoded_key(key) { + read_old_value(entry.old_value(), &mut stats.old_value)?; + total_bytes += entry.size(); + entries.push(Some(KvEntry::TxnEntry(entry))); + } } None => { entries.push(None); @@ -371,19 +375,17 @@ impl Initializer { } } } - flush_oldvalue_stats(&stats, TAG_INCREMENTAL_SCAN); + flush_oldvalue_stats(&stats.old_value, TAG_INCREMENTAL_SCAN); let new_io_stat = inspector.as_ref().and_then(|x| x.io_stat().unwrap_or(None)); - let disk_read = match (old_io_stat, new_io_stat) { + + stats.scan.emit = total_bytes; + stats.scan.disk_read = match (old_io_stat, new_io_stat) { (Some(s1), Some(s2)) => Some((s2.read - s1.read) as usize), _ => None, }; - let perf_delta = perf_instant.delta(); - let emit = total_bytes; - Ok(ScanStat { - emit, - disk_read, - perf_delta, - }) + stats.scan.perf_delta = perf_instant.delta(); + + Ok(stats) } async fn scan_batch( @@ -391,24 +393,22 @@ impl Initializer { scanner: &mut Scanner, old_value_cursors: Option<&mut OldValueCursors>, resolver: Option<&mut Resolver>, + stats: &mut InitializeStats, ) -> Result>> { let mut entries = Vec::with_capacity(self.max_scan_batch_size); - let ScanStat { - emit, - disk_read, - perf_delta, - } = self.do_scan(scanner, old_value_cursors, &mut entries)?; - - CDC_SCAN_BYTES.inc_by(emit as _); - TLS_CDC_PERF_STATS.with(|x| *x.borrow_mut() += perf_delta); + let delta_stats = self.do_scan(scanner, old_value_cursors, &mut entries)?; + stats.add(&delta_stats); + + TLS_CDC_PERF_STATS.with(|x| *x.borrow_mut() += delta_stats.scan.perf_delta); tls_flush_perf_stats(); - let require = if let Some(bytes) = disk_read { + if let Some(bytes) = delta_stats.scan.disk_read { CDC_SCAN_DISK_READ_BYTES.inc_by(bytes as _); - bytes - } else { - perf_delta.block_read_byte as usize - }; - self.speed_limiter.consume(require).await; + self.scan_speed_limiter.consume(bytes).await; + } + CDC_SCAN_BYTES.inc_by(delta_stats.scan.emit as _); + self.fetch_speed_limiter + .consume(delta_stats.scan.emit as _) + .await; if let Some(resolver) = resolver { // Track the locks. 
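[Editorial note] For readers tracing the throttling change in the hunk above: `scan_batch` now charges reported physical reads to the scan limiter, and always charges emitted entry bytes to the fetch limiter. A rough stand-alone illustration of that accounting split (TiKV's real `Limiter::consume` is async and blocks the scanner when a budget is exhausted; this toy version only records the charge):

```rust
// Sketch only: mirrors the charging rule in scan_batch above, with TiKV's
// async Limiter reduced to a synchronous byte counter.
#[derive(Default)]
struct Budget {
    consumed: usize,
}

impl Budget {
    fn consume(&mut self, bytes: usize) {
        self.consumed += bytes; // the real Limiter awaits when over the limit
    }
}

fn charge(scan: &mut Budget, fetch: &mut Budget, disk_read: Option<usize>, emit: usize) {
    // Charge the scan limiter only when the thread inspector reported
    // actual disk reads for this batch.
    if let Some(bytes) = disk_read {
        scan.consume(bytes);
    }
    // Emitted entry bytes are always charged, even for fully cached batches.
    fetch.consume(emit);
}

fn main() {
    let (mut scan, mut fetch) = (Budget::default(), Budget::default());
    charge(&mut scan, &mut fetch, None, 4096); // cache hit: only fetch pays
    charge(&mut scan, &mut fetch, Some(8192), 4096); // cold read: both pay
    assert_eq!((scan.consumed, fetch.consumed), (8192, 8192));
}
```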
@@ -418,7 +418,9 @@ impl Initializer { let key = Key::from_encoded_slice(encoded_key).into_raw().unwrap(); let lock = Lock::parse(value)?; match lock.lock_type { - LockType::Put | LockType::Delete => resolver.track_lock(lock.ts, key, None), + LockType::Put | LockType::Delete => { + resolver.track_lock(lock.ts, key, None)?; + } _ => (), }; } @@ -434,14 +436,17 @@ impl Initializer { self.request_id, entries, self.filter_loop, - &self.observed_range, )?; if done { let (cb, fut) = tikv_util::future::paired_future_callback(); events.push(CdcEvent::Barrier(Some(cb))); barrier = Some(fut); } - if let Err(e) = self.sink.send_all(events).await { + if let Err(e) = self + .sink + .send_all(events, self.scan_truncated.clone()) + .await + { error!("cdc send scan event failed"; "req_id" => ?self.request_id); return Err(Error::Sink(e)); } @@ -458,7 +463,7 @@ impl Initializer { fn finish_building_resolver(&self, mut resolver: Resolver, region: Region) { let observe_id = self.observe_id; - let rts = resolver.resolve(TimeStamp::zero(), None); + let rts = resolver.resolve(TimeStamp::zero(), None, TsSource::Cdc); info!( "cdc resolver initialized and schedule resolver ready"; "region_id" => region.get_id(), @@ -508,6 +513,8 @@ impl Initializer { } fn ts_filter_is_helpful(&self, snap: &S) -> bool { + fail_point!("ts_filter_is_helpful_always_true", |_| true); + if self.ts_filter_ratio < f64::EPSILON { return false; } @@ -559,17 +566,46 @@ impl Initializer { } } +#[derive(Default, Debug)] +pub struct InitializeStats { + pub old_value: Statistics, + pub scan: ScanStats, +} + +#[derive(Default, Debug)] +pub struct ScanStats { + // Fetched bytes to the scanner. + emit: usize, + // Bytes from the device, `None` if not possible to get it. + disk_read: Option, + // Perf delta for RocksDB. 
+ perf_delta: ReadPerfContext, +} + +impl InitializeStats { + fn add(&mut self, other: &InitializeStats) { + self.old_value.add(&other.old_value); + self.scan.emit += other.scan.emit; + if let Some(x) = self.scan.disk_read.as_mut() { + *x += other.scan.disk_read.unwrap_or_default(); + } + self.scan.perf_delta += other.scan.perf_delta; + } +} + #[cfg(test)] mod tests { use std::{ collections::BTreeMap, fmt::Display, - sync::mpsc::{channel, sync_channel, Receiver, RecvTimeoutError, Sender}, + sync::{ + mpsc::{channel, sync_channel, Receiver, RecvTimeoutError, Sender}, + Arc, + }, time::Duration, }; - use collections::HashSet; - use engine_rocks::RocksEngine; + use engine_rocks::{BlobRunMode, RocksEngine}; use engine_traits::{MiscExt, CF_WRITE}; use futures::{executor::block_on, StreamExt}; use kvproto::{ @@ -577,16 +613,22 @@ mod tests { errorpb::Error as ErrorHeader, }; use raftstore::{coprocessor::ObserveHandle, router::CdcRaftRouter, store::RegionSnapshot}; + use resolved_ts::TxnLocks; use test_raftstore::MockRaftStoreRouter; - use tikv::storage::{ - kv::Engine, - txn::tests::{ - must_acquire_pessimistic_lock, must_commit, must_prewrite_delete, must_prewrite_put, - must_prewrite_put_with_txn_soucre, + use tikv::{ + config::DbConfig, + storage::{ + kv::Engine, + txn::tests::{ + must_acquire_pessimistic_lock, must_commit, must_prewrite_delete, + must_prewrite_put, must_prewrite_put_with_txn_soucre, + }, + TestEngineBuilder, }, - TestEngineBuilder, }; use tikv_util::{ + config::ReadableSize, + memory::MemoryQuota, sys::thread::ThreadBuildWrapper, worker::{LazyWorker, Runnable}, }; @@ -616,7 +658,8 @@ mod tests { } fn mock_initializer( - speed_limit: usize, + scan_limit: usize, + fetch_limit: usize, buffer: usize, engine: Option, kv_api: ChangeDataRequestKvApi, @@ -629,8 +672,8 @@ mod tests { crate::channel::Drain, ) { let (receiver_worker, rx) = new_receiver_worker(); - let quota = crate::channel::MemoryQuota::new(usize::MAX); - let (sink, drain) = crate::channel::channel(buffer, quota); + let quota = Arc::new(MemoryQuota::new(usize::MAX)); + let (sink, drain) = crate::channel::channel(ConnId::default(), buffer, quota); let pool = Builder::new_multi_thread() .thread_name("test-initializer-worker") @@ -654,10 +697,12 @@ mod tests { observe_id: ObserveId::new(), downstream_id: DownstreamId::new(), downstream_state, + scan_truncated: Arc::new(Default::default()), conn_id: ConnId::new(), request_id: 0, checkpoint_ts: 1.into(), - speed_limiter: Limiter::new(speed_limit as _), + scan_speed_limiter: Limiter::new(scan_limit as _), + fetch_speed_limiter: Limiter::new(fetch_limit as _), max_scan_batch_bytes: 1024 * 1024, max_scan_batch_size: 1024, build_resolver: true, @@ -673,7 +718,7 @@ mod tests { fn test_initializer_build_resolver() { let mut engine = TestEngineBuilder::new().build_without_cache().unwrap(); - let mut expected_locks = BTreeMap::>>::new(); + let mut expected_locks = BTreeMap::::new(); // Only observe ["", "b\0x90"] let observed_range = ObservedRange::new( @@ -696,10 +741,14 @@ mod tests { total_bytes += v.len(); let ts = TimeStamp::new(i as _); must_prewrite_put(&mut engine, k, v, k, ts); - expected_locks - .entry(ts) - .or_default() - .insert(k.to_vec().into()); + if i < 90 { + let txn_locks = expected_locks.entry(ts).or_insert_with(|| { + let mut txn_locks = TxnLocks::default(); + txn_locks.sample_lock = Some(k.to_vec().into()); + txn_locks + }); + txn_locks.lock_count += 1; + } } let region = Region::default(); @@ -707,6 +756,7 @@ mod tests { // Buffer must be large enough to 
unblock async incremental scan. let buffer = 1000; let (mut worker, pool, mut initializer, rx, mut drain) = mock_initializer( + total_bytes, total_bytes, buffer, engine.kv_engine(), @@ -737,21 +787,37 @@ mod tests { } }); - block_on(initializer.async_incremental_scan(snap.clone(), region.clone())).unwrap(); + let memory_quota = Arc::new(MemoryQuota::new(usize::MAX)); + block_on(initializer.async_incremental_scan( + snap.clone(), + region.clone(), + memory_quota.clone(), + )) + .unwrap(); check_result(); initializer .downstream_state .store(DownstreamState::Initializing); initializer.max_scan_batch_bytes = total_bytes; - block_on(initializer.async_incremental_scan(snap.clone(), region.clone())).unwrap(); + block_on(initializer.async_incremental_scan( + snap.clone(), + region.clone(), + memory_quota.clone(), + )) + .unwrap(); check_result(); initializer .downstream_state .store(DownstreamState::Initializing); initializer.build_resolver = false; - block_on(initializer.async_incremental_scan(snap.clone(), region.clone())).unwrap(); + block_on(initializer.async_incremental_scan( + snap.clone(), + region.clone(), + memory_quota.clone(), + )) + .unwrap(); loop { let task = rx.recv_timeout(Duration::from_millis(100)); @@ -764,7 +830,8 @@ mod tests { // Test cancellation. initializer.downstream_state.store(DownstreamState::Stopped); - block_on(initializer.async_incremental_scan(snap.clone(), region)).unwrap_err(); + block_on(initializer.async_incremental_scan(snap.clone(), region, memory_quota.clone())) + .unwrap_err(); // Cancel error should trigger a deregsiter. let mut region = Region::default(); @@ -776,14 +843,15 @@ mod tests { response: Default::default(), txn_extra_op: Default::default(), }; - block_on(initializer.on_change_cmd_response(resp.clone())).unwrap_err(); + block_on(initializer.on_change_cmd_response(resp.clone(), memory_quota.clone())) + .unwrap_err(); // Disconnect sink by dropping runtime (it also drops drain). drop(pool); initializer .downstream_state .store(DownstreamState::Initializing); - block_on(initializer.on_change_cmd_response(resp)).unwrap_err(); + block_on(initializer.on_change_cmd_response(resp, memory_quota)).unwrap_err(); worker.stop(); } @@ -804,6 +872,7 @@ mod tests { // Buffer must be large enough to unblock async incremental scan. let buffer = 1000; let (mut worker, pool, mut initializer, _rx, mut drain) = mock_initializer( + total_bytes, total_bytes, buffer, engine.kv_engine(), @@ -811,8 +880,9 @@ mod tests { filter_loop, ); let th = pool.spawn(async move { + let memory_quota = Arc::new(MemoryQuota::new(usize::MAX)); initializer - .async_incremental_scan(snap, Region::default()) + .async_incremental_scan(snap, Region::default(), memory_quota) .await .unwrap(); }); @@ -885,6 +955,7 @@ mod tests { // Do incremental scan with different `hint_min_ts` values. 
 for checkpoint_ts in [200, 100, 150] {
             let (mut worker, pool, mut initializer, _rx, mut drain) = mock_initializer(
+                usize::MAX,
                 usize::MAX,
                 1000,
                 engine.kv_engine(),
@@ -896,8 +967,9 @@ mod tests {
             let snap = engine.snapshot(Default::default()).unwrap();
 
             let th = pool.spawn(async move {
+                let memory_quota = Arc::new(MemoryQuota::new(usize::MAX));
                 initializer
-                    .async_incremental_scan(snap, Region::default())
+                    .async_incremental_scan(snap, Region::default(), memory_quota)
                     .await
                     .unwrap();
             });
@@ -949,6 +1021,7 @@ mod tests {
         let total_bytes = 1;
         let buffer = 1;
         let (mut worker, _pool, mut initializer, rx, _drain) = mock_initializer(
+            total_bytes,
             total_bytes,
             buffer,
             None,
@@ -1004,17 +1077,19 @@ mod tests {
         let total_bytes = 1;
         let buffer = 1;
         let (mut worker, pool, mut initializer, _rx, _drain) =
-            mock_initializer(total_bytes, buffer, None, kv_api, false);
+            mock_initializer(total_bytes, total_bytes, buffer, None, kv_api, false);
 
         let change_cmd = ChangeObserver::from_cdc(1, ObserveHandle::new());
         let raft_router = CdcRaftRouter(MockRaftStoreRouter::new());
         let concurrency_semaphore = Arc::new(Semaphore::new(1));
+        let memory_quota = Arc::new(MemoryQuota::new(usize::MAX));
 
         initializer.downstream_state.store(DownstreamState::Stopped);
         block_on(initializer.initialize(
             change_cmd,
             raft_router.clone(),
             concurrency_semaphore.clone(),
+            memory_quota.clone(),
         ))
         .unwrap_err();
@@ -1040,7 +1115,7 @@ mod tests {
                 &concurrency_semaphore,
             );
             let res = initializer
-                .initialize(change_cmd, raft_router, concurrency_semaphore)
+                .initialize(change_cmd, raft_router, concurrency_semaphore, memory_quota)
                 .await;
             tx1.send(res).unwrap();
         });
@@ -1054,4 +1129,57 @@ mod tests {
 
         worker.stop();
     }
+
+    #[test]
+    fn test_scanner_with_titan() {
+        let mut cfg = DbConfig::default();
+        cfg.titan.enabled = true;
+        cfg.defaultcf.titan.blob_run_mode = BlobRunMode::Normal;
+        cfg.defaultcf.titan.min_blob_size = ReadableSize(0);
+        cfg.writecf.titan.blob_run_mode = BlobRunMode::Normal;
+        cfg.writecf.titan.min_blob_size = ReadableSize(0);
+        cfg.lockcf.titan.blob_run_mode = BlobRunMode::Normal;
+        cfg.lockcf.titan.min_blob_size = ReadableSize(0);
+        let mut engine = TestEngineBuilder::new().build_with_cfg(&cfg).unwrap();
+
+        must_prewrite_put(&mut engine, b"zkey", b"value", b"zkey", 100);
+        must_commit(&mut engine, b"zkey", 100, 110);
+        for cf in &[CF_WRITE, CF_DEFAULT] {
+            engine.kv_engine().unwrap().flush_cf(cf, true).unwrap();
+        }
+        must_prewrite_put(&mut engine, b"zkey", b"value", b"zkey", 150);
+        must_commit(&mut engine, b"zkey", 150, 160);
+        for cf in &[CF_WRITE, CF_DEFAULT] {
+            engine.kv_engine().unwrap().flush_cf(cf, true).unwrap();
+        }
+
+        let (mut worker, pool, mut initializer, _rx, mut drain) = mock_initializer(
+            usize::MAX,
+            usize::MAX,
+            1000,
+            engine.kv_engine(),
+            ChangeDataRequestKvApi::TiDb,
+            false,
+        );
+        initializer.checkpoint_ts = 120.into();
+        let snap = engine.snapshot(Default::default()).unwrap();
+
+        let th = pool.spawn(async move {
+            let memory_quota = Arc::new(MemoryQuota::new(usize::MAX));
+            initializer
+                .async_incremental_scan(snap, Region::default(), memory_quota)
+                .await
+                .unwrap();
+        });
+
+        let mut total_entries = 0;
+        while let Some((event, _)) = block_on(drain.drain().next()) {
+            if let CdcEvent::Event(e) = event {
+                total_entries += e.get_entries().get_entries().len();
+            }
+        }
+        assert_eq!(total_entries, 2);
+        block_on(th).unwrap();
+        worker.stop();
+    }
 }
diff --git a/components/cdc/src/lib.rs b/components/cdc/src/lib.rs
index c913cefb92e..64f110f5c45 100644
---
a/components/cdc/src/lib.rs +++ b/components/cdc/src/lib.rs @@ -15,7 +15,7 @@ mod old_value; mod service; mod txn_source; -pub use channel::{recv_timeout, CdcEvent, MemoryQuota}; +pub use channel::{recv_timeout, CdcEvent}; pub use config::CdcConfigManager; pub use delegate::Delegate; pub use endpoint::{CdcTxnExtraScheduler, Endpoint, Task, Validate}; diff --git a/components/cdc/src/metrics.rs b/components/cdc/src/metrics.rs index 5db91572112..6bef4313959 100644 --- a/components/cdc/src/metrics.rs +++ b/components/cdc/src/metrics.rs @@ -88,6 +88,11 @@ lazy_static! { exponential_buckets(0.005, 2.0, 20).unwrap() ) .unwrap(); + pub static ref CDC_SCAN_SINK_DURATION_HISTOGRAM: Histogram = register_histogram!( + "tikv_cdc_scan_sink_duration_seconds", + "Bucketed histogram of cdc async scan sink time duration", + ) + .unwrap(); pub static ref CDC_SCAN_BYTES: IntCounter = register_int_counter!( "tikv_cdc_scan_bytes_total", "Total fetched bytes of CDC incremental scan" @@ -214,6 +219,13 @@ lazy_static! { pub static ref CDC_ROCKSDB_PERF_COUNTER_STATIC: PerfCounter = auto_flush_from!(CDC_ROCKSDB_PERF_COUNTER, PerfCounter); + + pub static ref CDC_EVENTS_PENDING_DURATION: Histogram = register_histogram!( + "tikv_cdc_events_pending_duration", + "Pending duration for all events, in milliseconds", + exponential_buckets(0.01, 2.0, 17).unwrap(), + ) + .unwrap(); } thread_local! { diff --git a/components/cdc/src/observer.rs b/components/cdc/src/observer.rs index aac2842e404..a6586f60765 100644 --- a/components/cdc/src/observer.rs +++ b/components/cdc/src/observer.rs @@ -119,8 +119,6 @@ impl CmdObserver for CdcObserver { let mut region = Region::default(); region.mut_peers().push(Peer::default()); // Create a snapshot here for preventing the old value was GC-ed. - // TODO: only need it after enabling old value, may add a flag to indicate - // whether to get it. let snapshot = RegionSnapshot::from_snapshot(Arc::new(engine.snapshot()), Arc::new(region)); let get_old_value = move |key, query_ts, @@ -177,20 +175,26 @@ impl RegionChangeObserver for CdcObserver { event: RegionChangeEvent, _: StateRole, ) { - if let RegionChangeEvent::Destroy = event { - let region_id = ctx.region().get_id(); - if let Some(observe_id) = self.is_subscribed(region_id) { - // Unregister all downstreams. - let store_err = RaftStoreError::RegionNotFound(region_id); - let deregister = Deregister::Delegate { - region_id, - observe_id, - err: CdcError::request(store_err.into()), - }; - if let Err(e) = self.sched.schedule(Task::Deregister(deregister)) { - error!("cdc schedule cdc task failed"; "error" => ?e); + match event { + RegionChangeEvent::Destroy + | RegionChangeEvent::Update( + RegionChangeReason::Split | RegionChangeReason::CommitMerge, + ) => { + let region_id = ctx.region().get_id(); + if let Some(observe_id) = self.is_subscribed(region_id) { + // Unregister all downstreams. 
+                let store_err = RaftStoreError::RegionNotFound(region_id);
+                let deregister = Deregister::Delegate {
+                    region_id,
+                    observe_id,
+                    err: CdcError::request(store_err.into()),
+                };
+                if let Err(e) = self.sched.schedule(Task::Deregister(deregister)) {
+                    error!("cdc schedule cdc task failed"; "error" => ?e);
+                }
             }
         }
+            _ => {}
         }
     }
 }
diff --git a/components/cdc/src/service.rs b/components/cdc/src/service.rs
index d07b5283380..8a2f644de3a 100644
--- a/components/cdc/src/service.rs
+++ b/components/cdc/src/service.rs
@@ -16,10 +16,10 @@ use kvproto::{
     },
     kvrpcpb::ApiVersion,
 };
-use tikv_util::{error, info, warn, worker::*};
+use tikv_util::{error, info, memory::MemoryQuota, warn, worker::*};
 
 use crate::{
-    channel::{channel, MemoryQuota, Sink, CDC_CHANNLE_CAPACITY},
+    channel::{channel, Sink, CDC_CHANNLE_CAPACITY},
     delegate::{Downstream, DownstreamId, DownstreamState, ObservedRange},
     endpoint::{Deregister, Task},
 };
@@ -100,9 +100,9 @@ struct DownstreamValue {
 }
 
 impl Conn {
-    pub fn new(sink: Sink, peer: String) -> Conn {
+    pub fn new(conn_id: ConnId, sink: Sink, peer: String) -> Conn {
         Conn {
-            id: ConnId::new(),
+            id: conn_id,
             sink,
             downstreams: HashMap::default(),
             peer,
@@ -244,14 +244,14 @@ impl EventFeedHeaders {
 #[derive(Clone)]
 pub struct Service {
     scheduler: Scheduler,
-    memory_quota: MemoryQuota,
+    memory_quota: Arc,
 }
 
 impl Service {
     /// Create a ChangeData service.
     ///
     /// It requires a scheduler of an `Endpoint` in order to schedule tasks.
-    pub fn new(scheduler: Scheduler, memory_quota: MemoryQuota) -> Service {
+    pub fn new(scheduler: Scheduler, memory_quota: Arc) -> Service {
         Service {
             scheduler,
             memory_quota,
@@ -304,6 +313,13 @@ impl Service {
         scheduler.schedule(task).map_err(|e| format!("{:?}", e))
     }
 
+    // ### Command types:
+    // * Register registers a region. 1) both `request_id` and `region_id` must be
+    //   specified; 2) `request_id` can be 0 but `region_id` cannot.
+    // * Deregister deregisters all regions subscribed with one `request_id`, or
+    //   just one region. 1) if both `request_id` and `region_id` are specified,
+    //   just deregister the region; 2) if only `request_id` is specified, all
+    //   region subscriptions with the same `request_id` will be deregistered.
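[Editorial note] The comment block above is the contract that `handle_deregister`, shown a little further down in this file, implements. A condensed sketch of just the dispatch rule, with the task and ID types stubbed out since only their shape matters here:

```rust
// Sketch only: the deregister dispatch rule documented above. The real types
// live in cdc::endpoint; these stubs merely mirror their shape.
#[derive(Debug, PartialEq)]
enum Deregister {
    // Drop a single region subscribed under the request.
    Region { request_id: u64, region_id: u64 },
    // Drop every region subscribed under the request.
    Request { request_id: u64 },
}

fn deregister_for(request_id: u64, region_id: u64) -> Deregister {
    if region_id != 0 {
        Deregister::Region { request_id, region_id }
    } else {
        Deregister::Request { request_id }
    }
}

fn main() {
    assert_eq!(
        deregister_for(7, 3),
        Deregister::Region { request_id: 7, region_id: 3 }
    );
    assert_eq!(deregister_for(7, 0), Deregister::Request { request_id: 7 });
}
```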
     fn handle_request(
         scheduler: &Scheduler,
         peer: &str,
         request: ChangeDataRequest,
         conn_id: ConnId,
     ) -> Result<(), String> {
-        let observed_range =
-            match ObservedRange::new(request.start_key.clone(), request.end_key.clone()) {
-                Ok(observed_range) => observed_range,
-                Err(e) => {
-                    warn!(
-                        "cdc invalid observed start key or end key version";
-                        "downstream" => ?peer, "region_id" => request.region_id,
-                        "error" => ?e,
-                    );
-                    ObservedRange::default()
-                }
-            };
+        let observed_range = ObservedRange::new(request.start_key.clone(), request.end_key.clone())
+            .unwrap_or_else(|e| {
+                warn!(
+                    "cdc invalid observed start key or end key version";
+                    "downstream" => ?peer,
+                    "region_id" => request.region_id,
+                    "request_id" => request.request_id,
+                    "error" => ?e,
+                    "start_key" => log_wrappers::Value::key(&request.start_key),
+                    "end_key" => log_wrappers::Value::key(&request.end_key),
+                );
+                ObservedRange::default()
+            });
         let downstream = Downstream::new(
             peer.to_owned(),
             request.get_region_epoch().clone(),
@@ -361,10 +369,18 @@ impl Service {
         request: ChangeDataRequest,
         conn_id: ConnId,
     ) -> Result<(), String> {
-        let task = Task::Deregister(Deregister::Request {
-            conn_id,
-            request_id: request.request_id,
-        });
+        let task = if request.region_id != 0 {
+            Task::Deregister(Deregister::Region {
+                conn_id,
+                request_id: request.request_id,
+                region_id: request.region_id,
+            })
+        } else {
+            Task::Deregister(Deregister::Request {
+                conn_id,
+                request_id: request.request_id,
+            })
+        };
         scheduler.schedule(task).map_err(|e| format!("{:?}", e))
     }
@@ -390,10 +406,10 @@ impl Service {
         event_feed_v2: bool,
     ) {
         sink.enhance_batch(true);
+        let conn_id = ConnId::new();
         let (event_sink, mut event_drain) =
-            channel(CDC_CHANNLE_CAPACITY, self.memory_quota.clone());
-        let conn = Conn::new(event_sink, ctx.peer());
-        let conn_id = conn.get_id();
+            channel(conn_id, CDC_CHANNLE_CAPACITY, self.memory_quota.clone());
+        let conn = Conn::new(conn_id, event_sink, ctx.peer());
 
         let mut explicit_features = vec![];
         if event_feed_v2 {
@@ -518,7 +534,7 @@ mod tests {
     use crate::channel::{recv_timeout, CdcEvent};
 
     fn new_rpc_suite(capacity: usize) -> (Server, ChangeDataClient, ReceiverWrapper) {
-        let memory_quota = MemoryQuota::new(capacity);
+        let memory_quota = Arc::new(MemoryQuota::new(capacity));
         let (scheduler, rx) = dummy_scheduler();
         let cdc_service = Service::new(scheduler, memory_quota);
         let env = Arc::new(EnvBuilder::new().build());
@@ -560,7 +576,14 @@ mod tests {
         let send = || {
             let rts_ = rts.clone();
             let mut sink_ = sink.clone();
-            Box::pin(async move { sink_.send_all(vec![CdcEvent::ResolvedTs(rts_)]).await })
+            Box::pin(async move {
+                sink_
+                    .send_all(
+                        vec![CdcEvent::ResolvedTs(rts_)],
+                        Arc::new(Default::default()),
+                    )
+                    .await
+            })
         };
         let must_fill_window = || {
             let mut window_size = 0;
diff --git a/components/cdc/tests/failpoints/mod.rs b/components/cdc/tests/failpoints/mod.rs
index 082b1c15f67..619ee200985 100644
--- a/components/cdc/tests/failpoints/mod.rs
+++ b/components/cdc/tests/failpoints/mod.rs
@@ -4,6 +4,7 @@
 #![test_runner(test_util::run_failpoint_tests)]
 
 mod test_endpoint;
+mod test_memory_quota;
 mod test_observe;
 mod test_register;
 mod test_resolve;
diff --git a/components/cdc/tests/failpoints/test_endpoint.rs b/components/cdc/tests/failpoints/test_endpoint.rs
index f7cc387625d..fae542a45d0 100644
--- a/components/cdc/tests/failpoints/test_endpoint.rs
+++ b/components/cdc/tests/failpoints/test_endpoint.rs
@@ -9,13 +9,17 @@ use std::{
 use api_version::{test_kv_format_impl, KvFormat};
 use causal_ts::CausalTsProvider;
 use cdc::{recv_timeout, Delegate, OldValueCache, Task, Validate};
+use engine_traits::{
+    IterOptions, Iterable, Iterator, MiscExt, Mutable, WriteBatch, WriteBatchExt, WriteOptions,
+    CF_DEFAULT, CF_WRITE,
+};
 use futures::{executor::block_on, sink::SinkExt};
 use grpcio::{ChannelBuilder, Environment, WriteFlags};
 use kvproto::{cdcpb::*, kvrpcpb::*, tikvpb_grpc::TikvClient};
 use pd_client::PdClient;
 use test_raftstore::*;
-use tikv_util::{debug, worker::Scheduler, HandyRwLock};
-use txn_types::TimeStamp;
+use tikv_util::{debug, keybuilder::KeyBuilder, worker::Scheduler, HandyRwLock};
+use txn_types::{Key, TimeStamp};
 
 use crate::{new_event_feed, new_event_feed_v2, ClientReceiver, TestSuite, TestSuiteBuilder};
@@ -569,3 +573,179 @@ fn test_cdc_stream_multiplexing() {
     }
     assert!(request_2_ready);
 }
+
+// This case tests that pending regions can still get region split/merge
+// notifications.
+#[test]
+fn test_cdc_notify_pending_regions() {
+    let cluster = new_server_cluster(0, 1);
+    cluster.pd_client.disable_default_operator();
+    let mut suite = TestSuiteBuilder::new().cluster(cluster).build();
+    let region = suite.cluster.get_region(&[]);
+    let rid = region.id;
+    let (mut req_tx, _, receive_event) = new_event_feed_v2(suite.get_region_cdc_client(rid));
+
+    fail::cfg("cdc_before_initialize", "pause").unwrap();
+    let mut req = suite.new_changedata_request(rid);
+    req.request_id = 1;
+    block_on(req_tx.send((req, WriteFlags::default()))).unwrap();
+
+    thread::sleep(Duration::from_millis(100));
+    suite.cluster.must_split(&region, b"x");
+    let event = receive_event(false);
+    assert!(matches!(
+        event.get_events()[0].event,
+        Some(Event_oneof_event::Error(ref e)) if e.has_region_not_found(),
+    ));
+    fail::remove("cdc_before_initialize");
+}
+
+// This case checks whether https://github.com/tikv/tikv/issues/17233 is fixed.
+#[test]
+fn test_delegate_fail_during_incremental_scan() {
+    let mut cluster = new_server_cluster(0, 1);
+    configure_for_lease_read(&mut cluster.cfg, Some(100), Some(10));
+    cluster.pd_client.disable_default_operator();
+    let mut suite = TestSuiteBuilder::new().cluster(cluster).build();
+    let region = suite.cluster.get_region(&[]);
+    let rid = region.id;
+    let cf_tso = block_on(suite.cluster.pd_client.get_tso()).unwrap();
+
+    let start_tso = cf_tso.next();
+    let pk = format!("key_{:03}", 0).into_bytes();
+    let mut mutations = Vec::with_capacity(10);
+    for i in 0..10 {
+        let mut mutation = Mutation::default();
+        mutation.set_op(Op::Put);
+        mutation.key = format!("key_{:03}", i).into_bytes();
+        mutation.value = vec![b'x'; 16];
+        mutations.push(mutation);
+    }
+    suite.must_kv_prewrite(rid, mutations, pk, start_tso);
+
+    fail::cfg("before_schedule_incremental_scan", "1*pause").unwrap();
+
+    let (mut req_tx, recv, receive_event) = new_event_feed_v2(suite.get_region_cdc_client(rid));
+    let mut req = suite.new_changedata_request(rid);
+    req.request_id = 100;
+    req.checkpoint_ts = cf_tso.into_inner();
+    req.set_start_key(Key::from_raw(b"a").into_encoded());
+    req.set_end_key(Key::from_raw(b"z").into_encoded());
+    block_on(req_tx.send((req.clone(), WriteFlags::default()))).unwrap();
+    std::thread::sleep(Duration::from_millis(500));
+
+    suite.cluster.must_split(&region, b"f");
+
+    // After the incremental scan is canceled, we can get the epoch_not_match error.
+    // And after the error is retrieved, no more entries can be received.
+ let mut get_epoch_not_match = false; + while !get_epoch_not_match { + for event in receive_event(false).events.to_vec() { + match event.event { + Some(Event_oneof_event::Error(err)) => { + assert!(err.has_epoch_not_match(), "{:?}", err); + get_epoch_not_match = true; + } + Some(Event_oneof_event::Entries(..)) => { + assert!(!get_epoch_not_match); + } + _ => unreachable!(), + } + } + } + + fail::remove("before_schedule_incremental_scan"); + + let mut recver = recv.replace(None).unwrap(); + recv_timeout(&mut recver, Duration::from_secs(1)).unwrap_err(); + recv.replace(Some(recver)); +} + +#[test] +fn test_cdc_load_unnecessary_old_value() { + let mut suite = TestSuite::new(1, ApiVersion::V1); + let region = suite.cluster.get_region(&[]); + let rid = region.id; + let engine = suite.cluster.get_engine(1); + + let start_tso = block_on(suite.cluster.pd_client.get_tso()).unwrap(); + let pk = format!("key_{:05}", 0).into_bytes(); + let mut mutations = Vec::with_capacity(1000); + let mut keys = Vec::with_capacity(1000); + for i in 0..1000 { + let key = format!("key_{:05}", i).into_bytes(); + keys.push(key.clone()); + + let mut mutation = Mutation::default(); + mutation.set_op(Op::Put); + mutation.key = key; + mutation.value = vec![b'x'; 16]; + mutations.push(mutation); + } + suite.must_kv_prewrite(rid, mutations, pk, start_tso); + + let commit_tso = block_on(suite.cluster.pd_client.get_tso()).unwrap(); + suite.must_kv_commit(rid, keys, start_tso, commit_tso); + engine.flush_cf(CF_WRITE, true).unwrap(); + + for cf in &[CF_WRITE, CF_DEFAULT] { + let mut wb = suite.cluster.get_engine(1).write_batch(); + let mut count = 0; + + let start = KeyBuilder::from_vec(vec![b'z'], 0, 0); + let end = KeyBuilder::from_vec(vec![b'z' + 1], 0, 0); + let iter_opts = IterOptions::new(Some(start), Some(end), false); + let mut iter = engine.iterator_opt(cf, iter_opts).unwrap(); + let mut valid = iter.seek_to_first().unwrap(); + + // skip some keys. 
+        while valid && count < 2 {
+            count += 1;
+            valid = iter.next().unwrap();
+        }
+        while valid {
+            count += 1;
+            let key = iter.key();
+            wb.delete_cf(cf, key).unwrap();
+            valid = iter.next().unwrap();
+        }
+        assert!(count == 0 || count == 1000);
+        wb.write_opt(&WriteOptions::default()).unwrap();
+        engine.flush_cf(cf, true).unwrap();
+    }
+
+    let scheduler = suite.endpoints.values().next().unwrap().scheduler();
+    let (tx, rx) = std::sync::mpsc::sync_channel(1);
+    scheduler
+        .schedule(Task::Validate(Validate::InitializeStats(Box::new(
+            move |stats| tx.send(stats).unwrap(),
+        ))))
+        .unwrap();
+
+    fail::cfg("ts_filter_is_helpful_always_true", "return(0)").unwrap();
+    let (mut req_tx, _, receive_event) = new_event_feed_v2(suite.get_region_cdc_client(rid));
+    let mut req = suite.new_changedata_request(rid);
+    req.request_id = 100;
+    req.checkpoint_ts = commit_tso.into_inner() - 1;
+    req.set_start_key(Key::from_raw(b"aa").into_encoded());
+    req.set_end_key(Key::from_raw(b"ab").into_encoded());
+    block_on(req_tx.send((req.clone(), WriteFlags::default()))).unwrap();
+
+    let events = receive_event(false).events.to_vec();
+    assert_eq!(events.len(), 1, "{:?}", events);
+    match events[0].event.as_ref().unwrap() {
+        Event_oneof_event::Entries(es) => {
+            assert!(es.entries.len() == 1);
+            assert_eq!(es.entries[0].get_type(), EventLogType::Initialized);
+        }
+        _ => unreachable!(),
+    }
+
+    let stats = rx.recv().unwrap().old_value.write;
+    assert_eq!(stats.seek_tombstone, 0);
+    assert_eq!(stats.next_tombstone, 0);
+    assert_eq!(stats.prev_tombstone, 0);
+
+    fail::remove("ts_filter_is_helpful_always_true");
+    suite.stop();
+}
diff --git a/components/cdc/tests/failpoints/test_memory_quota.rs b/components/cdc/tests/failpoints/test_memory_quota.rs
new file mode 100644
index 00000000000..5b564ba61ec
--- /dev/null
+++ b/components/cdc/tests/failpoints/test_memory_quota.rs
@@ -0,0 +1,289 @@
+// Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0.
+
+use std::{sync::*, time::Duration};
+
+use cdc::{Task, Validate};
+use futures::{executor::block_on, SinkExt};
+use grpcio::WriteFlags;
+use kvproto::{cdcpb::*, kvrpcpb::*};
+use pd_client::PdClient;
+use test_raftstore::*;
+
+use crate::{new_event_feed, TestSuiteBuilder};
+
+#[test]
+fn test_resolver_track_lock_memory_quota_exceeded() {
+    let mut cluster = new_server_cluster(1, 1);
+    // Increase the Raft tick interval to make this test case run reliably.
+    configure_for_lease_read(&mut cluster.cfg, Some(100), None);
+    let memory_quota = 1024; // 1KB
+    let mut suite = TestSuiteBuilder::new()
+        .cluster(cluster)
+        .memory_quota(memory_quota)
+        .build();
+
+    // Let CdcEvent size be 0 to effectively disable memory quota for CdcEvent.
+    fail::cfg("cdc_event_size", "return(0)").unwrap();
+
+    let req = suite.new_changedata_request(1);
+    let (mut req_tx, _event_feed_wrap, receive_event) =
+        new_event_feed(suite.get_region_cdc_client(1));
+    block_on(req_tx.send((req, WriteFlags::default()))).unwrap();
+    let event = receive_event(false);
+    event.events.into_iter().for_each(|e| {
+        match e.event.unwrap() {
+            // Even if there is no write,
+            // it should always output an Initialized event.
+            Event_oneof_event::Entries(es) => {
+                assert!(es.entries.len() == 1, "{:?}", es);
+                let e = &es.entries[0];
+                assert_eq!(e.get_type(), EventLogType::Initialized, "{:?}", es);
+            }
+            other => panic!("unknown event {:?}", other),
+        }
+    });
+
+    // Client must receive messages when there is no congest error.
+    let key_size = memory_quota / 2;
+    let (k, v) = (vec![1; key_size], vec![5]);
+    // Prewrite
+    let start_ts = block_on(suite.cluster.pd_client.get_tso()).unwrap();
+    let mut mutation = Mutation::default();
+    mutation.set_op(Op::Put);
+    mutation.key = k.clone();
+    mutation.value = v;
+    suite.must_kv_prewrite(1, vec![mutation], k, start_ts);
+    let mut events = receive_event(false).events.to_vec();
+    assert_eq!(events.len(), 1, "{:?}", events);
+    match events.pop().unwrap().event.unwrap() {
+        Event_oneof_event::Entries(entries) => {
+            assert_eq!(entries.entries.len(), 1);
+            assert_eq!(entries.entries[0].get_type(), EventLogType::Prewrite);
+        }
+        other => panic!("unknown event {:?}", other),
+    }
+
+    // Trigger congest error.
+    let key_size = memory_quota * 2;
+    let (k, v) = (vec![2; key_size], vec![5]);
+    // Prewrite
+    let start_ts = block_on(suite.cluster.pd_client.get_tso()).unwrap();
+    let mut mutation = Mutation::default();
+    mutation.set_op(Op::Put);
+    mutation.key = k.clone();
+    mutation.value = v;
+    suite.must_kv_prewrite(1, vec![mutation], k, start_ts);
+    let mut events = receive_event(false).events.to_vec();
+    assert_eq!(events.len(), 1, "{:?}", events);
+    match events.pop().unwrap().event.unwrap() {
+        Event_oneof_event::Error(e) => {
+            // Unknown errors are translated into region_not_found.
+            assert!(e.has_region_not_found(), "{:?}", e);
+        }
+        other => panic!("unknown event {:?}", other),
+    }
+
+    // The delegate must be removed.
+    let scheduler = suite.endpoints.values().next().unwrap().scheduler();
+    let (tx, rx) = mpsc::channel();
+    scheduler
+        .schedule(Task::Validate(Validate::Region(
+            1,
+            Box::new(move |delegate| {
+                tx.send(delegate.is_none()).unwrap();
+            }),
+        )))
+        .unwrap();
+
+    assert!(
+        rx.recv_timeout(Duration::from_millis(1000)).unwrap(),
+        "find unexpected delegate"
+    );
+
+    suite.stop();
+}
+
+#[test]
+fn test_pending_on_region_ready_memory_quota_exceeded() {
+    let mut cluster = new_server_cluster(1, 1);
+    // Increase the Raft tick interval to make this test case run reliably.
+    configure_for_lease_read(&mut cluster.cfg, Some(100), None);
+    let memory_quota = 1024; // 1KB
+    let mut suite = TestSuiteBuilder::new()
+        .cluster(cluster)
+        .memory_quota(memory_quota)
+        .build();
+
+    // Let CdcEvent size be 0 to effectively disable memory quota for CdcEvent.
+    fail::cfg("cdc_event_size", "return(0)").unwrap();
+
+    // Trigger memory quota exceeded error.
+    fail::cfg("cdc_pending_on_region_ready", "return").unwrap();
+    let req = suite.new_changedata_request(1);
+    let (mut req_tx, _event_feed_wrap, receive_event) =
+        new_event_feed(suite.get_region_cdc_client(1));
+    block_on(req_tx.send((req, WriteFlags::default()))).unwrap();
+    let event = receive_event(false);
+    event.events.into_iter().for_each(|e| {
+        match e.event.unwrap() {
+            // Even if there is no write,
+            // it should always output an Initialized event.
+            Event_oneof_event::Entries(es) => {
+                assert!(es.entries.len() == 1, "{:?}", es);
+                let e = &es.entries[0];
+                assert_eq!(e.get_type(), EventLogType::Initialized, "{:?}", es);
+            }
+            other => panic!("unknown event {:?}", other),
+        }
+    });
+    // A MemoryQuotaExceeded error is triggered in on_region_ready.
+    let mut events = receive_event(false).events.to_vec();
+    assert_eq!(events.len(), 1, "{:?}", events);
+    match events.pop().unwrap().event.unwrap() {
+        Event_oneof_event::Error(e) => {
+            // Unknown errors are translated into region_not_found.
+ assert!(e.has_region_not_found(), "{:?}", e); + } + other => panic!("unknown event {:?}", other), + } + + // The delegate must be removed. + let scheduler = suite.endpoints.values().next().unwrap().scheduler(); + let (tx, rx) = mpsc::channel(); + scheduler + .schedule(Task::Validate(Validate::Region( + 1, + Box::new(move |delegate| { + tx.send(delegate.is_none()).unwrap(); + }), + ))) + .unwrap(); + + assert!( + rx.recv_timeout(Duration::from_millis(1000)).unwrap(), + "find unexpected delegate" + ); + + fail::remove("cdc_pending_on_region_ready"); + suite.stop(); +} + +#[test] +fn test_pending_push_lock_memory_quota_exceeded() { + let mut cluster = new_server_cluster(1, 1); + // Increase the Raft tick interval to make this test case run reliably. + configure_for_lease_read(&mut cluster.cfg, Some(100), None); + let memory_quota = 1024; // 1KB + let mut suite = TestSuiteBuilder::new() + .cluster(cluster) + .memory_quota(memory_quota) + .build(); + + // Let CdcEvent size be 0 to effectively disable memory quota for CdcEvent. + fail::cfg("cdc_event_size", "return(0)").unwrap(); + + // Pause scan so that no region can be initialized, and all locks will be + // put in pending locks. + fail::cfg("cdc_incremental_scan_start", "pause").unwrap(); + + let req = suite.new_changedata_request(1); + let (mut req_tx, _event_feed_wrap, receive_event) = + new_event_feed(suite.get_region_cdc_client(1)); + block_on(req_tx.send((req, WriteFlags::default()))).unwrap(); + + // Trigger congest error. + let key_size = memory_quota * 2; + let (k, v) = (vec![1; key_size], vec![5]); + let start_ts = block_on(suite.cluster.pd_client.get_tso()).unwrap(); + let mut mutation = Mutation::default(); + mutation.set_op(Op::Put); + mutation.key = k.clone(); + mutation.value = v; + suite.must_kv_prewrite(1, vec![mutation], k, start_ts); + let mut events = receive_event(false).events.to_vec(); + assert_eq!(events.len(), 1, "{:?}", events); + match events.pop().unwrap().event.unwrap() { + Event_oneof_event::Error(e) => { + // Unknown errors are translated into region_not_found. + assert!(e.has_region_not_found(), "{:?}", e); + } + other => panic!("unknown event {:?}", other), + } + + // The delegate must be removed. + let scheduler = suite.endpoints.values().next().unwrap().scheduler(); + let (tx, rx) = mpsc::channel(); + scheduler + .schedule(Task::Validate(Validate::Region( + 1, + Box::new(move |delegate| { + tx.send(delegate.is_none()).unwrap(); + }), + ))) + .unwrap(); + + assert!( + rx.recv_timeout(Duration::from_millis(1000)).unwrap(), + "find unexpected delegate" + ); + + fail::remove("cdc_incremental_scan_start"); + suite.stop(); +} + +#[test] +fn test_scan_lock_memory_quota_exceeded() { + let mut cluster = new_server_cluster(1, 1); + // Increase the Raft tick interval to make this test case run reliably. + configure_for_lease_read(&mut cluster.cfg, Some(100), None); + let memory_quota = 1024; // 1KB + let mut suite = TestSuiteBuilder::new() + .cluster(cluster) + .memory_quota(memory_quota) + .build(); + + // Let CdcEvent size be 0 to effectively disable memory quota for CdcEvent. + fail::cfg("cdc_event_size", "return(0)").unwrap(); + + // Put a lock that exceeds memory quota.
+ let key_size = memory_quota * 2; + let (k, v) = (vec![1; key_size], vec![5]); + let start_ts = block_on(suite.cluster.pd_client.get_tso()).unwrap(); + let mut mutation = Mutation::default(); + mutation.set_op(Op::Put); + mutation.key = k.clone(); + mutation.value = v; + suite.must_kv_prewrite(1, vec![mutation], k, start_ts); + + // No region can be initialized. + let req = suite.new_changedata_request(1); + let (mut req_tx, _event_feed_wrap, receive_event) = + new_event_feed(suite.get_region_cdc_client(1)); + block_on(req_tx.send((req, WriteFlags::default()))).unwrap(); + let mut events = receive_event(false).events.to_vec(); + assert_eq!(events.len(), 1, "{:?}", events); + match events.pop().unwrap().event.unwrap() { + Event_oneof_event::Error(e) => { + // Unknown errors are translated into region_not_found. + assert!(e.has_region_not_found(), "{:?}", e); + } + other => panic!("unknown event {:?}", other), + } + let scheduler = suite.endpoints.values().next().unwrap().scheduler(); + let (tx, rx) = mpsc::channel(); + scheduler + .schedule(Task::Validate(Validate::Region( + 1, + Box::new(move |delegate| { + tx.send(delegate.is_none()).unwrap(); + }), + ))) + .unwrap(); + + assert!( + rx.recv_timeout(Duration::from_millis(1000)).unwrap(), + "find unexpected delegate" + ); + + suite.stop(); +} diff --git a/components/cdc/tests/failpoints/test_register.rs b/components/cdc/tests/failpoints/test_register.rs index 4558397f8a9..2b6be3744af 100644 --- a/components/cdc/tests/failpoints/test_register.rs +++ b/components/cdc/tests/failpoints/test_register.rs @@ -165,7 +165,11 @@ fn test_connections_register_impl() { let mut events = receive_event(false).events.to_vec(); match events.pop().unwrap().event.unwrap() { Event_oneof_event::Error(err) => { - assert!(err.has_epoch_not_match(), "{:?}", err); + assert!( + err.has_epoch_not_match() || err.has_region_not_found(), + "{:?}", + err + ); } other => panic!("unknown event {:?}", other), } diff --git a/components/cdc/tests/mod.rs b/components/cdc/tests/mod.rs index d2c4519d50d..469474a1759 100644 --- a/components/cdc/tests/mod.rs +++ b/components/cdc/tests/mod.rs @@ -6,7 +6,7 @@ use std::{ }; use causal_ts::CausalTsProvider; -use cdc::{recv_timeout, CdcObserver, Delegate, FeatureGate, MemoryQuota, Task, Validate}; +use cdc::{recv_timeout, CdcObserver, Delegate, FeatureGate, Task, Validate}; use collections::HashMap; use concurrency_manager::ConcurrencyManager; use engine_rocks::RocksEngine; @@ -23,9 +23,14 @@ use kvproto::{ use online_config::OnlineConfig; use raftstore::{coprocessor::CoprocessorHost, router::CdcRaftRouter}; use test_raftstore::*; -use tikv::{config::CdcConfig, server::DEFAULT_CLUSTER_ID, storage::kv::LocalTablets}; +use tikv::{ + config::{CdcConfig, ResolvedTsConfig}, + server::DEFAULT_CLUSTER_ID, + storage::kv::LocalTablets, +}; use tikv_util::{ config::ReadableDuration, + memory::MemoryQuota, worker::{LazyWorker, Runnable}, HandyRwLock, }; @@ -167,6 +172,7 @@ impl TestSuiteBuilder { let count = cluster.count; let pd_cli = cluster.pd_client.clone(); let mut endpoints = HashMap::default(); + let mut quotas = HashMap::default(); let mut obs = HashMap::default(); let mut concurrency_managers = HashMap::default(); // Hack! node ids are generated from 1..count+1. @@ -176,15 +182,14 @@ impl TestSuiteBuilder { let mut sim = cluster.sim.wl(); // Register cdc service to gRPC server.
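[Editor's note — not part of the patch. The builder change below threads one shared MemoryQuota per node through both the gRPC cdc::Service and the cdc::Endpoint, recording it in `quotas` so the endpoint constructed later reuses the same instance. Previously the service received its own quota while the endpoint was built with MemoryQuota::new(usize::MAX), so the builder's memory_quota setting was not actually enforced end to end.]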
+ let memory_quota = Arc::new(MemoryQuota::new(memory_quota)); + let memory_quota_ = memory_quota.clone(); let scheduler = worker.scheduler(); sim.pending_services .entry(id) .or_default() .push(Box::new(move || { - create_change_data(cdc::Service::new( - scheduler.clone(), - MemoryQuota::new(memory_quota), - )) + create_change_data(cdc::Service::new(scheduler.clone(), memory_quota_.clone())) })); sim.txn_extra_schedulers.insert( id, @@ -199,6 +204,7 @@ impl TestSuiteBuilder { }, )); endpoints.insert(id, worker); + quotas.insert(id, memory_quota); } runner(&mut cluster); @@ -212,6 +218,7 @@ impl TestSuiteBuilder { let mut cdc_endpoint = cdc::Endpoint::new( DEFAULT_CLUSTER_ID, &cfg, + &ResolvedTsConfig::default(), false, cluster.cfg.storage.api_version(), pd_cli.clone(), @@ -223,7 +230,7 @@ impl TestSuiteBuilder { cm.clone(), env, sim.security_mgr.clone(), - MemoryQuota::new(usize::MAX), + quotas[id].clone(), sim.get_causal_ts_provider(*id), ); let mut updated_cfg = cfg.clone(); diff --git a/components/concurrency_manager/src/lib.rs b/components/concurrency_manager/src/lib.rs index ce77cb87a42..1c6bdb8dbf1 100644 --- a/components/concurrency_manager/src/lib.rs +++ b/components/concurrency_manager/src/lib.rs @@ -124,6 +124,23 @@ impl ConcurrencyManager { }); min_lock_ts } + + pub fn global_min_lock(&self) -> Option<(TimeStamp, Key)> { + let mut min_lock: Option<(TimeStamp, Key)> = None; + // TODO: This iteration is inefficient; it should be optimized. + self.lock_table.for_each_kv(|key, handle| { + if let Some(curr_ts) = handle.with_lock(|lock| lock.as_ref().map(|l| l.ts)) { + if min_lock + .as_ref() + .map(|(ts, _)| ts > &curr_ts) + .unwrap_or(true) + { + min_lock = Some((curr_ts, key.clone())); + } + } + }); + min_lock + } } #[cfg(test)] diff --git a/components/concurrency_manager/src/lock_table.rs b/components/concurrency_manager/src/lock_table.rs index db6995fa1d0..8f4fb8952c3 100644 --- a/components/concurrency_manager/src/lock_table.rs +++ b/components/concurrency_manager/src/lock_table.rs @@ -115,6 +115,14 @@ impl LockTable { } } + pub fn for_each_kv(&self, mut f: impl FnMut(&Key, Arc<KeyHandle>)) { + for entry in self.0.iter() { + if let Some(handle) = entry.value().upgrade() { + f(entry.key(), handle); + } + } + } + /// Removes the key and its key handle from the map.
pub fn remove(&self, key: &Key) { self.0.remove(key); diff --git a/components/encryption/src/OWNERS b/components/encryption/src/OWNERS new file mode 100644 index 00000000000..70df3de925d --- /dev/null +++ b/components/encryption/src/OWNERS @@ -0,0 +1,7 @@ +# See the OWNERS docs at https://go.k8s.io/owners +options: + no_parent_owners: true +filters: + "(OWNERS|config\\.rs)$": + approvers: + - sig-critical-approvers-config-components diff --git a/components/encryption/src/master_key/kms.rs b/components/encryption/src/master_key/kms.rs index 643cb08a0c6..56cedc00243 100644 --- a/components/encryption/src/master_key/kms.rs +++ b/components/encryption/src/master_key/kms.rs @@ -141,7 +141,11 @@ impl KmsBackend { self.kms_provider.decrypt_data_key(&ciphertext_key), ) })) - .map_err(cloud_convert_error("decrypt encrypted key failed".into()))?; + .map_err(|e| { + Error::WrongMasterKey(box_err!(cloud_convert_error( + "decrypt encrypted key failed".into(), + )(e))) + })?; let data_key = DataKeyPair { encrypted: ciphertext_key, plaintext: PlainKey::new(plaintext, CryptographyType::AesGcm256) @@ -154,6 +158,12 @@ impl KmsBackend { } } } + + #[cfg(test)] + fn clear_state(&mut self) { + let mut opt_state = self.state.lock().unwrap(); + *opt_state = None; + } } impl Backend for KmsBackend { @@ -173,7 +183,10 @@ impl Backend for KmsBackend { #[cfg(test)] mod fake { use async_trait::async_trait; - use cloud::{error::Result, kms::KmsProvider}; + use cloud::{ + error::{Error as CloudError, KmsError, Result}, + kms::KmsProvider, + }; use super::*; @@ -183,12 +196,14 @@ mod fake { #[derive(Debug)] pub struct FakeKms { plaintext_key: PlainKey, + should_decrypt_data_key_fail: bool, } impl FakeKms { - pub fn new(plaintext_key: Vec<u8>) -> Self { + pub fn new(plaintext_key: Vec<u8>, should_decrypt_data_key_fail: bool) -> Self { Self { plaintext_key: PlainKey::new(plaintext_key, CryptographyType::AesGcm256).unwrap(), + should_decrypt_data_key_fail, } } } @@ -204,7 +219,13 @@ mod fake { } async fn decrypt_data_key(&self, _ciphertext: &EncryptedKey) -> Result<Vec<u8>> { - Ok(vec![1u8, 32]) + if self.should_decrypt_data_key_fail { + Err(CloudError::KmsError(KmsError::WrongMasterKey(box_err!( + "wrong master key" + )))) + } else { + Ok(vec![1u8, 32]) + } } fn name(&self) -> &str { @@ -241,21 +262,36 @@ mod tests { assert_eq!(state2.cached(&encrypted2), true); } + const PLAIN_TEXT_HEX: &str = "25431587e9ecffc7c37f8d6d52a9bc3310651d46fb0e3bad2726c8f2db653749"; + const CIPHER_TEXT_HEX: &str = + "84e5f23f95648fa247cb28eef53abec947dbf05ac953734618111583840bd980"; + const PLAINKEY_HEX: &str = "c3d99825f2181f4808acd2068eac7441a65bd428f14d2aab43fefc0129091139"; + const IV_HEX: &str = "cafabd9672ca6c79a2fbdc22"; + + #[cfg(test)] + fn prepare_data_for_encrypt() -> (Iv, Vec<u8>, Vec<u8>, Vec<u8>) { + let iv = Vec::from_hex(IV_HEX).unwrap(); + let iv = Iv::from_slice(iv.as_slice()).unwrap(); + let pt = Vec::from_hex(PLAIN_TEXT_HEX).unwrap(); + let plainkey = Vec::from_hex(PLAINKEY_HEX).unwrap(); + let ct = Vec::from_hex(CIPHER_TEXT_HEX).unwrap(); + (iv, pt, plainkey, ct) + } + + #[cfg(test)] + fn prepare_kms_backend(plainkey: Vec<u8>, should_decrypt_data_key_fail: bool) -> KmsBackend { + KmsBackend::new(Box::new(FakeKms::new( + plainkey, + should_decrypt_data_key_fail, + ))) + .unwrap() + } + #[test] fn test_kms_backend() { - // See more http://csrc.nist.gov/groups/STM/cavp/documents/mac/gcmtestvectors.zip - let pt = Vec::from_hex("25431587e9ecffc7c37f8d6d52a9bc3310651d46fb0e3bad2726c8f2db653749") - .unwrap(); - let ct =
Vec::from_hex("84e5f23f95648fa247cb28eef53abec947dbf05ac953734618111583840bd980") - .unwrap(); - let plainkey = - Vec::from_hex("c3d99825f2181f4808acd2068eac7441a65bd428f14d2aab43fefc0129091139") - .unwrap(); - - let iv = Vec::from_hex("cafabd9672ca6c79a2fbdc22").unwrap(); - - let backend = KmsBackend::new(Box::new(FakeKms::new(plainkey))).unwrap(); - let iv = Iv::from_slice(iv.as_slice()).unwrap(); + let (iv, pt, plainkey, ct) = prepare_data_for_encrypt(); + let backend = prepare_kms_backend(plainkey, false); + let encrypted_content = backend.encrypt_content(&pt, iv).unwrap(); assert_eq!(encrypted_content.get_content(), ct.as_slice()); let plaintext = backend.decrypt_content(&encrypted_content).unwrap(); @@ -293,4 +329,19 @@ mod tests { Error::Other(_) ); } + + #[test] + fn test_kms_backend_wrong_key() { + let (iv, pt, plainkey, ..) = prepare_data_for_encrypt(); + let mut backend = prepare_kms_backend(plainkey, true); + + let encrypted_content = backend.encrypt_content(&pt, iv).unwrap(); + // Clear the cached state to ensure that the subsequent + // backend.decrypt_content() invocation bypasses the cache and triggers the + // mocked FakeKMS::decrypt_data_key() function. + backend.clear_state(); + + let err = backend.decrypt_content(&encrypted_content).unwrap_err(); + assert_matches!(err, Error::WrongMasterKey(_)); + } } diff --git a/components/engine_panic/src/compact.rs b/components/engine_panic/src/compact.rs index 988bec790de..f64c97ff5b0 100644 --- a/components/engine_panic/src/compact.rs +++ b/components/engine_panic/src/compact.rs @@ -44,6 +44,10 @@ impl CompactExt for PanicEngine { ) -> Result<()> { panic!() } + + fn check_in_range(&self, start: Option<&[u8]>, end: Option<&[u8]>) -> Result<()> { + panic!() + } } pub struct PanicCompactedEvent; diff --git a/components/engine_panic/src/db_options.rs b/components/engine_panic/src/db_options.rs index c081a5c1d12..0753bb7e0fc 100644 --- a/components/engine_panic/src/db_options.rs +++ b/components/engine_panic/src/db_options.rs @@ -40,6 +40,10 @@ impl DbOptions for PanicDbOptions { panic!() } + fn get_flush_size(&self) -> Result { + panic!() + } + fn set_rate_limiter_auto_tuned(&mut self, rate_limiter_auto_tuned: bool) -> Result<()> { panic!() } @@ -55,6 +59,10 @@ impl DbOptions for PanicDbOptions { fn set_titandb_options(&mut self, opts: &Self::TitanDbOptions) { panic!() } + + fn set_track_and_verify_wals_in_manifest(&mut self, v: bool) { + panic!() + } } pub struct PanicTitanDbOptions; diff --git a/components/engine_panic/src/misc.rs b/components/engine_panic/src/misc.rs index 8da5c48d3e6..ab7201275df 100644 --- a/components/engine_panic/src/misc.rs +++ b/components/engine_panic/src/misc.rs @@ -75,6 +75,14 @@ impl MiscExt for PanicEngine { panic!() } + fn disable_manual_compaction(&self) -> Result<()> { + panic!() + } + + fn enable_manual_compaction(&self) -> Result<()> { + panic!() + } + fn pause_background_work(&self) -> Result<()> { panic!() } diff --git a/components/engine_rocks/Cargo.toml b/components/engine_rocks/Cargo.toml index 1d275b788c2..8fcdf6eb023 100644 --- a/components/engine_rocks/Cargo.toml +++ b/components/engine_rocks/Cargo.toml @@ -60,6 +60,7 @@ txn_types = { workspace = true } git = "https://github.com/tikv/rust-rocksdb.git" package = "rocksdb" features = ["encryption"] +branch = "tikv-7.5" [dev-dependencies] rand = "0.8" diff --git a/components/engine_rocks/src/cf_options.rs b/components/engine_rocks/src/cf_options.rs index 1162c67f210..6a2372fb31f 100644 --- a/components/engine_rocks/src/cf_options.rs +++ 
b/components/engine_rocks/src/cf_options.rs @@ -40,6 +40,23 @@ impl RocksCfOptions { pub fn into_raw(self) -> RawCfOptions { self.0 } + + pub fn set_flush_size(&mut self, f: usize) -> Result<()> { + if let Some(m) = self.0.get_write_buffer_manager() { + m.set_flush_size(f); + } else { + return Err(box_err!("write buffer manager not found")); + } + Ok(()) + } + + pub fn get_flush_size(&self) -> Result<u64> { + if let Some(m) = self.0.get_write_buffer_manager() { + return Ok(m.flush_size() as u64); + } + + Err(box_err!("write buffer manager not found")) + } } impl Deref for RocksCfOptions { diff --git a/components/engine_rocks/src/compact.rs b/components/engine_rocks/src/compact.rs index 199b7d9f3be..f64c9a7d49e 100644 --- a/components/engine_rocks/src/compact.rs +++ b/components/engine_rocks/src/compact.rs @@ -121,6 +121,10 @@ impl CompactExt for RocksEngine { db.compact_files_cf(handle, &opts, &files, output_level) .map_err(r2e) } + + fn check_in_range(&self, start: Option<&[u8]>, end: Option<&[u8]>) -> Result<()> { + self.as_inner().check_in_range(start, end).map_err(r2e) + } } #[cfg(test)] diff --git a/components/engine_rocks/src/config.rs b/components/engine_rocks/src/config.rs index e121a1cea18..d55c5cb3dfc 100644 --- a/components/engine_rocks/src/config.rs +++ b/components/engine_rocks/src/config.rs @@ -340,9 +340,9 @@ pub enum BlobRunMode { impl From for ConfigValue { fn from(mode: BlobRunMode) -> ConfigValue { let str_value = match mode { - BlobRunMode::Normal => "normal", - BlobRunMode::ReadOnly => "read-only", - BlobRunMode::Fallback => "fallback", + BlobRunMode::Normal => "kNormal", + BlobRunMode::ReadOnly => "kReadOnly", + BlobRunMode::Fallback => "kFallback", }; ConfigValue::String(str_value.into()) } @@ -366,8 +366,11 @@ impl FromStr for BlobRunMode { "normal" => Ok(BlobRunMode::Normal), "read-only" => Ok(BlobRunMode::ReadOnly), "fallback" => Ok(BlobRunMode::Fallback), + "kNormal" => Ok(BlobRunMode::Normal), + "kReadOnly" => Ok(BlobRunMode::ReadOnly), + "kFallback" => Ok(BlobRunMode::Fallback), m => Err(format!( - "expect: normal, read-only or fallback, got: {:?}", + "expect: normal, kNormal, read-only, kReadOnly, kFallback or fallback, got: {:?}", m )), } diff --git a/components/engine_rocks/src/db_options.rs b/components/engine_rocks/src/db_options.rs index c9ef2cfda98..c95f81f8297 100644 --- a/components/engine_rocks/src/db_options.rs +++ b/components/engine_rocks/src/db_options.rs @@ -100,6 +100,14 @@ impl DbOptions for RocksDbOptions { Ok(()) } + fn get_flush_size(&self) -> Result<u64> { + if let Some(m) = self.0.get_write_buffer_manager() { + return Ok(m.flush_size() as u64); + } + + Err(box_err!("write buffer manager not found")) + } + fn set_flush_oldest_first(&mut self, f: bool) -> Result<()> { if let Some(m) = self.0.get_write_buffer_manager() { m.set_flush_oldest_first(f); @@ -112,6 +120,10 @@ impl DbOptions for RocksDbOptions { fn set_titandb_options(&mut self, opts: &Self::TitanDbOptions) { self.0.set_titandb_options(opts.as_raw()) } + + fn set_track_and_verify_wals_in_manifest(&mut self, v: bool) { + self.0.set_track_and_verify_wals_in_manifest(v) + } } pub struct RocksTitanDbOptions(RawTitanDBOptions); diff --git a/components/engine_rocks/src/event_listener.rs b/components/engine_rocks/src/event_listener.rs index 9628c61c23f..4ba4061a60f 100644 --- a/components/engine_rocks/src/event_listener.rs +++ b/components/engine_rocks/src/event_listener.rs @@ -130,6 +130,7 @@ impl rocksdb::EventListener for RocksEventListener { if let Some(path) =
resolve_sst_filename_from_err(&err) { warn!( "detected rocksdb background error"; + "reason" => r, "sst" => &path, "err" => &err ); @@ -194,8 +195,15 @@ impl rocksdb::EventListener for RocksPersistenceListener { fn on_memtable_sealed(&self, info: &MemTableInfo) { // Note: first_seqno is effectively the smallest seqno of memtable. // earliest_seqno has ambiguous semantics. - self.0 - .on_memtable_sealed(info.cf_name().to_string(), info.first_seqno()); + self.0.on_memtable_sealed( + info.cf_name().to_string(), + info.first_seqno(), + info.largest_seqno(), + ); + } + + fn on_flush_begin(&self, _: &FlushJobInfo) { + fail::fail_point!("on_flush_begin"); + } fn on_flush_completed(&self, job: &FlushJobInfo) { diff --git a/components/engine_rocks/src/import.rs b/components/engine_rocks/src/import.rs index 1aa65ec07fa..e0f5461d2a3 100644 --- a/components/engine_rocks/src/import.rs +++ b/components/engine_rocks/src/import.rs @@ -1,11 +1,7 @@ // Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. -use std::fs::File; - use engine_traits::{ImportExt, IngestExternalFileOptions, Result}; -use rocksdb::{ - set_external_sst_file_global_seq_no, IngestExternalFileOptions as RawIngestExternalFileOptions, -}; +use rocksdb::IngestExternalFileOptions as RawIngestExternalFileOptions; use crate::{engine::RocksEngine, r2e, util}; @@ -17,17 +13,7 @@ impl ImportExt for RocksEngine { let mut opts = RocksIngestExternalFileOptions::new(); opts.move_files(true); opts.set_write_global_seqno(false); - files.iter().try_for_each(|file| -> Result<()> { - let f = File::open(file)?; - // Prior to v5.2.0, TiKV use `write_global_seqno=true` for ingestion. For - // backward compatibility, in case TiKV is retrying an ingestion job - // generated by older version, it needs to reset the global seqno to - // 0. - set_external_sst_file_global_seq_no(self.as_inner(), cf, file, 0).map_err(r2e)?; - f.sync_all() - .map_err(|e| format!("sync {}: {:?}", file, e)) - .map_err(r2e) - })?; + // Note: no need to reset the global seqno to 0 for compatibility; see #16992 // This is calling a specially optimized version of // ingest_external_file_cf. In cases where the memtable needs to be // flushed it avoids blocking writers while doing the flush. The unused diff --git a/components/engine_rocks/src/misc.rs b/components/engine_rocks/src/misc.rs index b1406cacdb8..c043e9b2050 100644 --- a/components/engine_rocks/src/misc.rs +++ b/components/engine_rocks/src/misc.rs @@ -334,16 +334,26 @@ impl MiscExt for RocksEngine { self.as_inner().sync_wal().map_err(r2e) } + fn disable_manual_compaction(&self) -> Result<()> { + self.as_inner().disable_manual_compaction(); + Ok(()) + } + + fn enable_manual_compaction(&self) -> Result<()> { + self.as_inner().enable_manual_compaction(); + Ok(()) + } + fn pause_background_work(&self) -> Result<()> { // This will make manual compaction return error instead of waiting. In practice // we might want to identify this case by parsing error message.
- self.as_inner().disable_manual_compaction(); + self.disable_manual_compaction()?; self.as_inner().pause_bg_work(); Ok(()) } fn continue_background_work(&self) -> Result<()> { - self.as_inner().enable_manual_compaction(); + self.enable_manual_compaction()?; self.as_inner().continue_bg_work(); Ok(()) } diff --git a/components/engine_rocks/src/properties.rs b/components/engine_rocks/src/properties.rs index 87ccab9e5ab..1739fb1036e 100644 --- a/components/engine_rocks/src/properties.rs +++ b/components/engine_rocks/src/properties.rs @@ -414,7 +414,10 @@ impl TablePropertiesCollector for MvccPropertiesCollector { // TsFilter filters sst based on max_ts and min_ts during iterating. // To prevent seeing outdated (GC) records, we should consider // RocksDB delete entry type. - if entry_type != DBEntryType::Put && entry_type != DBEntryType::Delete { + if entry_type != DBEntryType::Put + && entry_type != DBEntryType::Delete + && entry_type != DBEntryType::BlobIndex + { return; } @@ -452,37 +455,43 @@ impl TablePropertiesCollector for MvccPropertiesCollector { self.props.max_row_versions = self.row_versions; } - if self.key_mode == KeyMode::Raw { - let decode_raw_value = ApiV2::decode_raw_value(value); - match decode_raw_value { - Ok(raw_value) => { - if raw_value.is_valid(self.current_ts) { - self.props.num_puts += 1; - } else { - self.props.num_deletes += 1; + if entry_type != DBEntryType::BlobIndex { + if self.key_mode == KeyMode::Raw { + let decode_raw_value = ApiV2::decode_raw_value(value); + match decode_raw_value { + Ok(raw_value) => { + if raw_value.is_valid(self.current_ts) { + self.props.num_puts += 1; + } else { + self.props.num_deletes += 1; + } + if let Some(expire_ts) = raw_value.expire_ts { + self.props.ttl.add(expire_ts); + } } - if let Some(expire_ts) = raw_value.expire_ts { - self.props.ttl.add(expire_ts); + Err(_) => { + self.num_errors += 1; } } - Err(_) => { - self.num_errors += 1; + } else { + let write_type = match Write::parse_type(value) { + Ok(v) => v, + Err(_) => { + self.num_errors += 1; + return; + } + }; + + match write_type { + WriteType::Put => self.props.num_puts += 1, + WriteType::Delete => self.props.num_deletes += 1, + _ => {} } } } else { - let write_type = match Write::parse_type(value) { - Ok(v) => v, - Err(_) => { - self.num_errors += 1; - return; - } - }; - - match write_type { - WriteType::Put => self.props.num_puts += 1, - WriteType::Delete => self.props.num_deletes += 1, - _ => {} - } + // NOTE: if titan is enabled, the entry will always be treated as PUT. + // Be careful if you try to enable Titan on CF_WRITE. + self.props.num_puts += 1; } // Add new row. 
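[Editor's note — illustration, not part of the patch. The num_puts/num_deletes counters maintained above feed the RangeStats::redundant_keys() heuristic added later in this diff (components/engine_traits/src/misc.rs). A self-contained sketch of that arithmetic, reusing the field names from the patch in a standalone demo struct:]

struct RangeStatsDemo {
    num_entries: u64,
    num_rows: u64,
    num_deletes: u64,
}

impl RangeStatsDemo {
    /// Stale keys eligible for compaction: entries that are not live rows,
    /// plus MVCC delete records, computed with saturating arithmetic.
    fn redundant_keys(&self) -> u64 {
        self.num_entries
            .saturating_sub(self.num_rows)
            .saturating_add(self.num_deletes)
    }
}

fn main() {
    // e.g. 1000 write-cf entries covering 800 live rows, with 50 MVCC deletes:
    let s = RangeStatsDemo { num_entries: 1000, num_rows: 800, num_deletes: 50 };
    assert_eq!(s.redundant_keys(), 250); // 1000 - 800 + 50
}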
@@ -564,6 +573,7 @@ pub fn get_range_stats( num_entries, num_versions: props.num_versions, num_rows: props.num_rows, + num_deletes: props.num_deletes, }) } diff --git a/components/engine_rocks/src/rocks_metrics.rs b/components/engine_rocks/src/rocks_metrics.rs index 522696cb150..2b32af111ec 100644 --- a/components/engine_rocks/src/rocks_metrics.rs +++ b/components/engine_rocks/src/rocks_metrics.rs @@ -920,6 +920,7 @@ struct CfStats { blob_cache_size: Option<u64>, readers_mem: Option<u64>, mem_tables: Option<u64>, + mem_tables_all: Option<u64>, num_keys: Option<u64>, pending_compaction_bytes: Option<u64>, num_immutable_mem_table: Option<u64>, @@ -978,6 +979,9 @@ impl StatisticsReporter for RocksStatisticsReporter { if let Some(v) = db.get_property_int_cf(handle, ROCKSDB_CUR_SIZE_ALL_MEM_TABLES) { *cf_stats.mem_tables.get_or_insert_default() += v; } + if let Some(v) = db.get_property_int_cf(handle, ROCKSDB_SIZE_ALL_MEM_TABLES) { + *cf_stats.mem_tables_all.get_or_insert_default() += v; + } // TODO: add cache usage and pinned usage. if let Some(v) = db.get_property_int_cf(handle, ROCKSDB_ESTIMATE_NUM_KEYS) { *cf_stats.num_keys.get_or_insert_default() += v; @@ -1119,6 +1123,11 @@ impl StatisticsReporter for RocksStatisticsReporter { .with_label_values(&[&self.name, cf, "mem-tables"]) .set(v as i64); } + if let Some(v) = cf_stats.mem_tables_all { + STORE_ENGINE_MEMORY_GAUGE_VEC + .with_label_values(&[&self.name, cf, "mem-tables-all"]) + .set(v as i64); + } if let Some(v) = cf_stats.num_keys { STORE_ENGINE_ESTIMATE_NUM_KEYS_VEC .with_label_values(&[&self.name, cf]) @@ -1538,9 +1547,9 @@ lazy_static! { "Number of times titan blob file sync is done", &["db"] ).unwrap(); - pub static ref STORE_ENGINE_BLOB_FILE_SYNCED: SimpleEngineTickerMetrics = - auto_flush_from!(STORE_ENGINE_BLOB_FILE_SYNCED_VEC, SimpleEngineTickerMetrics); - + pub static ref STORE_ENGINE_BLOB_FILE_SYNCED: SimpleEngineTickerMetrics = + auto_flush_from!(STORE_ENGINE_BLOB_FILE_SYNCED_VEC, SimpleEngineTickerMetrics); + pub static ref STORE_ENGINE_BLOB_CACHE_EFFICIENCY_VEC: IntCounterVec = register_int_counter_vec!( "tikv_engine_blob_cache_efficiency", "Efficiency of titan's blob cache", diff --git a/components/engine_rocks/src/rocks_metrics_defs.rs b/components/engine_rocks/src/rocks_metrics_defs.rs index 042949f1c09..5bbc6245c72 100644 --- a/components/engine_rocks/src/rocks_metrics_defs.rs +++ b/components/engine_rocks/src/rocks_metrics_defs.rs @@ -5,6 +5,7 @@ use rocksdb::{DBStatisticsHistogramType as HistType, DBStatisticsTickerType as T pub const ROCKSDB_TOTAL_SST_FILES_SIZE: &str = "rocksdb.total-sst-files-size"; pub const ROCKSDB_TABLE_READERS_MEM: &str = "rocksdb.estimate-table-readers-mem"; pub const ROCKSDB_CUR_SIZE_ALL_MEM_TABLES: &str = "rocksdb.cur-size-all-mem-tables"; +pub const ROCKSDB_SIZE_ALL_MEM_TABLES: &str = "rocksdb.size-all-mem-tables"; pub const ROCKSDB_ESTIMATE_NUM_KEYS: &str = "rocksdb.estimate-num-keys"; pub const ROCKSDB_PENDING_COMPACTION_BYTES: &str = "rocksdb.\ estimate-pending-compaction-bytes"; diff --git a/components/engine_rocks/src/sst_partitioner.rs b/components/engine_rocks/src/sst_partitioner.rs index fc1dcd40270..f642a94f28f 100644 --- a/components/engine_rocks/src/sst_partitioner.rs +++ b/components/engine_rocks/src/sst_partitioner.rs @@ -23,6 +23,8 @@ impl rocksdb::SstPartitionerFactory output_level: context.output_level, smallest_key: context.smallest_key, largest_key: context.largest_key, + next_level_boundaries: context.next_level_boundaries.clone(), + next_level_sizes: context.next_level_sizes.clone(), };
self.0.create_partitioner(&ctx).map(RocksSstPartitioner) } diff --git a/components/engine_rocks/src/ttl_properties.rs b/components/engine_rocks/src/ttl_properties.rs index 8e6021939bc..62731ac1aa4 100644 --- a/components/engine_rocks/src/ttl_properties.rs +++ b/components/engine_rocks/src/ttl_properties.rs @@ -74,6 +74,7 @@ pub struct TtlPropertiesCollector { impl TablePropertiesCollector for TtlPropertiesCollector { fn add(&mut self, key: &[u8], value: &[u8], entry_type: DBEntryType, _: u64, _: u64) { + // DBEntryType::BlobIndex will be skipped because we can't parse the value. if entry_type != DBEntryType::Put { return; } diff --git a/components/engine_rocks/src/util.rs b/components/engine_rocks/src/util.rs index 225cd1d7f06..e4991419eed 100644 --- a/components/engine_rocks/src/util.rs +++ b/components/engine_rocks/src/util.rs @@ -3,6 +3,7 @@ use std::{ffi::CString, fs, path::Path, str::FromStr, sync::Arc}; use engine_traits::{Engines, Range, Result, CF_DEFAULT}; +use fail::fail_point; use rocksdb::{ load_latest_options, CColumnFamilyDescriptor, CFHandle, ColumnFamilyOptions, CompactionFilter, CompactionFilterContext, CompactionFilterDecision, CompactionFilterFactory, @@ -462,6 +463,13 @@ pub struct RangeCompactionFilterFactory(Arc<OwnedRange>); impl RangeCompactionFilterFactory { pub fn new(start_key: Box<[u8]>, end_key: Box<[u8]>) -> Self { + fail_point!("unlimited_range_compaction_filter", |_| { + let range = OwnedRange { + start_key: keys::data_key(b"").into_boxed_slice(), + end_key: keys::data_end_key(b"").into_boxed_slice(), + }; + Self(Arc::new(range)) + }); let range = OwnedRange { start_key, end_key }; Self(Arc::new(range)) } diff --git a/components/engine_test/src/lib.rs b/components/engine_test/src/lib.rs index dd56d9a5db4..85d9d4c1b78 100644 --- a/components/engine_test/src/lib.rs +++ b/components/engine_test/src/lib.rs @@ -127,7 +127,7 @@ pub mod kv { } fn destroy_tablet(&self, _ctx: TabletContext, path: &Path) -> Result<()> { - encryption::trash_dir_all(path, self.db_opt.key_manager.as_deref())?; + encryption::trash_dir_all(path, self.db_opt.get_key_manager().as_deref())?; Ok(()) } @@ -202,13 +202,17 @@ pub mod ctor { #[derive(Clone, Default)] pub struct DbOptions { - pub(crate) key_manager: Option<Arc<DataKeyManager>>, + key_manager: Option<Arc<DataKeyManager>>, rate_limiter: Option>, state_storage: Option>, enable_multi_batch_write: bool, } impl DbOptions { + pub fn get_key_manager(&self) -> Option<Arc<DataKeyManager>> { + self.key_manager.clone() + } + pub fn set_key_manager(&mut self, key_manager: Option<Arc<DataKeyManager>>) { self.key_manager = key_manager; } diff --git a/components/engine_traits/src/compact.rs b/components/engine_traits/src/compact.rs index 05590a1ff32..2a4341a6788 100644 --- a/components/engine_traits/src/compact.rs +++ b/components/engine_traits/src/compact.rs @@ -71,6 +71,9 @@ pub trait CompactExt: CfNamesExt { max_subcompactions: u32, exclude_l0: bool, ) -> Result<()>; + + // Checks that all data is in the range [start, end).
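[Editor's note — assumption, not stated by the patch: a None bound presumably means "unbounded on that side", and implementations are expected to return an error if any key lies outside [start, end). The RocksEngine implementation earlier in this diff simply delegates to rust-rocksdb's check_in_range.]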
+ fn check_in_range(&self, start: Option<&[u8]>, end: Option<&[u8]>) -> Result<()>; } pub trait CompactedEvent: Send { diff --git a/components/engine_traits/src/db_options.rs b/components/engine_traits/src/db_options.rs index 2c6e9c3d4e8..60cacb1f76f 100644 --- a/components/engine_traits/src/db_options.rs +++ b/components/engine_traits/src/db_options.rs @@ -21,8 +21,10 @@ pub trait DbOptions { fn get_rate_limiter_auto_tuned(&self) -> Option<bool>; fn set_rate_limiter_auto_tuned(&mut self, rate_limiter_auto_tuned: bool) -> Result<()>; fn set_flush_size(&mut self, f: usize) -> Result<()>; + fn get_flush_size(&self) -> Result<u64>; fn set_flush_oldest_first(&mut self, f: bool) -> Result<()>; fn set_titandb_options(&mut self, opts: &Self::TitanDbOptions); + fn set_track_and_verify_wals_in_manifest(&mut self, v: bool); } /// Titan-specific options diff --git a/components/engine_traits/src/flush.rs b/components/engine_traits/src/flush.rs index eebf0e7c32a..8590236e126 100644 --- a/components/engine_traits/src/flush.rs +++ b/components/engine_traits/src/flush.rs @@ -18,14 +18,17 @@ use std::{ atomic::{AtomicU64, Ordering}, Arc, Mutex, RwLock, }, + time::Duration, }; use kvproto::import_sstpb::SstMeta; -use slog_global::info; -use tikv_util::set_panic_mark; +use slog_global::{info, warn}; +use tikv_util::{set_panic_mark, time::Instant}; use crate::{data_cf_offset, RaftEngine, RaftLogBatch, DATA_CFS_LEN}; + +const HEAVY_WORKER_THRESHOLD: Duration = Duration::from_millis(25); + #[derive(Debug)] pub struct ApplyProgress { cf: String, @@ -203,7 +206,16 @@ impl PersistenceListener { /// Called when memtable is frozen. /// /// `smallest_seqno` should be the smallest seqno of the memtable. - pub fn on_memtable_sealed(&self, cf: String, smallest_seqno: u64) { + /// + /// Note: After https://github.com/tikv/rocksdb/pull/347, rocksdb global lock will + /// be held during this method, so we should avoid doing heavy work in it. + pub fn on_memtable_sealed(&self, cf: String, smallest_seqno: u64, largest_seqno: u64) { + let t = Instant::now_coarse(); + (|| { + fail_point!("on_memtable_sealed", |t| { + assert_eq!(t.unwrap().as_str(), cf); + }) + })(); // The correctness relies on the assumption that there will be only one // thread writing to the DB and increasing apply index. // Apply index will be set within DB lock, so it's correct even with manual @@ -214,8 +226,9 @@ let flushed = prs.last_flushed[offset]; if flushed > smallest_seqno { panic!( - "sealed seqno has been flushed {} {} {} <= {}", - cf, apply_index, smallest_seqno, flushed + "sealed seqno conflict with latest flushed index, cf {}, + sealed smallest_seqno {}, sealed largest_seqno {}, last_flushed {}, apply_index {}", + cf, smallest_seqno, largest_seqno, flushed, apply_index, ); } prs.prs.push_back(ApplyProgress { @@ -223,13 +236,18 @@ apply_index, smallest_seqno, }); + if t.saturating_elapsed() > HEAVY_WORKER_THRESHOLD { + warn!( + "heavy work in on_memtable_sealed, the code should be reviewed"; + ); + } } /// Called when a memtable has finished flushing. /// /// `largest_seqno` should be the largest seqno of the generated file. pub fn on_flush_completed(&self, cf: &str, largest_seqno: u64, file_no: u64) { - fail_point!("on_flush_completed"); + fail_point!("on_flush_completed", |_| {}); // Maybe we should hook the compaction to avoid the file is compacted before // being recorded.
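[Editor's note — illustration, not part of the patch. Worked numbers for the reorder guard just below: if last_flushed for the column family is already 100 and a completion callback arrives with largest_seqno = 90, the callback is stale (per facebook/rocksdb#11183, OnFlushCompleted may fire out of order) and is only logged; otherwise last_flushed advances to largest_seqno.]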
let offset = data_cf_offset(cf); @@ -239,7 +257,13 @@ if flushed >= largest_seqno { // According to facebook/rocksdb#11183, it's possible OnFlushCompleted can be // called out of order. But it's guaranteed files are installed in order. - info!("flush complete reorder found"; "flushed" => flushed, "largest_seqno" => largest_seqno, "file_no" => file_no, "cf" => cf); + info!( + "flush complete reorder found"; + "flushed" => flushed, + "largest_seqno" => largest_seqno, + "file_no" => file_no, + "cf" => cf + ); return; } prs.last_flushed[offset] = largest_seqno; diff --git a/components/engine_traits/src/misc.rs b/components/engine_traits/src/misc.rs index 7871b3b8ecc..b4cb4c9a233 100644 --- a/components/engine_traits/src/misc.rs +++ b/components/engine_traits/src/misc.rs @@ -57,12 +57,25 @@ pub trait StatisticsReporter { #[derive(Default)] pub struct RangeStats { - // The number of entries + // The number of entries in write cf. pub num_entries: u64, // The number of MVCC versions of all rows (num_entries - tombstones). pub num_versions: u64, // The number of rows. pub num_rows: u64, + // The number of MVCC deletes of all rows. + pub num_deletes: u64, +} + +impl RangeStats { + /// The number of redundant keys in the range. + /// It's calculated as `num_entries - num_rows + num_deletes`. + pub fn redundant_keys(&self) -> u64 { + // MVCC delete records also count as redundant keys. + self.num_entries + .saturating_sub(self.num_rows) + .saturating_add(self.num_deletes) + } } pub trait MiscExt: CfNamesExt + FlowControlFactorsExt + WriteBatchExt { @@ -121,6 +134,12 @@ pub trait MiscExt: CfNamesExt + FlowControlFactorsExt + WriteBatchExt { fn sync_wal(&self) -> Result<()>; + /// Disable manual compactions; some on-going manual compactions may be + /// aborted. + fn disable_manual_compaction(&self) -> Result<()>; + + fn enable_manual_compaction(&self) -> Result<()>; + /// Depending on the implementation, some on-going manual compactions may be /// aborted.
fn pause_background_work(&self) -> Result<()>; diff --git a/components/engine_traits/src/sst_partitioner.rs b/components/engine_traits/src/sst_partitioner.rs index bc6ec13a4eb..4a8ee9e71bc 100644 --- a/components/engine_traits/src/sst_partitioner.rs +++ b/components/engine_traits/src/sst_partitioner.rs @@ -22,6 +22,8 @@ pub struct SstPartitionerContext<'a> { pub output_level: i32, pub smallest_key: &'a [u8], pub largest_key: &'a [u8], + pub next_level_boundaries: Vec<&'a [u8]>, + pub next_level_sizes: Vec, } pub trait SstPartitioner { diff --git a/components/error_code/src/coprocessor.rs b/components/error_code/src/coprocessor.rs index d98c85162bf..31f0ed4224a 100644 --- a/components/error_code/src/coprocessor.rs +++ b/components/error_code/src/coprocessor.rs @@ -6,6 +6,7 @@ define_error_codes!( LOCKED => ("Locked", "", ""), DEADLINE_EXCEEDED => ("DeadlineExceeded", "", ""), MAX_PENDING_TASKS_EXCEEDED => ("MaxPendingTasksExceeded", "", ""), + MEMORY_QUOTA_EXCEEDED => ("MemoryQuotaExceeded", "", ""), INVALID_DATA_TYPE => ("InvalidDataType", "", ""), ENCODING => ("Encoding", "", ""), diff --git a/components/error_code/src/sst_importer.rs b/components/error_code/src/sst_importer.rs index 001f4f146f6..b092796d467 100644 --- a/components/error_code/src/sst_importer.rs +++ b/components/error_code/src/sst_importer.rs @@ -22,5 +22,11 @@ define_error_codes!( TTL_LEN_NOT_EQUALS_TO_PAIRS => ("TtlLenNotEqualsToPairs", "", ""), INCOMPATIBLE_API_VERSION => ("IncompatibleApiVersion", "", ""), INVALID_KEY_MODE => ("InvalidKeyMode", "", ""), - RESOURCE_NOT_ENOUTH => ("ResourceNotEnough", "", "") + RESOURCE_NOT_ENOUTH => ("ResourceNotEnough", "", ""), + SUSPENDED => ("Suspended", + "this request has been suspended.", + "Probably some export tools don't support exporting data inserted by `ingest` (say, snapshot backup). Check the user manual and stop them."), + REQUEST_TOO_NEW => ("RequestTooNew", "", ""), + REQUEST_TOO_OLD => ("RequestTooOld", "", ""), + DISK_SPACE_NOT_ENOUGH => ("DiskSpaceNotEnough", "", "") ); diff --git a/components/pd_client/src/OWNERS b/components/pd_client/src/OWNERS new file mode 100644 index 00000000000..70df3de925d --- /dev/null +++ b/components/pd_client/src/OWNERS @@ -0,0 +1,7 @@ +# See the OWNERS docs at https://go.k8s.io/owners +options: + no_parent_owners: true +filters: + "(OWNERS|config\\.rs)$": + approvers: + - sig-critical-approvers-config-components diff --git a/components/pd_client/src/client_v2.rs b/components/pd_client/src/client_v2.rs index 5b0d563f2b8..97b2702fc39 100644 --- a/components/pd_client/src/client_v2.rs +++ b/components/pd_client/src/client_v2.rs @@ -117,7 +117,7 @@ impl RawClient { /// Returns Ok(true) when a new connection is established.
async fn maybe_reconnect(&mut self, ctx: &ConnectContext, force: bool) -> Result<bool> { - PD_RECONNECT_COUNTER_VEC.with_label_values(&["try"]).inc(); + PD_RECONNECT_COUNTER_VEC.try_connect.inc(); let start = Instant::now(); let members = self.members.clone(); @@ -135,21 +135,15 @@ .await { Err(e) => { - PD_RECONNECT_COUNTER_VEC - .with_label_values(&["failure"]) - .inc(); + PD_RECONNECT_COUNTER_VEC.failure.inc(); return Err(e); } Ok(None) => { - PD_RECONNECT_COUNTER_VEC - .with_label_values(&["no-need"]) - .inc(); + PD_RECONNECT_COUNTER_VEC.no_need.inc(); return Ok(false); } Ok(Some(tuple)) => { - PD_RECONNECT_COUNTER_VEC - .with_label_values(&["success"]) - .inc(); + PD_RECONNECT_COUNTER_VEC.success.inc(); tuple } }; diff --git a/components/pd_client/src/lib.rs b/components/pd_client/src/lib.rs index 7a9d2cd2a61..ad2881a0b8c 100644 --- a/components/pd_client/src/lib.rs +++ b/components/pd_client/src/lib.rs @@ -24,7 +24,10 @@ use kvproto::{ resource_manager::TokenBucketsRequest, }; use pdpb::QueryStats; -use tikv_util::time::{Instant, UnixSecs}; +use tikv_util::{ + memory::HeapSize, + time::{Instant, UnixSecs}, +}; use txn_types::TimeStamp; pub use self::{ @@ -133,6 +136,12 @@ impl BucketMeta { } } +impl HeapSize for BucketMeta { + fn approximate_heap_size(&self) -> usize { + self.keys.approximate_heap_size() + self.sizes.approximate_heap_size() + } +} + #[derive(Debug, Clone)] pub struct BucketStat { pub meta: Arc, @@ -211,6 +220,15 @@ impl BucketStat { } } + pub fn clean_stats(&mut self, idx: usize) { + self.stats.write_keys[idx] = 0; + self.stats.write_bytes[idx] = 0; + self.stats.read_qps[idx] = 0; + self.stats.write_qps[idx] = 0; + self.stats.read_keys[idx] = 0; + self.stats.read_bytes[idx] = 0; + } + pub fn split(&mut self, idx: usize) { assert!(idx != 0); // inherit the traffic stats for splited bucket diff --git a/components/pd_client/src/metrics.rs b/components/pd_client/src/metrics.rs index d92e334396a..7e7121170d6 100644 --- a/components/pd_client/src/metrics.rs +++ b/components/pd_client/src/metrics.rs @@ -2,7 +2,7 @@ use lazy_static::lazy_static; use prometheus::*; -use prometheus_static_metric::{make_static_metric, register_static_histogram_vec}; +use prometheus_static_metric::*; make_static_metric! { pub label_enum PDRequestEventType { @@ -40,9 +40,34 @@ make_static_metric! { meta_storage_watch, } + pub label_enum PDReconnectEventKind { + success, + failure, + no_need, + cancel, + try_connect, + } + + pub label_enum StoreSizeEventType { + capacity, + available, + used, + snap_size, + raft_size, + kv_size, + import_size, + } + + pub struct StoreSizeEventIntrVec: IntGauge { + "type" => StoreSizeEventType, + } + pub struct PDRequestEventHistogramVec: Histogram { "type" => PDRequestEventType, } + pub struct PDReconnectEventCounterVec: IntCounter { + "type" => PDReconnectEventKind, + } } lazy_static! { @@ -66,12 +91,14 @@ lazy_static! { &["type"] ) .unwrap(); - pub static ref PD_RECONNECT_COUNTER_VEC: IntCounterVec = register_int_counter_vec!( - "tikv_pd_reconnect_total", - "Total number of PD reconnections.", - &["type"] - ) - .unwrap(); + pub static ref PD_RECONNECT_COUNTER_VEC: PDReconnectEventCounterVec = + register_static_int_counter_vec!( + PDReconnectEventCounterVec, + "tikv_pd_reconnect_total", + "Total number of PD reconnections.", + &["type"] + ) + .unwrap(); pub static ref PD_PENDING_HEARTBEAT_GAUGE: IntGauge = register_int_gauge!( "tikv_pd_pending_heartbeat_total", "Total number of pending region heartbeat" @@ -88,8 +115,14 @@ lazy_static!
{ &["type"] ) .unwrap(); - pub static ref STORE_SIZE_GAUGE_VEC: IntGaugeVec = - register_int_gauge_vec!("tikv_store_size_bytes", "Size of storage.", &["type"]).unwrap(); + pub static ref STORE_SIZE_EVENT_INT_VEC: StoreSizeEventIntrVec = + register_static_int_gauge_vec!( + StoreSizeEventIntrVec, + "tikv_store_size_bytes", + "Size of storage.", + &["type"] + ) + .unwrap(); pub static ref REGION_READ_KEYS_HISTOGRAM: Histogram = register_histogram!( "tikv_region_read_keys", "Histogram of keys written for regions", diff --git a/components/pd_client/src/util.rs b/components/pd_client/src/util.rs index 5491a51c047..66b084d4998 100644 --- a/components/pd_client/src/util.rs +++ b/components/pd_client/src/util.rs @@ -50,6 +50,7 @@ const MAX_RETRY_TIMES: u64 = 5; // The max duration when retrying to connect to leader. No matter if the // MAX_RETRY_TIMES is reached. const MAX_RETRY_DURATION: Duration = Duration::from_secs(10); +const MAX_BACKOFF: Duration = Duration::from_secs(3); // FIXME: Use a request-independent way to handle reconnection. pub const REQUEST_RECONNECT_INTERVAL: Duration = Duration::from_secs(1); // 1s @@ -116,6 +117,7 @@ pub struct Inner { pub rg_resp: Option>, last_try_reconnect: Instant, + bo: ExponentialBackoff, } impl Inner { @@ -168,7 +170,6 @@ pub struct Client { pub(crate) inner: RwLock, pub feature_gate: FeatureGate, enable_forwarding: bool, - retry_interval: Duration, } impl Client { @@ -219,6 +220,7 @@ impl Client { pending_heartbeat: Arc::default(), pending_buckets: Arc::default(), last_try_reconnect: Instant::now(), + bo: ExponentialBackoff::new(retry_interval), tso, meta_storage, rg_sender: Either::Left(Some(rg_sender)), @@ -226,7 +228,6 @@ impl Client { }), feature_gate: FeatureGate::default(), enable_forwarding, - retry_interval, } } @@ -363,17 +364,15 @@ impl Client { /// Note: Retrying too quickly will return an error due to cancellation. /// Please always try to reconnect after sending the request first. pub async fn reconnect(&self, force: bool) -> Result<()> { - PD_RECONNECT_COUNTER_VEC.with_label_values(&["try"]).inc(); + PD_RECONNECT_COUNTER_VEC.try_connect.inc(); let start = Instant::now(); let future = { let inner = self.inner.rl(); - if start.saturating_duration_since(inner.last_try_reconnect) < self.retry_interval { + if start.saturating_duration_since(inner.last_try_reconnect) < inner.bo.get_interval() { // Avoid unnecessary updating. // Prevent a large number of reconnections in a short time. - PD_RECONNECT_COUNTER_VEC - .with_label_values(&["cancel"]) - .inc(); + PD_RECONNECT_COUNTER_VEC.cancel.inc(); return Err(box_err!("cancel reconnection due to too small interval")); } let connector = PdConnector::new(inner.env.clone(), inner.security_mgr.clone()); @@ -394,36 +393,38 @@ impl Client { { let mut inner = self.inner.wl(); - if start.saturating_duration_since(inner.last_try_reconnect) < self.retry_interval { + if start.saturating_duration_since(inner.last_try_reconnect) < inner.bo.get_interval() { // There may be multiple reconnections that pass the read lock at the same time. // Check again in the write lock to avoid unnecessary updating. 
- PD_RECONNECT_COUNTER_VEC - .with_label_values(&["cancel"]) - .inc(); + PD_RECONNECT_COUNTER_VEC.cancel.inc(); return Err(box_err!("cancel reconnection due to too small interval")); } inner.last_try_reconnect = start; + inner.bo.next_backoff(); } slow_log!(start.saturating_elapsed(), "try reconnect pd"); let (client, target_info, members, tso) = match future.await { Err(e) => { - PD_RECONNECT_COUNTER_VEC - .with_label_values(&["failure"]) - .inc(); + PD_RECONNECT_COUNTER_VEC.failure.inc(); return Err(e); } - Ok(None) => { - PD_RECONNECT_COUNTER_VEC - .with_label_values(&["no-need"]) - .inc(); - return Ok(()); - } - Ok(Some(tuple)) => { - PD_RECONNECT_COUNTER_VEC - .with_label_values(&["success"]) - .inc(); - tuple + Ok(res) => { + // Reset the retry count. + { + let mut inner = self.inner.wl(); + inner.bo.reset() + } + match res { + None => { + PD_RECONNECT_COUNTER_VEC.no_need.inc(); + return Ok(()); + } + Some(tuple) => { + PD_RECONNECT_COUNTER_VEC.success.inc(); + tuple + } + } } }; @@ -900,6 +901,33 @@ impl PdConnector { } } +/// Simple backoff strategy. +struct ExponentialBackoff { + base: Duration, + interval: Duration, +} + +impl ExponentialBackoff { + pub fn new(base: Duration) -> Self { + Self { + base, + interval: base, + } + } + pub fn next_backoff(&mut self) -> Duration { + self.interval = std::cmp::min(self.interval * 2, MAX_BACKOFF); + self.interval + } + + pub fn get_interval(&self) -> Duration { + self.interval + } + + pub fn reset(&mut self) { + self.interval = self.base; + } +} + pub fn trim_http_prefix(s: &str) -> &str { s.trim_start_matches("http://") .trim_start_matches("https://") @@ -1045,8 +1073,11 @@ pub fn merge_bucket_stats, I: AsRef<[u8]>>( mod test { use kvproto::metapb::BucketStats; + use super::*; use crate::{merge_bucket_stats, util::find_bucket_index}; + const BASE_BACKOFF: Duration = Duration::from_millis(100); + #[test] fn test_merge_bucket_stats() { #[allow(clippy::type_complexity)] @@ -1162,4 +1193,23 @@ mod test { assert_eq!(find_bucket_index(b"k7", &keys), Some(4)); assert_eq!(find_bucket_index(b"k8", &keys), Some(4)); } + + #[test] + fn test_exponential_backoff() { + let mut backoff = ExponentialBackoff::new(BASE_BACKOFF); + assert_eq!(backoff.get_interval(), BASE_BACKOFF); + + assert_eq!(backoff.next_backoff(), 2 * BASE_BACKOFF); + assert_eq!(backoff.next_backoff(), Duration::from_millis(400)); + assert_eq!(backoff.get_interval(), Duration::from_millis(400)); + + // Should not exceed MAX_BACKOFF + for _ in 0..20 { + backoff.next_backoff(); + } + assert_eq!(backoff.get_interval(), MAX_BACKOFF); + + backoff.reset(); + assert_eq!(backoff.get_interval(), BASE_BACKOFF); + } } diff --git a/components/raft_log_engine/Cargo.toml b/components/raft_log_engine/Cargo.toml index 4304e181707..34b83ed7352 100644 --- a/components/raft_log_engine/Cargo.toml +++ b/components/raft_log_engine/Cargo.toml @@ -4,6 +4,9 @@ version = "0.0.1" publish = false edition = "2021" +[features] +failpoints = ["raft-engine/failpoints"] + [dependencies] codec = { workspace = true } encryption = { workspace = true } @@ -15,7 +18,7 @@ num_cpus = "1" online_config = { workspace = true } protobuf = "2" raft = { workspace = true } -raft-engine = { git = "https://github.com/tikv/raft-engine.git", features = ["swap"] } +raft-engine = { workspace = true } serde = "1.0" serde_derive = "1.0" slog = { workspace = true } diff --git a/components/raftstore-v2/src/batch/store.rs b/components/raftstore-v2/src/batch/store.rs index 4c142a43abf..23e41914012 100644 --- 
a/components/raftstore-v2/src/batch/store.rs +++ b/components/raftstore-v2/src/batch/store.rs @@ -1,7 +1,6 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. use std::{ - cmp, ops::{Deref, DerefMut}, path::Path, sync::{ @@ -48,7 +47,7 @@ use tikv_util::{ box_err, config::{Tracker, VersionTrack}, log::SlogFormat, - sys::SysQuota, + sys::{disk::get_disk_status, SysQuota}, time::{duration_to_sec, monotonic_raw_now, Instant as TiInstant, Limiter}, timer::{SteadyTimer, GLOBAL_TIMER_HANDLE}, worker::{Builder, LazyWorker, Scheduler, Worker}, @@ -105,6 +104,10 @@ pub struct StoreContext { /// Disk usage for the store itself. pub self_disk_usage: DiskUsage, + // TODO: how to remove offlined stores? + /// Disk usage for other stores. The store itself is not included. + /// Only contains items which are not `DiskUsage::Normal`. + pub store_disk_usages: HashMap<u64, DiskUsage>, pub snap_mgr: TabletSnapManager, pub global_stat: GlobalStoreStat, @@ -140,7 +143,7 @@ impl StoreContext { self.tick_batch[PeerTick::CheckLongUncommitted as usize].wait_duration = self.cfg.check_long_uncommitted_interval.0; self.tick_batch[PeerTick::GcPeer as usize].wait_duration = - 60 * cmp::min(Duration::from_secs(1), self.cfg.raft_base_tick_interval.0); + self.cfg.gc_peer_check_interval.0; } // Return None means it has passed unsafe vote period. @@ -229,6 +232,7 @@ impl PollHandler StoreRouter { msg: Box<RaftMessage>, ) -> std::result::Result<(), TrySendError<Box<RaftMessage>>> { let id = msg.get_region_id(); - let peer_msg = PeerMsg::RaftMessage(msg); + let peer_msg = PeerMsg::RaftMessage(msg, Some(TiInstant::now())); let store_msg = match self.router.try_send(id, peer_msg) { Either::Left(Ok(())) => return Ok(()), - Either::Left(Err(TrySendError::Full(PeerMsg::RaftMessage(m)))) => { + Either::Left(Err(TrySendError::Full(PeerMsg::RaftMessage(m, _)))) => { return Err(TrySendError::Full(m)); } - Either::Left(Err(TrySendError::Disconnected(PeerMsg::RaftMessage(m)))) => { + Either::Left(Err(TrySendError::Disconnected(PeerMsg::RaftMessage(m, _)))) => { return Err(TrySendError::Disconnected(m)); } - Either::Right(PeerMsg::RaftMessage(m)) => StoreMsg::RaftMessage(m), + Either::Right(PeerMsg::RaftMessage(m, _)) => StoreMsg::RaftMessage(m), _ => unreachable!(), }; match self.router.send_control(store_msg) { diff --git a/components/raftstore-v2/src/fsm/peer.rs b/components/raftstore-v2/src/fsm/peer.rs index f6b9217ecbf..47a1aee1ef4 100644 --- a/components/raftstore-v2/src/fsm/peer.rs +++ b/components/raftstore-v2/src/fsm/peer.rs @@ -9,7 +9,7 @@ use crossbeam::channel::TryRecvError; use encryption_export::DataKeyManager; use engine_traits::{KvEngine, RaftEngine, TabletRegistry}; use kvproto::{errorpb, raft_cmdpb::RaftCmdResponse}; -use raftstore::store::{Config, TabletSnapManager, Transport}; +use raftstore::store::{Config, ReadCallback, TabletSnapManager, Transport}; use slog::{debug, info, trace, Logger}; use tikv_util::{ is_zero_duration, @@ -17,6 +17,7 @@ use tikv_util::{ slog_panic, time::{duration_to_sec, Instant}, }; +use tracker::{TrackerToken, GLOBAL_TRACKERS}; use crate::{ batch::StoreContext, @@ -195,6 +196,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, self.schedule_tick(PeerTick::SplitRegionCheck); self.schedule_tick(PeerTick::PdHeartbeat); self.schedule_tick(PeerTick::CompactLog); + self.fsm.peer.on_check_merge(self.store_ctx); if self.fsm.peer.storage().is_initialized() { self.fsm.peer.schedule_apply_fsm(self.store_ctx); } @@ -206,11 +208,17 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport>
PeerFsmDelegate<'a, EK, ER, } #[inline] - fn on_receive_command(&self, send_time: Instant) { + fn on_receive_command(&self, send_time: Instant, read_token: Option<TrackerToken>) { + let propose_wait_time = send_time.saturating_elapsed(); self.store_ctx .raft_metrics .propose_wait_time - .observe(duration_to_sec(send_time.saturating_elapsed())); + .observe(duration_to_sec(propose_wait_time)); + if let Some(token) = read_token { + GLOBAL_TRACKERS.with_tracker(token, |tracker| { + tracker.metrics.read_index_propose_wait_nanos = propose_wait_time.as_nanos() as u64; + }); + } } fn on_tick(&mut self, tick: PeerTick) { @@ -239,30 +247,33 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, pub fn on_msgs(&mut self, peer_msgs_buf: &mut Vec<PeerMsg>) { for msg in peer_msgs_buf.drain(..) { match msg { - PeerMsg::RaftMessage(msg) => { - self.fsm.peer.on_raft_message(self.store_ctx, msg); + PeerMsg::RaftMessage(msg, send_time) => { + self.fsm + .peer + .on_raft_message(self.store_ctx, msg, send_time); } PeerMsg::RaftQuery(cmd) => { - self.on_receive_command(cmd.send_time); + self.on_receive_command(cmd.send_time, cmd.ch.read_tracker()); self.on_query(cmd.request, cmd.ch) } PeerMsg::AdminCommand(cmd) => { - self.on_receive_command(cmd.send_time); + self.on_receive_command(cmd.send_time, None); self.fsm .peer_mut() .on_admin_command(self.store_ctx, cmd.request, cmd.ch) } PeerMsg::SimpleWrite(write) => { - self.on_receive_command(write.send_time); + self.on_receive_command(write.send_time, None); self.fsm.peer_mut().on_simple_write( self.store_ctx, write.header, write.data, write.ch, + Some(write.extra_opts), ); } PeerMsg::UnsafeWrite(write) => { - self.on_receive_command(write.send_time); + self.on_receive_command(write.send_time, None); self.fsm .peer_mut() .on_unsafe_write(self.store_ctx, write.data); @@ -307,6 +318,9 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, PeerMsg::StoreUnreachable { to_store_id } => { self.fsm.peer_mut().on_store_unreachable(to_store_id) } + PeerMsg::StoreMaybeTombstone { store_id } => { + self.fsm.peer_mut().on_store_maybe_tombstone(store_id) + } PeerMsg::SnapshotSent { to_peer_id, status } => { self.fsm.peer_mut().on_snapshot_sent(to_peer_id, status) } @@ -370,9 +384,10 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, syncer, failed_stores, ), - PeerMsg::ExitForceLeaderState => { - self.fsm.peer_mut().on_exit_force_leader(self.store_ctx) - } + PeerMsg::ExitForceLeaderState => self + .fsm + .peer_mut() + .on_exit_force_leader(self.store_ctx, false), PeerMsg::ExitForceLeaderStateCampaign => { self.fsm.peer_mut().on_exit_force_leader_campaign() } diff --git a/components/raftstore-v2/src/operation/bucket.rs b/components/raftstore-v2/src/operation/bucket.rs index 432ea72456a..920a4e68e8c 100644 --- a/components/raftstore-v2/src/operation/bucket.rs +++ b/components/raftstore-v2/src/operation/bucket.rs @@ -6,15 +6,15 @@ use std::sync::Arc; use engine_traits::{KvEngine, RaftEngine}; use kvproto::{ - metapb::{self, RegionEpoch}, + metapb::RegionEpoch, raft_serverpb::{ExtraMessageType, RaftMessage, RefreshBuckets}, }; -use pd_client::{BucketMeta, BucketStat}; +use pd_client::BucketMeta; use raftstore::{ coprocessor::RegionChangeEvent, store::{util, Bucket, BucketRange, ReadProgress, SplitCheckTask, Transport}, }; -use slog::{error, info, warn}; +use slog::{error, info}; use crate::{ batch::StoreContext, @@ -24,254 +24,40 @@ use crate::{ worker::pd, }; -#[derive(Debug, Clone, Default)] -pub struct
BucketStatsInfo { - bucket_stat: Option, - // the last buckets records the stats that the recently refreshed. - last_bucket_stat: Option, - // the report bucket stat records the increment stats after last report pd. - // it will be reset after report pd. - report_bucket_stat: Option, - // last bucket count. - // BucketStat.meta is Arc so it cannot be used for last bucket count - last_bucket_count: usize, -} - -impl BucketStatsInfo { - /// returns all bucket ranges those's write_bytes exceed the given - /// diff_size_threshold. - pub fn gen_bucket_range_for_update( - &self, - diff_size_threshold: u64, - ) -> Option> { - let region_buckets = self.bucket_stat.as_ref()?; - let stats = ®ion_buckets.stats; - let keys = ®ion_buckets.meta.keys; - - let empty_last_keys = vec![]; - let empty_last_stats = metapb::BucketStats::default(); - let (last_keys, last_stats, stats_reset) = self - .last_bucket_stat - .as_ref() - .map(|b| { - ( - &b.meta.keys, - &b.stats, - region_buckets.create_time != b.create_time, - ) - }) - .unwrap_or((&empty_last_keys, &empty_last_stats, false)); - - let mut bucket_ranges = vec![]; - let mut j = 0; - assert_eq!(keys.len(), stats.write_bytes.len() + 1); - for i in 0..stats.write_bytes.len() { - let mut diff_in_bytes = stats.write_bytes[i]; - while j < last_keys.len() && keys[i] > last_keys[j] { - j += 1; - } - if j < last_keys.len() && keys[i] == last_keys[j] { - if !stats_reset { - diff_in_bytes -= last_stats.write_bytes[j]; - } - j += 1; - } - if diff_in_bytes >= diff_size_threshold { - bucket_ranges.push(BucketRange(keys[i].clone(), keys[i + 1].clone())); - } - } - Some(bucket_ranges) - } - - #[inline] - pub fn version(&self) -> u64 { - self.bucket_stat - .as_ref() - .or(self.last_bucket_stat.as_ref()) - .map(|b| b.meta.version) - .unwrap_or_default() - } - #[inline] - pub fn add_bucket_flow(&mut self, delta: &Option) { - if let (Some(buckets), Some(report_buckets), Some(delta)) = ( - self.bucket_stat.as_mut(), - self.report_bucket_stat.as_mut(), - delta, - ) { - buckets.merge(delta); - report_buckets.merge(delta); - } - } - - #[inline] - pub fn set_bucket_stat(&mut self, buckets: Option) { - if let Some(b) = self.bucket_stat.take() { - self.last_bucket_stat = Some(b); - } - self.report_bucket_stat = buckets.clone(); - self.bucket_stat = buckets; - self.last_bucket_count = self - .bucket_stat - .as_ref() - .map_or(0, |bucket_stat| bucket_stat.meta.keys.len() - 1); - } - - #[inline] - pub fn clear_bucket_stat(&mut self) { - if let Some(bucket) = self.report_bucket_stat.as_mut() { - bucket.clear_stats(); - } - } - - #[inline] - pub fn report_bucket_stat(&mut self) -> BucketStat { - let current = self.report_bucket_stat.as_mut().unwrap(); - let delta = current.clone(); - current.clear_stats(); - delta - } - - #[inline] - pub fn bucket_stat(&self) -> &Option { - &self.bucket_stat - } - - #[inline] - pub fn last_bucket_count(&self) -> usize { - self.last_bucket_count - } -} - impl Peer { #[inline] pub fn on_refresh_region_buckets( &mut self, store_ctx: &mut StoreContext, region_epoch: RegionEpoch, - mut buckets: Vec, + buckets: Vec, bucket_ranges: Option>, ) { - // bucket version layout - // term logical counter - // |-----------|-----------| - // high bits low bits - // term: given 10s election timeout, the 32 bit means 1362 year running time - let gen_bucket_version = |term, current_version| { - let current_version_term = current_version >> 32; - let bucket_version: u64 = if current_version_term == term { - current_version + 1 - } else { - if term > u32::MAX.into() { - 
error!( - self.logger, - "unexpected term {} more than u32::MAX. Bucket - version will be backward.", - term - ); - } - term << 32 - }; - bucket_version - }; - - let region = self.region(); - let current_version = self.region_buckets_info().version(); - let next_bucket_version = gen_bucket_version(self.term(), current_version); - let mut is_first_refresh = true; - let mut change_bucket_version = false; - let mut region_buckets: BucketStat; - - // The region buckets reset after this region happened split or merge. - // The message should be dropped if it's epoch is lower than the regions. - // The bucket ranges is none when the region buckets is also none. - // So this condition indicates that the region buckets needs to refresh not - // renew. - if let (Some(bucket_ranges), Some(peer_region_buckets)) = - (bucket_ranges, self.region_buckets_info().bucket_stat()) - { - is_first_refresh = false; - assert_eq!(buckets.len(), bucket_ranges.len()); - let mut meta_idx = 0; - region_buckets = peer_region_buckets.clone(); - let mut meta = (*region_buckets.meta).clone(); - meta.region_epoch = region_epoch; - for (bucket, bucket_range) in buckets.into_iter().zip(bucket_ranges) { - // the bucket ranges maybe need to split or merge not all the meta keys, so it - // needs to find the first keys. - while meta_idx < meta.keys.len() && meta.keys[meta_idx] != bucket_range.0 { - meta_idx += 1; - } - // meta_idx can't be not the last entry (which is end key) - if meta_idx >= meta.keys.len() - 1 { - warn!( - self.logger, - "can't find the bucket key"; - "bucket_range_key" => log_wrappers::Value::key(&bucket_range.0)); - break; - } - // the bucket size is small and does not have split keys, - // then it should be merged with its left neighbor - let region_bucket_merge_size = store_ctx - .coprocessor_host - .cfg - .region_bucket_merge_size_ratio - * (store_ctx.coprocessor_host.cfg.region_bucket_size.0 as f64); - if bucket.keys.is_empty() && bucket.size <= (region_bucket_merge_size as u64) { - meta.sizes[meta_idx] = bucket.size; - // the region has more than one bucket - // and the left neighbor + current bucket size is not very big - if meta.keys.len() > 2 - && meta_idx != 0 - && meta.sizes[meta_idx - 1] + bucket.size - < store_ctx.coprocessor_host.cfg.region_bucket_size.0 * 2 - { - // bucket is too small - region_buckets.left_merge(meta_idx); - meta.left_merge(meta_idx); - change_bucket_version = true; - continue; - } - } else { - // update size - meta.sizes[meta_idx] = bucket.size / (bucket.keys.len() + 1) as u64; - // insert new bucket keys (split the original bucket) - for bucket_key in bucket.keys { - meta_idx += 1; - region_buckets.split(meta_idx); - meta.split(meta_idx, bucket_key); - change_bucket_version = true; - } - } - meta_idx += 1; - } - if self.region_buckets_info().last_bucket_count() != region_buckets.meta.keys.len() - 1 - { - change_bucket_version = true; - } - if change_bucket_version { - meta.version = next_bucket_version; - } - region_buckets.meta = Arc::new(meta); - } else { - // when the region buckets is none, the exclusive buckets includes all the - // bucket keys. - assert_eq!(buckets.len(), 1); - change_bucket_version = true; - let bucket_keys = buckets.pop().unwrap().keys; - let bucket_count = bucket_keys.len() + 1; - let mut meta = BucketMeta { - region_id: self.region_id(), - region_epoch, - version: next_bucket_version, - keys: bucket_keys, - sizes: vec![store_ctx.coprocessor_host.cfg.region_bucket_size.0; bucket_count], - }; - // padding the boundary keys and initialize the flow. 
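The removed block above also documents the bucket-version layout that survives this refactor; the helper has simply moved into raftstore's shared util (the new code below calls util::gen_bucket_version). A minimal restatement of the packing, kept from the deleted comment:

// Bucket version layout: term in the high 32 bits, a logical counter in
// the low 32 bits. Bump the counter while the term is unchanged; start a
// fresh counter at `term << 32` once the term moves on.
fn gen_bucket_version(term: u64, current_version: u64) -> u64 {
    let current_version_term = current_version >> 32;
    if current_version_term == term {
        current_version + 1
    } else {
        term << 32
    }
}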
- meta.keys.insert(0, region.get_start_key().to_vec()); - meta.keys.push(region.get_end_key().to_vec()); - region_buckets = BucketStat::from_meta(Arc::new(meta)); + if self.term() > u32::MAX.into() { + error!( + self.logger, + "unexpected term {} more than u32::MAX. Bucket version will be backward.", + self.term() + ); } + let current_version = self.region_buckets_info().version(); + let next_bucket_version = util::gen_bucket_version(self.term(), current_version); + let region = self.region().clone(); + let change_bucket_version = self.region_buckets_info_mut().on_refresh_region_buckets( + &store_ctx.coprocessor_host.cfg, + next_bucket_version, + buckets, + region_epoch, + &region, + bucket_ranges, + ); + let region_buckets = self + .region_buckets_info() + .bucket_stat() + .as_ref() + .unwrap() + .clone(); let buckets_count = region_buckets.meta.keys.len() - 1; if change_bucket_version { // TODO: we may need to make it debug once the coprocessor timeout is resolved. @@ -281,17 +67,18 @@ impl Peer { "bucket_version" => next_bucket_version, "buckets_count" => buckets_count, "estimated_region_size" => region_buckets.meta.total_size(), - "first_refresh" => is_first_refresh, ); + } else { + // The bucket key ranges have not changed, so there is no need to refresh. + return; } + store_ctx.coprocessor_host.on_region_changed( - region, + self.region(), RegionChangeEvent::UpdateBuckets(buckets_count), self.state_role(), ); let meta = region_buckets.meta.clone(); - self.region_buckets_info_mut() - .set_bucket_stat(Some(region_buckets.clone())); { let mut store_meta = store_ctx.store_meta.lock().unwrap(); if let Some(reader) = store_meta.readers.get_mut(&self.region_id()) { @@ -302,13 +89,13 @@ impl Peer { if let Some(apply_scheduler) = self.apply_scheduler() { apply_scheduler.send(ApplyTask::RefreshBucketStat(region_buckets.meta.clone())); } + if !self.is_leader() { + return; + } let version = region_buckets.meta.version; let keys = region_buckets.meta.keys.clone(); // Notify followers to flush their relevant memtables let peers = self.region().get_peers().to_vec(); - if !self.is_leader() { - return; - } for p in peers { if p == *self.peer() || p.is_witness { continue; } @@ -397,9 +184,9 @@ impl Peer { if !ctx.coprocessor_host.cfg.enable_region_bucket() { return None; } - let bucket_update_diff_size_threshold = ctx.coprocessor_host.cfg.region_bucket_size.0 / 2; + let region_bucket_max_size = ctx.coprocessor_host.cfg.region_bucket_size.0 * 2; self.region_buckets_info() - .gen_bucket_range_for_update(bucket_update_diff_size_threshold) + .gen_bucket_range_for_update(region_bucket_max_size) } } diff --git a/components/raftstore-v2/src/operation/command/admin/compact_log.rs b/components/raftstore-v2/src/operation/command/admin/compact_log.rs index 8920ea97e1d..1c4538ab51e 100644 --- a/components/raftstore-v2/src/operation/command/admin/compact_log.rs +++ b/components/raftstore-v2/src/operation/command/admin/compact_log.rs @@ -13,7 +13,13 @@ //! Updates truncated index, and compacts logs if the corresponding changes have //! been persisted in kvdb. -use std::path::PathBuf; +use std::{ + path::PathBuf, + sync::{ + atomic::{AtomicU64, Ordering}, + Arc, + }, +}; use engine_traits::{KvEngine, RaftEngine, RaftLogBatch}; use kvproto::raft_cmdpb::{AdminCmdType, AdminRequest, AdminResponse, RaftCmdRequest}; @@ -50,16 +56,21 @@ pub struct CompactLogContext { /// persisted. When persisted_apply is advanced, we need to notify tablet /// worker to destroy them.
tombstone_tablets_wait_index: Vec, + /// Sometimes a tombstone tablet can be registered after tablet index is + /// advanced. We should not consider it as an active tablet otherwise it + /// might block peer destroy progress. + persisted_tablet_index: Arc, } impl CompactLogContext { - pub fn new(last_applying_index: u64) -> CompactLogContext { + pub fn new(last_applying_index: u64, persisted_applied: u64) -> CompactLogContext { CompactLogContext { skipped_ticks: 0, approximate_log_size: 0, last_applying_index, last_compacted_idx: 0, tombstone_tablets_wait_index: vec![], + persisted_tablet_index: AtomicU64::new(persisted_applied).into(), } } @@ -134,6 +145,8 @@ impl Peer { store_ctx: &mut StoreContext, force: bool, ) { + fail::fail_point!("maybe_propose_compact_log", |_| {}); + // As leader, we would not keep caches for the peers that didn't response // heartbeat in the last few seconds. That happens probably because // another TiKV is down. In this case if we do not clean up the cache, @@ -379,7 +392,9 @@ impl Peer { )); } - /// Returns if there's any tombstone being removed. + /// Returns if there's any tombstone being removed. `persisted` state may + /// not be persisted yet, caller is responsible for actually destroying the + /// physical tablets afterwards. #[inline] pub fn remove_tombstone_tablets(&mut self, persisted: u64) -> bool { let compact_log_context = self.compact_log_context_mut(); @@ -398,11 +413,21 @@ impl Peer { } } + /// User can only increase this counter. + #[inline] + pub fn remember_persisted_tablet_index(&self) -> Arc { + self.compact_log_context().persisted_tablet_index.clone() + } + + /// Returns whether there's any tombstone tablet newer than persisted tablet + /// index. They might still be referenced by inflight apply and cannot be + /// destroyed. pub fn has_pending_tombstone_tablets(&self) -> bool { - !self - .compact_log_context() - .tombstone_tablets_wait_index - .is_empty() + let ctx = self.compact_log_context(); + let persisted = ctx.persisted_tablet_index.load(Ordering::Relaxed); + ctx.tombstone_tablets_wait_index + .iter() + .any(|i| *i > persisted) } #[inline] @@ -411,6 +436,8 @@ impl Peer { ctx: &StoreContext, task: &mut WriteTask, ) { + let applied_index = self.entry_storage().applied_index(); + self.remove_tombstone_tablets(applied_index); assert!( !self.has_pending_tombstone_tablets(), "{} all tombstone should be cleared before being destroyed.", @@ -421,7 +448,6 @@ impl Peer { None => return, }; let region_id = self.region_id(); - let applied_index = self.entry_storage().applied_index(); let sched = ctx.schedulers.tablet.clone(); let _ = sched.schedule(tablet::Task::prepare_destroy( tablet, @@ -557,13 +583,17 @@ impl Peer { } if self.remove_tombstone_tablets(new_persisted) { let sched = store_ctx.schedulers.tablet.clone(); + let counter = self.remember_persisted_tablet_index(); if !task.has_snapshot { task.persisted_cbs.push(Box::new(move || { let _ = sched.schedule(tablet::Task::destroy(region_id, new_persisted)); + // Writer guarantees no race between different callbacks. + counter.store(new_persisted, Ordering::Relaxed); })); } else { // In snapshot, the index is persisted, tablet can be destroyed directly. 
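The persisted_tablet_index counter introduced here exists because a tombstone tablet may still be referenced by an inflight apply; only tablets whose wait index is not newer than the last persisted tablet index are safe to destroy. A minimal sketch of the check, with simplified types:

use std::sync::{
    atomic::{AtomicU64, Ordering},
    Arc,
};

// Simplified pending-tombstone check: a tombstone tablet stays "pending"
// while its wait index is newer than the persisted tablet index, because
// an inflight apply may still read from it.
struct TombstoneTablets {
    wait_indexes: Vec<u64>,
    persisted_tablet_index: Arc<AtomicU64>,
}

impl TombstoneTablets {
    fn has_pending(&self) -> bool {
        let persisted = self.persisted_tablet_index.load(Ordering::Relaxed);
        self.wait_indexes.iter().any(|i| *i > persisted)
    }
}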
let _ = sched.schedule(tablet::Task::destroy(region_id, new_persisted)); + counter.store(new_persisted, Ordering::Relaxed); } } } diff --git a/components/raftstore-v2/src/operation/command/admin/conf_change.rs b/components/raftstore-v2/src/operation/command/admin/conf_change.rs index 2bd06fca6c2..5c7ff96a955 100644 --- a/components/raftstore-v2/src/operation/command/admin/conf_change.rs +++ b/components/raftstore-v2/src/operation/command/admin/conf_change.rs @@ -10,6 +10,7 @@ use std::time::Instant; use engine_traits::{KvEngine, RaftEngine, RaftLogBatch}; +use fail::fail_point; use kvproto::{ metapb::{self, PeerRole}, raft_cmdpb::{AdminRequest, AdminResponse, ChangePeerRequest, RaftCmdRequest}, @@ -105,6 +106,7 @@ impl Peer { changes.as_ref(), &cc, self.is_in_force_leader(), + self.get_peer_heartbeats(), )?; // TODO: check if the new peer is already in history record. @@ -392,6 +394,14 @@ impl Apply { match change_type { ConfChangeType::AddNode => { + let add_node_fp = || { + fail_point!( + "apply_on_add_node_1_2", + self.peer_id() == 2 && self.region_id() == 1, + |_| {} + ) + }; + add_node_fp(); PEER_ADMIN_CMD_COUNTER_VEC .with_label_values(&["add_peer", "all"]) .inc(); @@ -595,15 +605,22 @@ impl Apply { "update gc peer"; "index" => log_index, "updates" => ?updates, - "gc_peers" => ?removed_records, - "merged_peers" => ?merged_records + "removed_records" => ?removed_records, + "merged_records" => ?merged_records ); removed_records.retain(|p| !updates.contains(&p.get_id())); merged_records.retain_mut(|r| { - let mut sources: Vec<_> = r.take_source_peers().into(); - sources.retain(|p| !updates.contains(&p.get_id())); - r.set_source_peers(sources.into()); - !r.get_source_peers().is_empty() + // Clean up source peers if they acknowledge GcPeerRequest. + let mut source_peers: Vec<_> = r.take_source_peers().into(); + source_peers.retain(|p| !updates.contains(&p.get_id())); + r.set_source_peers(source_peers.into()); + // Clean up source removed records (peers) if they acknowledge GcPeerRequest. + let mut source_removed_records: Vec<_> = r.take_source_removed_records().into(); + source_removed_records.retain(|p| !updates.contains(&p.get_id())); + r.set_source_removed_records(source_removed_records.into()); + // Clean up merged records if all source peers and source removed records are + // empty. 
+ !r.get_source_peers().is_empty() || !r.get_source_removed_records().is_empty() }); self.region_state_mut() .set_removed_records(removed_records.into()); diff --git a/components/raftstore-v2/src/operation/command/admin/merge/commit.rs b/components/raftstore-v2/src/operation/command/admin/merge/commit.rs index 5bd92e3ea1c..b12ba9eaf9d 100644 --- a/components/raftstore-v2/src/operation/command/admin/merge/commit.rs +++ b/components/raftstore-v2/src/operation/command/admin/merge/commit.rs @@ -172,11 +172,17 @@ impl Peer { &mut self, store_ctx: &mut StoreContext, ) { + fail::fail_point!("on_schedule_merge", |_| {}); fail::fail_point!( "ask_target_peer_to_commit_merge_2", self.region_id() == 2, |_| {} ); + fail::fail_point!( + "ask_target_peer_to_commit_merge_store_1", + store_ctx.store_id == 1, + |_| {} + ); let state = self.applied_merge_state().unwrap(); let target = state.get_target(); let target_id = target.get_id(); @@ -198,7 +204,7 @@ impl Peer { Ok(ents) => ents, Err(e) => slog_panic!( self.logger, - "failed to get merge entires"; + "failed to get merge entries"; "err" => ?e, "low" => low, "commit" => state.get_commit() @@ -261,6 +267,7 @@ impl Peer { store_ctx: &mut StoreContext, req: RaftCmdRequest, ) { + fail::fail_point!("on_ask_commit_merge", |_| {}); let expected_epoch = req.get_header().get_region_epoch(); let merge = req.get_admin_request().get_commit_merge(); assert!(merge.has_source_state() && merge.get_source_state().has_merge_state()); @@ -293,7 +300,10 @@ impl Peer { target_id: self.region_id(), }, ); - } else if util::is_epoch_stale(expected_epoch, region.get_region_epoch()) { + return; + } + // current region_epoch > region epoch in commit merge. + if util::is_epoch_stale(expected_epoch, region.get_region_epoch()) { info!( self.logger, "reject commit merge because of stale"; @@ -304,56 +314,51 @@ impl Peer { let _ = store_ctx .router .force_send(source_id, PeerMsg::RejectCommitMerge { index }); - } else if expected_epoch == region.get_region_epoch() { - assert!( - util::is_sibling_regions(source_region, region), - "{}: {:?}, {:?}", - SlogFormat(&self.logger), - source_region, - region - ); - assert!( - region_on_same_stores(source_region, region), - "{:?}, {:?}", - source_region, - region - ); - assert!(!self.storage().has_dirty_data()); - if self.is_leader() { - let index = commit_of_merge(req.get_admin_request().get_commit_merge()); - if self.proposal_control().is_merging() { - // `on_admin_command` may delay our request indefinitely. It's better to check - // directly. - info!( - self.logger, - "reject commit merge because of target is merging with another region"; - ); - } else { - let (ch, res) = CmdResChannel::pair(); - self.on_admin_command(store_ctx, req, ch); - if let Some(res) = res.take_result() - && res.get_header().has_error() - { - error!( - self.logger, - "failed to propose commit merge"; - "source" => source_id, - "res" => ?res, - ); - } else { - return; - } - } - let _ = store_ctx - .router - .force_send(source_id, PeerMsg::RejectCommitMerge { index }); - } - } else { + return; + } + // current region_epoch < region epoch in commit merge. 
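The restructured on_ask_commit_merge below handles both stale directions explicitly before asserting that the epochs match. An illustrative condensation with stand-in types, assuming is_epoch_stale(a, b) reads as "a is older than b", as in raftstore's util:

// Stand-in region epoch for illustration.
#[derive(PartialEq)]
struct Epoch {
    version: u64,
    conf_ver: u64,
}

fn is_epoch_stale(a: &Epoch, b: &Epoch) -> bool {
    a.version < b.version || a.conf_ver < b.conf_ver
}

enum MergeDecision {
    RejectStaleSource, // request epoch older than ours: tell source to rollback
    WaitTargetCatchUp, // our epoch older than the request's: skip for now
    Propose,           // epochs match: safe to propose CommitMerge
}

fn decide(expected: &Epoch, current: &Epoch) -> MergeDecision {
    if is_epoch_stale(expected, current) {
        MergeDecision::RejectStaleSource
    } else if is_epoch_stale(current, expected) {
        MergeDecision::WaitTargetCatchUp
    } else {
        MergeDecision::Propose
    }
}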
+ if util::is_epoch_stale(region.get_region_epoch(), expected_epoch) { info!( self.logger, - "ignore commit merge because self epoch is stale"; + "target region still not catch up, skip."; "source" => ?source_region, + "target_region_epoch" => ?expected_epoch, + "exist_region_epoch" => ?self.region().get_region_epoch(), + ); + return; + } + assert!( + util::is_sibling_regions(source_region, region), + "{}: {:?}, {:?}", + SlogFormat(&self.logger), + source_region, + region + ); + assert!( + region_on_same_stores(source_region, region), + "{:?}, {:?}", + source_region, + region + ); + assert!(!self.storage().has_dirty_data()); + let (ch, res) = CmdResChannel::pair(); + self.on_admin_command(store_ctx, req, ch); + if let Some(res) = res.take_result() + && res.get_header().has_error() + { + error!( + self.logger, + "failed to propose commit merge"; + "source" => source_id, + "res" => ?res, ); + fail::fail_point!( + "on_propose_commit_merge_fail_store_1", + store_ctx.store_id == 1, + |_| {} + ); + } else { + fail::fail_point!("on_propose_commit_merge_success"); } } @@ -362,6 +367,7 @@ impl Peer { store_ctx: &mut StoreContext, req: RaftCmdRequest, ) -> Result { + (|| fail::fail_point!("propose_commit_merge_1", store_ctx.store_id == 1, |_| {}))(); let mut proposal_ctx = ProposalContext::empty(); proposal_ctx.insert(ProposalContext::COMMIT_MERGE); let data = req.write_to_bytes().unwrap(); @@ -532,9 +538,6 @@ impl Apply { state.set_state(PeerState::Normal); assert!(!state.has_merge_state()); state.set_tablet_index(index); - let mut removed_records: Vec<_> = state.take_removed_records().into(); - removed_records.append(&mut source_state.get_removed_records().into()); - state.set_removed_records(removed_records.into()); let mut merged_records: Vec<_> = state.take_merged_records().into(); merged_records.append(&mut source_state.get_merged_records().into()); state.set_merged_records(merged_records.into()); @@ -542,6 +545,7 @@ impl Apply { merged_record.set_source_region_id(source_region.get_id()); merged_record.set_source_epoch(source_region.get_region_epoch().clone()); merged_record.set_source_peers(source_region.get_peers().into()); + merged_record.set_source_removed_records(source_state.get_removed_records().into()); merged_record.set_target_region_id(region.get_id()); merged_record.set_target_epoch(region.get_region_epoch().clone()); merged_record.set_target_peers(region.get_peers().into()); @@ -683,6 +687,8 @@ impl Peer { info!( self.logger, "become follower for new logs"; + "first_log_term" => first.term, + "first_log_index" => first.index, "new_log_term" => last_log.term, "new_log_index" => last_log.index, "term" => self.term(), @@ -730,6 +736,12 @@ impl Peer { store_ctx: &mut StoreContext, mut res: CommitMergeResult, ) { + fail::fail_point!( + "on_apply_res_commit_merge_2", + self.peer().store_id == 2, + |_| {} + ); + let region = res.region_state.get_region(); assert!( res.source.get_end_key() == region.get_end_key() @@ -815,6 +827,7 @@ impl Peer { "target_region" => ?self.region(), ); self.add_pending_tick(PeerTick::SplitRegionCheck); + self.maybe_schedule_gc_peer_tick(); } } diff --git a/components/raftstore-v2/src/operation/command/admin/merge/prepare.rs b/components/raftstore-v2/src/operation/command/admin/merge/prepare.rs index d3d1896287c..4a5875f7097 100644 --- a/components/raftstore-v2/src/operation/command/admin/merge/prepare.rs +++ b/components/raftstore-v2/src/operation/command/admin/merge/prepare.rs @@ -219,22 +219,7 @@ impl Peer { if r.is_ok() { 
self.proposal_control_mut().set_pending_prepare_merge(false); } else { - // Match v1::post_propose_fail. - // If we just failed to propose PrepareMerge, the pessimistic locks status - // may become MergingRegion incorrectly. So, we have to revert it here. - // Note: The `is_merging` check from v1 is removed because proposed - // `PrepareMerge` rejects all writes (in `ProposalControl::check_conflict`). - assert!( - !self.proposal_control().is_merging(), - "{}", - SlogFormat(&self.logger) - ); - self.take_merge_context(); - self.proposal_control_mut().set_pending_prepare_merge(false); - let mut pessimistic_locks = self.txn_context().ext().pessimistic_locks.write(); - if pessimistic_locks.status == LocksStatus::MergingRegion { - pessimistic_locks.status = LocksStatus::Normal; - } + self.post_prepare_merge_fail(); } r } @@ -707,6 +692,25 @@ impl Peer { self.propose(store_ctx, cmd.write_to_bytes().unwrap())?; Ok(()) } + + pub fn post_prepare_merge_fail(&mut self) { + // Match v1::post_propose_fail. + // If we just failed to propose PrepareMerge, the pessimistic locks status + // may become MergingRegion incorrectly. So, we have to revert it here. + // Note: The `is_merging` check from v1 is removed because proposed + // `PrepareMerge` rejects all writes (in `ProposalControl::check_conflict`). + assert!( + !self.proposal_control().is_merging(), + "{}", + SlogFormat(&self.logger) + ); + self.take_merge_context(); + self.proposal_control_mut().set_pending_prepare_merge(false); + let mut pessimistic_locks = self.txn_context().ext().pessimistic_locks.write(); + if pessimistic_locks.status == LocksStatus::MergingRegion { + pessimistic_locks.status = LocksStatus::Normal; + } + } } impl Apply { @@ -812,6 +816,8 @@ impl Peer { store_ctx: &mut StoreContext, res: PrepareMergeResult, ) { + fail::fail_point!("on_apply_res_prepare_merge"); + let region = res.region_state.get_region().clone(); { let mut meta = store_ctx.store_meta.lock().unwrap(); diff --git a/components/raftstore-v2/src/operation/command/admin/merge/rollback.rs b/components/raftstore-v2/src/operation/command/admin/merge/rollback.rs index cb45fdcf1cf..adc49a928b3 100644 --- a/components/raftstore-v2/src/operation/command/admin/merge/rollback.rs +++ b/components/raftstore-v2/src/operation/command/admin/merge/rollback.rs @@ -4,9 +4,8 @@ use engine_traits::{KvEngine, RaftEngine, RaftLogBatch}; use kvproto::{ - metapb, raft_cmdpb::{AdminCmdType, AdminRequest, AdminResponse}, - raft_serverpb::PeerState, + raft_serverpb::{PeerState, RegionLocalState}, }; use raftstore::{ coprocessor::RegionChangeReason, @@ -28,7 +27,7 @@ use crate::{ #[derive(Debug)] pub struct RollbackMergeResult { commit: u64, - region: metapb::Region, + region_state: RegionLocalState, } impl Peer { @@ -38,6 +37,7 @@ impl Peer { store_ctx: &mut StoreContext, index: u64, ) { + fail::fail_point!("on_reject_commit_merge_1", store_ctx.store_id == 1, |_| {}); let self_index = self.merge_context().and_then(|c| c.prepare_merge_index()); if self_index != Some(index) { info!( @@ -75,7 +75,7 @@ impl Apply { pub fn apply_rollback_merge( &mut self, req: &AdminRequest, - _index: u64, + index: u64, ) -> Result<(AdminResponse, AdminCmdResult)> { fail::fail_point!("apply_rollback_merge"); PEER_ADMIN_CMD_COUNTER.rollback_merge.all.inc(); @@ -95,6 +95,15 @@ impl Apply { "state" => ?merge_state, ); } + + let prepare_merge_commit = rollback.commit; + info!( + self.logger, + "execute RollbackMerge"; + "commit" => prepare_merge_commit, + "index" => index, + ); + let mut region = self.region().clone(); 
let version = region.get_region_epoch().get_version(); // Update version to avoid duplicated rollback requests. @@ -108,7 +117,7 @@ impl Apply { AdminResponse::default(), AdminCmdResult::RollbackMerge(RollbackMergeResult { commit: rollback.get_commit(), - region, + region_state: self.region_state().clone(), }), )) } @@ -121,6 +130,7 @@ impl Peer { store_ctx: &mut StoreContext, res: RollbackMergeResult, ) { + let region = res.region_state.get_region(); assert_ne!(res.commit, 0); let current = self.merge_context().and_then(|c| c.prepare_merge_index()); if current != Some(res.commit) { @@ -133,21 +143,21 @@ impl Peer { } { let mut meta = store_ctx.store_meta.lock().unwrap(); - meta.set_region(&res.region, true, &self.logger); - let (reader, _) = meta.readers.get_mut(&res.region.get_id()).unwrap(); + meta.set_region(region, true, &self.logger); + let (reader, _) = meta.readers.get_mut(®ion.get_id()).unwrap(); self.set_region( &store_ctx.coprocessor_host, reader, - res.region.clone(), + region.clone(), RegionChangeReason::RollbackMerge, self.storage().region_state().get_tablet_index(), ); } - let region_state = self.storage().region_state().clone(); let region_id = self.region_id(); self.state_changes_mut() - .put_region_state(region_id, res.commit, ®ion_state) + .put_region_state(region_id, res.commit, &res.region_state) .unwrap(); + self.storage_mut().set_region_state(res.region_state); self.set_has_extra_write(); self.rollback_merge(store_ctx); diff --git a/components/raftstore-v2/src/operation/command/admin/mod.rs b/components/raftstore-v2/src/operation/command/admin/mod.rs index d59a564c696..9d7fee55ae4 100644 --- a/components/raftstore-v2/src/operation/command/admin/mod.rs +++ b/components/raftstore-v2/src/operation/command/admin/mod.rs @@ -12,6 +12,7 @@ use compact_log::CompactLogResult; use conf_change::{ConfChangeResult, UpdateGcPeersResult}; use engine_traits::{KvEngine, RaftEngine}; use kvproto::{ + kvrpcpb::DiskFullOpt, metapb::{PeerRole, Region}, raft_cmdpb::{AdminCmdType, RaftCmdRequest}, raft_serverpb::{ExtraMessageType, FlushMemtable, RaftMessage}, @@ -33,13 +34,13 @@ use raftstore::{ }, Error, }; -use slog::{error, info}; +use slog::{debug, error, info}; use split::SplitResult; pub use split::{ report_split_init_finish, temp_split_path, RequestHalfSplit, RequestSplit, SplitFlowControl, SplitInit, SplitPendingAppend, SPLIT_PREFIX, }; -use tikv_util::{box_err, log::SlogFormat, slog_panic}; +use tikv_util::{box_err, log::SlogFormat, slog_panic, sys::disk::DiskUsage}; use txn_types::WriteBatchFlags; use self::flashback::FlashbackResult; @@ -103,6 +104,18 @@ impl Peer { let pre_transfer_leader = cmd_type == AdminCmdType::TransferLeader && !WriteBatchFlags::from_bits_truncate(req.get_header().get_flags()) .contains(WriteBatchFlags::TRANSFER_LEADER_PROPOSAL); + let is_conf_change = apply::is_conf_change_cmd(&req); + + // Check whether the admin request can be proposed when disk full. 
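Transfer-leader and conf-change proposals are deliberately exempt from the check that follows, since they are exactly the tools used to recover a store whose disk is full. A condensed sketch of the leader-side gate that check_proposal_with_disk_full_opt implements later in this diff, with local stand-ins for the kvproto types:

// Local stand-ins for kvproto's DiskFullOpt and tikv_util's DiskUsage.
enum DiskUsage {
    Normal,
    AlmostFull,
    AlreadyFull,
}

enum DiskFullOpt {
    NotAllowedOnFull,
    AllowedOnAlmostFull,
}

// A proposal is accepted on an almost-full disk only when it opts in via
// AllowedOnAlmostFull, and never when the disk is already full.
fn leader_allowed(usage: &DiskUsage, opt: &DiskFullOpt) -> bool {
    match usage {
        DiskUsage::Normal => true,
        DiskUsage::AlmostFull => matches!(opt, DiskFullOpt::AllowedOnAlmostFull),
        DiskUsage::AlreadyFull => false,
    }
}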
+ let can_skip_check = is_transfer_leader || pre_transfer_leader || is_conf_change; + if !can_skip_check && let Err(e) = + self.check_proposal_with_disk_full_opt(ctx, DiskFullOpt::AllowedOnAlmostFull) + { + let resp = cmd_resp::new_error(e); + ch.report_error(resp); + self.post_propose_fail(cmd_type); + return; + } // The admin request is rejected because it may need to update epoch checker // which introduces an uncertainty and may breaks the correctness of epoch @@ -134,9 +147,11 @@ ch.report_error(resp); return; } + // PrepareMerge needs to be broadcast to as many followers as possible when disks are full. + self.on_prepare_merge(cmd_type, ctx); // To maintain propose order, we need to make pending proposal first. self.propose_pending_writes(ctx); - let res = if apply::is_conf_change_cmd(&req) { + let res = if is_conf_change { self.propose_conf_change(ctx, req) } else { // propose other admin command. @@ -258,6 +273,42 @@ impl Peer { self.post_propose_command(ctx, res, vec![ch], true); } + fn on_prepare_merge( + &mut self, + cmd_type: AdminCmdType, + ctx: &StoreContext, + ) { + let is_merge_cmd = + cmd_type == AdminCmdType::PrepareMerge || cmd_type == AdminCmdType::RollbackMerge; + let has_disk_full_peers = !self.abnormal_peer_context().disk_full_peers().is_empty(); + let proposal_index = self.next_proposal_index(); + if is_merge_cmd + && (!matches!(ctx.self_disk_usage, DiskUsage::Normal) || has_disk_full_peers) + { + self.has_region_merge_proposal = true; + self.region_merge_proposal_index = proposal_index; + let mut peers = vec![]; + self.abnormal_peer_context_mut() + .disk_full_peers_mut() + .peers_mut() + .iter_mut() + .for_each(|(k, v)| { + if !matches!(v.0, DiskUsage::AlreadyFull) { + v.1 = true; + peers.push(*k); + } + }); + debug!( + self.logger, + "adjust max inflight msgs"; + "cmd_type" => ?cmd_type, + "raft_max_inflight_msgs" => ctx.cfg.raft_max_inflight_msgs, + "region" => self.region_id() + ); + self.adjust_peers_max_inflight_msgs(&peers, ctx.cfg.raft_max_inflight_msgs); + } + } + + fn start_pre_flush( + &mut self, + ctx: &mut StoreContext, diff --git a/components/raftstore-v2/src/operation/command/admin/split.rs b/components/raftstore-v2/src/operation/command/admin/split.rs index c744c1b9161..cfbd7678c17 100644 --- a/components/raftstore-v2/src/operation/command/admin/split.rs +++ b/components/raftstore-v2/src/operation/command/admin/split.rs @@ -35,6 +35,7 @@ use engine_traits::{ use fail::fail_point; use futures::channel::oneshot; use kvproto::{ + kvrpcpb::DiskFullOpt, metapb::{self, Region, RegionEpoch}, pdpb::CheckPolicy, raft_cmdpb::{AdminRequest, AdminResponse, RaftCmdRequest, SplitRequest}, @@ -75,6 +76,9 @@ pub struct SplitResult { // The index of the derived region in `regions` pub derived_index: usize, pub tablet_index: u64, + // new regions will share the region size if it's true. + // otherwise, the new region's size will be 0.
+ pub share_source_region_size: bool, } #[derive(Debug)] @@ -235,6 +242,7 @@ impl Peer { { return true; } + fail_point!("on_split_region_check_tick", |_| true); if ctx.schedulers.split_check.is_busy() { return false; } @@ -325,6 +333,14 @@ impl Peer { )))); return; } + // Check whether the admin request can be proposed when disk full. + if let Err(e) = + self.check_proposal_with_disk_full_opt(ctx, DiskFullOpt::AllowedOnAlmostFull) + { + info!(self.logger, "disk is full, skip split"; "err" => ?e); + ch.set_result(cmd_resp::new_error(e)); + return; + } if let Err(e) = util::validate_split_region( self.region_id(), self.peer_id(), @@ -336,7 +352,7 @@ impl Peer { ch.set_result(cmd_resp::new_error(e)); return; } - self.ask_batch_split_pd(ctx, rs.split_keys, ch); + self.ask_batch_split_pd(ctx, rs.split_keys, rs.share_source_region_size, ch); } pub fn on_request_half_split( @@ -358,6 +374,13 @@ impl Peer { info!(self.logger, "not leader, skip."); return; } + // Check whether the admin request can be proposed when disk full. + if let Err(e) = + self.check_proposal_with_disk_full_opt(ctx, DiskFullOpt::AllowedOnAlmostFull) + { + info!(self.logger, "disk is full, skip half split"; "err" => ?e); + return; + } let region = self.region(); if util::is_epoch_stale(&rhs.epoch, region.get_region_epoch()) { @@ -479,6 +502,7 @@ impl Apply { let derived_req = &[derived_req]; let right_derive = split_reqs.get_right_derive(); + let share_source_region_size = split_reqs.get_share_source_region_size(); let reqs = if right_derive { split_reqs.get_requests().iter().chain(derived_req) } else { @@ -615,6 +639,7 @@ impl Apply { derived_index, tablet_index: log_index, tablet: Box::new(tablet), + share_source_region_size, }), )) } @@ -665,6 +690,7 @@ impl Peer { fail_point!("on_split", self.peer().get_store_id() == 3, |_| {}); let derived = &res.regions[res.derived_index]; + let share_source_region_size = res.share_source_region_size; let region_id = derived.get_id(); let region_locks = self.txn_context().split(&res.regions, derived); @@ -695,8 +721,14 @@ impl Peer { let new_region_count = res.regions.len() as u64; let control = self.split_flow_control_mut(); - let estimated_size = control.approximate_size.map(|v| v / new_region_count); - let estimated_keys = control.approximate_keys.map(|v| v / new_region_count); + // if share_source_region_size is true, it means the new region contains any + // data from the origin region. + let mut share_size = None; + let mut share_keys = None; + if share_source_region_size { + share_size = control.approximate_size.map(|v| v / new_region_count); + share_keys = control.approximate_keys.map(|v| v / new_region_count); + } self.post_split(); @@ -714,8 +746,11 @@ impl Peer { // After split, the peer may need to update its metrics. 
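The share_source_region_size flag threaded through SplitResult above changes how the post-split estimates are seeded, as the code that follows shows. Roughly:

// Children inherit an equal slice of the parent's approximate size only
// when share_source_region_size is set; otherwise they start from None
// (treated as size 0 until the next split check measures them).
fn child_estimate(parent: Option<u64>, new_region_count: u64, share: bool) -> Option<u64> {
    if share {
        parent.map(|v| v / new_region_count)
    } else {
        None
    }
}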
let control = self.split_flow_control_mut(); control.may_skip_split_check = false; - control.approximate_size = estimated_size; - control.approximate_keys = estimated_keys; + if share_source_region_size { + control.approximate_size = share_size; + control.approximate_keys = share_keys; + } + self.add_pending_tick(PeerTick::SplitRegionCheck); } self.storage_mut().set_has_dirty_data(true); @@ -760,8 +795,8 @@ impl Peer { derived_region_id: region_id, check_split: last_region_id == new_region_id, scheduled: false, - approximate_size: estimated_size, - approximate_keys: estimated_keys, + approximate_size: share_size, + approximate_keys: share_keys, locks, })); diff --git a/components/raftstore-v2/src/operation/command/admin/transfer_leader.rs b/components/raftstore-v2/src/operation/command/admin/transfer_leader.rs index 4cdeba3bc41..bf9cb426255 100644 --- a/components/raftstore-v2/src/operation/command/admin/transfer_leader.rs +++ b/components/raftstore-v2/src/operation/command/admin/transfer_leader.rs @@ -118,7 +118,7 @@ impl Peer { transferee } - fn pre_transfer_leader(&mut self, peer: &metapb::Peer) -> bool { + pub fn pre_transfer_leader(&mut self, peer: &metapb::Peer) -> bool { if self.raft_group().raft.has_pending_conf() { info!( self.logger, diff --git a/components/raftstore-v2/src/operation/command/mod.rs b/components/raftstore-v2/src/operation/command/mod.rs index 0fd88cc987b..51384c6eaf2 100644 --- a/components/raftstore-v2/src/operation/command/mod.rs +++ b/components/raftstore-v2/src/operation/command/mod.rs @@ -455,6 +455,11 @@ impl Peer { if is_leader { self.retry_pending_prepare_merge(ctx, apply_res.applied_index); } + if !apply_res.sst_applied_index.is_empty() { + self.storage_mut() + .apply_trace_mut() + .on_sst_ingested(&apply_res.sst_applied_index); + } self.on_data_modified(apply_res.modifications); self.handle_read_on_apply( ctx, @@ -462,7 +467,7 @@ impl Peer { apply_res.applied_index, progress_to_be_updated, ); - self.try_compelete_recovery(); + self.try_complete_recovery(); if !self.pause_for_replay() && self.storage_mut().apply_trace_mut().should_flush() { if let Some(scheduler) = self.apply_scheduler() { scheduler.send(ApplyTask::ManualFlush); @@ -476,6 +481,12 @@ impl Peer { } self.check_unsafe_recovery_state(ctx); } + + pub fn post_propose_fail(&mut self, cmd_type: AdminCmdType) { + if cmd_type == AdminCmdType::PrepareMerge { + self.post_prepare_merge_fail(); + } + } } #[derive(Debug)] @@ -583,6 +594,7 @@ impl Apply { fail::fail_point!("APPLY_COMMITTED_ENTRIES"); fail::fail_point!("on_handle_apply_1003", self.peer_id() == 1003, |_| {}); fail::fail_point!("on_handle_apply_2", self.peer_id() == 2, |_| {}); + fail::fail_point!("on_handle_apply", |_| {}); fail::fail_point!("on_handle_apply_store_1", self.store_id() == 1, |_| {}); let now = std::time::Instant::now(); let apply_wait_time = APPLY_TASK_WAIT_TIME_HISTOGRAM.local(); @@ -865,6 +877,7 @@ impl Apply { apply_res.modifications = *self.modifications_mut(); apply_res.metrics = mem::take(&mut self.metrics); apply_res.bucket_stat = self.buckets.clone(); + apply_res.sst_applied_index = self.take_sst_applied_index(); let written_bytes = apply_res.metrics.written_bytes; let skip_report = || -> bool { diff --git a/components/raftstore-v2/src/operation/command/write/ingest.rs b/components/raftstore-v2/src/operation/command/write/ingest.rs index 7e8ed381ad0..45247b3f36f 100644 --- a/components/raftstore-v2/src/operation/command/write/ingest.rs +++ b/components/raftstore-v2/src/operation/command/write/ingest.rs @@ -2,8 +2,9 @@ 
use collections::HashMap; use crossbeam::channel::TrySendError; -use engine_traits::{data_cf_offset, KvEngine, RaftEngine}; +use engine_traits::{data_cf_offset, KvEngine, RaftEngine, DATA_CFS_LEN}; use kvproto::import_sstpb::SstMeta; +use pd_client::metrics::STORE_SIZE_EVENT_INT_VEC; use raftstore::{ store::{check_sst_for_ingestion, metrics::PEER_WRITE_CMD_COUNTER, util}, Result, @@ -16,7 +17,7 @@ use crate::{ batch::StoreContext, fsm::{ApplyResReporter, Store, StoreFsmDelegate}, raft::{Apply, Peer}, - router::{PeerMsg, StoreTick}, + router::{PeerMsg, SstApplyIndex, StoreTick}, worker::tablet, }; @@ -39,7 +40,14 @@ impl Store { &mut self, ctx: &mut StoreContext, ) -> Result<()> { + let import_size = box_try!(ctx.sst_importer.get_total_size()); + STORE_SIZE_EVENT_INT_VEC.import_size.set(import_size as i64); let ssts = box_try!(ctx.sst_importer.list_ssts()); + // filter old version SSTs + let ssts: Vec<_> = ssts + .into_iter() + .filter(|sst| sst.1 >= sst_importer::API_VERSION_2) + .collect(); if ssts.is_empty() { return Ok(()); } @@ -47,9 +55,9 @@ impl Store { let mut region_ssts: HashMap<_, Vec<_>> = HashMap::default(); for sst in ssts { region_ssts - .entry(sst.get_region_id()) + .entry(sst.0.get_region_id()) .or_default() - .push(sst); + .push(sst.0); } let ranges = ctx.sst_importer.ranges_in_import(); @@ -107,10 +115,12 @@ impl Peer { impl Apply { #[inline] pub fn apply_ingest(&mut self, index: u64, ssts: Vec) -> Result<()> { + fail::fail_point!("on_apply_ingest"); PEER_WRITE_CMD_COUNTER.ingest_sst.inc(); let mut infos = Vec::with_capacity(ssts.len()); let mut size: i64 = 0; let mut keys: u64 = 0; + let mut cf_indexes = [u64::MAX; DATA_CFS_LEN]; for sst in &ssts { // This may not be enough as ingest sst may not trigger flush at all. let off = data_cf_offset(sst.get_cf_name()); @@ -138,6 +148,7 @@ impl Apply { slog_panic!(self.logger, "corrupted sst"; "sst" => ?sst, "error" => ?e); } } + cf_indexes[off] = index; } if !infos.is_empty() { // Unlike v1, we can't batch ssts accross regions. @@ -154,6 +165,11 @@ impl Apply { self.metrics.size_diff_hint += size; self.metrics.written_bytes += size as u64; self.metrics.written_keys += keys; + for (cf_index, index) in cf_indexes.into_iter().enumerate() { + if index != u64::MAX { + self.push_sst_applied_index(SstApplyIndex { cf_index, index }); + } + } Ok(()) } } diff --git a/components/raftstore-v2/src/operation/command/write/mod.rs b/components/raftstore-v2/src/operation/command/write/mod.rs index a9d8bd664fe..5806614e192 100644 --- a/components/raftstore-v2/src/operation/command/write/mod.rs +++ b/components/raftstore-v2/src/operation/command/write/mod.rs @@ -12,7 +12,8 @@ use raftstore::{ fsm::{apply, MAX_PROPOSAL_SIZE_RATIO}, metrics::PEER_WRITE_CMD_COUNTER, msg::ErrorCallback, - util::{self, NORMAL_REQ_CHECK_CONF_VER, NORMAL_REQ_CHECK_VER}, + util::{self}, + RaftCmdExtraOpts, }, Error, Result, }; @@ -42,6 +43,7 @@ impl Peer { header: Box, data: SimpleWriteBinary, ch: CmdResChannel, + extra_opts: Option, ) { if !self.serving() { apply::notify_req_region_removed(self.region_id(), ch); @@ -59,6 +61,20 @@ impl Peer { ch.report_error(resp); return; } + if let Some(opts) = extra_opts { + if let Some(Err(e)) = opts.deadline.map(|deadline| deadline.check()) { + let resp = cmd_resp::new_error(e.into()); + ch.report_error(resp); + return; + } + // Check whether the write request can be proposed with the given disk full + // option. 
+ if let Err(e) = self.check_proposal_with_disk_full_opt(ctx, opts.disk_full_opt) { + let resp = cmd_resp::new_error(e); + ch.report_error(resp); + return; + } + } // To maintain propose order, we need to make pending proposal first. self.propose_pending_writes(ctx); if let Some(conflict) = self.proposal_control_mut().check_conflict(None) { @@ -72,13 +88,10 @@ impl Peer { ch.report_error(resp); return; } - // ProposalControl is reliable only when applied to current term. - let call_proposed_on_success = self.applied_to_current_term(); let mut encoder = SimpleWriteReqEncoder::new( header, data, (ctx.cfg.raft_entry_max_size.0 as f64 * MAX_PROPOSAL_SIZE_RATIO) as usize, - call_proposed_on_success, ); encoder.add_response_channel(ch); self.set_has_ready(); @@ -98,7 +111,6 @@ impl Peer { Box::::default(), data, ctx.cfg.raft_entry_max_size.0 as usize, - false, ) .encode() .0 @@ -110,30 +122,17 @@ impl Peer { pub fn propose_pending_writes(&mut self, ctx: &mut StoreContext) { if let Some(encoder) = self.simple_write_encoder_mut().take() { - let call_proposed_on_success = if encoder.notify_proposed() { - // The request has pass conflict check and called all proposed callbacks. + let header = encoder.header(); + let res = self.validate_command(header, None, &mut ctx.raft_metrics); + let call_proposed_on_success = if matches!(res, Err(Error::EpochNotMatch { .. })) { false } else { - // Epoch may have changed since last check. - let from_epoch = encoder.header().get_region_epoch(); - let res = util::compare_region_epoch( - from_epoch, - self.region(), - NORMAL_REQ_CHECK_CONF_VER, - NORMAL_REQ_CHECK_VER, - true, - ); - if let Err(e) = res { - // TODO: query sibling regions. - ctx.raft_metrics.invalid_proposal.epoch_not_match.inc(); - encoder.encode().1.report_error(cmd_resp::new_error(e)); - return; - } - // Only when it applies to current term, the epoch check can be reliable. self.applied_to_current_term() }; + let (data, chs) = encoder.encode(); - let res = self.propose(ctx, data); + let res = res.and_then(|_| self.propose(ctx, data)); + fail_point!("after_propose_pending_writes"); self.post_propose_command(ctx, res, chs, call_proposed_on_success); diff --git a/components/raftstore-v2/src/operation/disk_snapshot_backup.rs b/components/raftstore-v2/src/operation/disk_snapshot_backup.rs new file mode 100644 index 00000000000..1e033248b23 --- /dev/null +++ b/components/raftstore-v2/src/operation/disk_snapshot_backup.rs @@ -0,0 +1,37 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use futures::channel::mpsc::UnboundedSender; +use kvproto::brpb::CheckAdminResponse; +use raftstore::store::snapshot_backup::{SnapshotBrHandle, SnapshotBrWaitApplyRequest}; +use tikv_util::box_err; + +const REASON: &str = "Raftstore V2 doesn't support snapshot backup yet."; + +#[derive(Clone, Copy)] +pub struct UnimplementedHandle; + +impl SnapshotBrHandle for UnimplementedHandle { + fn send_wait_apply(&self, _region: u64, _req: SnapshotBrWaitApplyRequest) -> crate::Result<()> { + Err(crate::Error::Other(box_err!( + "send_wait_apply not implemented; note: {}", + REASON + ))) + } + + fn broadcast_wait_apply(&self, _req: SnapshotBrWaitApplyRequest) -> crate::Result<()> { + Err(crate::Error::Other(box_err!( + "broadcast_wait_apply not implemented; note: {}", + REASON + ))) + } + + fn broadcast_check_pending_admin( + &self, + _tx: UnboundedSender, + ) -> crate::Result<()> { + Err(crate::Error::Other(box_err!( + "broadcast_check_pending_admin not implemented; note: {}", + REASON + ))) + } +} diff --git a/components/raftstore-v2/src/operation/life.rs b/components/raftstore-v2/src/operation/life.rs index 52f00d137f8..e9fc84643da 100644 --- a/components/raftstore-v2/src/operation/life.rs +++ b/components/raftstore-v2/src/operation/life.rs @@ -26,28 +26,35 @@ //! `merged_records`, to avoid race between destroy and merge, leader needs to //! ask target peer to destroy source peer. -use std::{cmp, mem}; +use std::{cmp, collections::HashSet, mem}; use batch_system::BasicMailbox; use crossbeam::channel::{SendError, TrySendError}; use engine_traits::{KvEngine, RaftEngine, RaftLogBatch}; use kvproto::{ - metapb::{self, Region}, + kvrpcpb::DiskFullOpt, + metapb::{self, PeerRole, Region}, raft_cmdpb::{AdminCmdType, RaftCmdRequest}, raft_serverpb::{ExtraMessage, ExtraMessageType, PeerState, RaftMessage}, }; -use raftstore::store::{ - fsm::{ - apply, - life::{build_peer_destroyed_report, forward_destroy_to_source_peer}, - Proposal, +use raft::eraftpb::MessageType; +use raftstore::{ + store::{ + fsm::{ + apply, + life::{build_peer_destroyed_report, forward_destroy_to_source_peer}, + Proposal, + }, + local_metrics::IoType as InspectIoType, + metrics::RAFT_PEER_PENDING_DURATION, + util, DiskFullPeers, Transport, WriteTask, }, - metrics::RAFT_PEER_PENDING_DURATION, - util, Transport, WriteTask, + Error, Result, }; use slog::{debug, error, info, warn}; use tikv_util::{ store::find_peer, + sys::disk::DiskUsage, time::{duration_to_sec, Instant}, }; @@ -126,16 +133,22 @@ pub struct AbnormalPeerContext { pending_peers: Vec<(u64, Instant)>, /// A inaccurate cache about which peer is marked as down. down_peers: Vec, + // disk full peer set. + disk_full_peers: DiskFullPeers, + // show whether an already disk full TiKV appears in the potential majority set. + dangerous_majority_set: bool, } impl AbnormalPeerContext { #[inline] pub fn is_empty(&self) -> bool { - self.pending_peers.is_empty() && self.down_peers.is_empty() + self.pending_peers.is_empty() && self.down_peers.is_empty() /* && self.disk_full_peers.is_empty() */ } #[inline] pub fn reset(&mut self) { + // No need to refresh disk_full_peers as it will be refreshed + // automatically when the disk usage updated. 
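For orientation, DiskFullPeers is reused from raftstore v1; assuming v1's layout, it is roughly a majority flag plus a per-peer map whose second field records whether log appends to that peer remain enabled:

use std::collections::HashMap;

enum DiskUsage {
    Normal,
    AlmostFull,
    AlreadyFull,
}

// Simplified mirror of raftstore's DiskFullPeers: `majority` marks whether
// the disk-full peers could form a quorum; each entry maps a peer id to its
// usage and to whether log appends to it remain enabled.
#[derive(Default)]
struct DiskFullPeers {
    majority: bool,
    peers: HashMap<u64, (DiskUsage, bool)>,
}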
self.pending_peers.clear(); self.down_peers.clear(); } @@ -174,6 +187,26 @@ impl AbnormalPeerContext { RAFT_PEER_PENDING_DURATION.observe(elapsed); }); } + + #[inline] + pub fn disk_full_peers(&self) -> &DiskFullPeers { + &self.disk_full_peers + } + + #[inline] + pub fn disk_full_peers_mut(&mut self) -> &mut DiskFullPeers { + &mut self.disk_full_peers + } + + #[inline] + pub fn is_dangerous_majority_set(&self) -> bool { + self.dangerous_majority_set + } + + #[inline] + pub fn setup_dangerous_majority_set(&mut self, is_dangerous: bool) { + self.dangerous_majority_set = is_dangerous; + } } #[derive(Default)] @@ -384,8 +417,8 @@ impl Store { ); let region_id = msg.get_region_id(); // The message can be sent when the peer is being created, so try send it first. - let mut msg = if let Err(TrySendError::Disconnected(PeerMsg::RaftMessage(m))) = - ctx.router.send(region_id, PeerMsg::RaftMessage(msg)) + let mut msg = if let Err(TrySendError::Disconnected(PeerMsg::RaftMessage(m, _))) = + ctx.router.send(region_id, PeerMsg::RaftMessage(msg, None)) { m } else { @@ -415,6 +448,20 @@ impl Store { ctx.raft_metrics.message_dropped.stale_msg.inc(); return false; } + // Check whether this message should be dropped when disk full. + let msg_type = msg.get_message().get_msg_type(); + if matches!(ctx.self_disk_usage, DiskUsage::AlreadyFull) + && MessageType::MsgTimeoutNow == msg_type + { + debug!( + self.logger(), + "skip {:?} because of disk full", msg_type; + "region_id" => region_id, "peer_id" => to_peer.id, + ); + ctx.raft_metrics.message_dropped.disk_full.inc(); + return false; + } + let destroyed = match check_if_to_peer_destroyed(&ctx.engine, &msg, self.store_id()) { Ok(d) => d, Err(e) => { @@ -424,7 +471,13 @@ impl Store { }; if destroyed { if msg.get_is_tombstone() { + let msg_region_epoch = msg.get_region_epoch().clone(); if let Some(msg) = build_peer_destroyed_report(&mut msg) { + info!(self.logger(), "peer reports destroyed"; + "from_peer" => ?msg.get_from_peer(), + "from_region_epoch" => ?msg_region_epoch, + "region_id" => ?msg.get_region_id(), + "to_peer_id" => ?msg.get_to_peer().get_id()); let _ = ctx.trans.send(msg); } return false; @@ -510,7 +563,7 @@ impl Store { if from_peer.id != raft::INVALID_ID { // For now the peer only exists in memory. It will persist its states when // handling its first readiness. - let _ = ctx.router.send(region_id, PeerMsg::RaftMessage(msg)); + let _ = ctx.router.send(region_id, PeerMsg::RaftMessage(msg, None)); } true } @@ -527,9 +580,9 @@ impl Store { { // Record the last statistics of commit-log-duration and store-write-duration. inspector.record_store_wait(start_ts.saturating_elapsed()); - inspector.record_store_commit(ctx.raft_metrics.stat_commit_log.avg()); - // Reset the stat_commit_log and wait it to be refreshed in the next tick. - ctx.raft_metrics.stat_commit_log.reset(); + inspector.record_store_commit(ctx.raft_metrics.health_stats.avg(InspectIoType::Network)); + // Reset the health_stats and wait it to be refreshed in the next tick. 
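The exact health_stats API is defined in tikv_util and not shown in this diff; as an illustration only, a stand-in with the same observe/avg/reset life cycle might look like:

use std::{collections::HashMap, time::Duration};

// Illustrative per-IO-type latency recorder: record durations per
// category, expose an average, and reset once per inspection tick.
#[derive(Default)]
struct HealthStats {
    samples: HashMap<&'static str, (Duration, u32)>,
}

impl HealthStats {
    fn observe(&mut self, kind: &'static str, d: Duration) {
        let e = self.samples.entry(kind).or_default();
        e.0 += d;
        e.1 += 1;
    }
    fn avg(&self, kind: &'static str) -> Duration {
        self.samples
            .get(kind)
            .map_or(Duration::ZERO, |(t, n)| *t / (*n).max(1))
    }
    fn reset(&mut self) {
        self.samples.clear();
    }
}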
+ ctx.raft_metrics.health_stats.reset(); ctx.pending_latency_inspect.push(inspector); } } @@ -581,7 +634,11 @@ impl Peer { .iter() .find(|p| p.id == msg.get_from_peer().get_id()) { - let tombstone_msg = self.tombstone_message_for_same_region(peer.clone()); + let tombstone_msg = self.tombstone_message( + self.region_id(), + self.region().get_region_epoch().clone(), + peer.clone(), + ); self.add_message(tombstone_msg); true } else { @@ -589,13 +646,24 @@ impl Peer { } } - fn tombstone_message_for_same_region(&self, peer: metapb::Peer) -> RaftMessage { - let region_id = self.region_id(); + fn tombstone_message( + &self, + region_id: u64, + region_epoch: metapb::RegionEpoch, + peer: metapb::Peer, + ) -> RaftMessage { let mut tombstone_message = RaftMessage::default(); + if self.region_id() != region_id { + // After merge, target region needs to GC peers of source region. + let extra_msg = tombstone_message.mut_extra_msg(); + extra_msg.set_type(ExtraMessageType::MsgGcPeerRequest); + let check_peer = extra_msg.mut_check_gc_peer(); + check_peer.set_from_region_id(self.region_id()); + } tombstone_message.set_region_id(region_id); tombstone_message.set_from_peer(self.peer().clone()); tombstone_message.set_to_peer(peer); - tombstone_message.set_region_epoch(self.region().get_region_epoch().clone()); + tombstone_message.set_region_epoch(region_epoch); tombstone_message.set_is_tombstone(true); tombstone_message } @@ -604,6 +672,10 @@ impl Peer { match msg.get_to_peer().get_id().cmp(&self.peer_id()) { cmp::Ordering::Less => { if let Some(msg) = build_peer_destroyed_report(msg) { + info!(self.logger, "peer reports destroyed"; + "from_peer" => ?msg.get_from_peer(), + "from_region_epoch" => ?msg.get_region_epoch(), + "to_peer_id" => ?msg.get_to_peer().get_id()); self.add_message(msg); } } @@ -656,6 +728,10 @@ impl Peer { let _ = router.send_raft_message(m.into()); }, ); + } else { + // Source peer is already destroyed. Forward to store, and let + // it report GcPeer response. + let _ = ctx.router.send_raft_message(m.into()); } }); } @@ -671,6 +747,7 @@ impl Peer { && state.get_merged_records().iter().all(|p| { p.get_source_peers() .iter() + .chain(p.get_source_removed_records()) .all(|p| p.get_id() != gc_peer_id) }) { @@ -683,6 +760,37 @@ impl Peer { ctx.confirmed_ids.push(gc_peer_id); } + // Clean up removed and merged records for peers on tombstone stores, + // otherwise it may keep sending gc peer request to the tombstone store. + pub fn on_store_maybe_tombstone_gc_peer(&mut self, store_id: u64) { + let mut peers_on_tombstone = vec![]; + let state = self.storage().region_state(); + for peer in state.get_removed_records() { + if peer.get_store_id() == store_id { + peers_on_tombstone.push(peer.clone()); + } + } + for record in state.get_merged_records() { + for peer in record.get_source_peers() { + if peer.get_store_id() == store_id { + peers_on_tombstone.push(peer.clone()); + } + } + } + if peers_on_tombstone.is_empty() { + return; + } + info!(self.logger, "gc peer on tombstone store"; + "tombstone_store_id" => store_id, + "peers" => ?peers_on_tombstone); + let ctx = self.gc_peer_context_mut(); + for peer in peers_on_tombstone { + if !ctx.confirmed_ids.contains(&peer.get_id()) { + ctx.confirmed_ids.push(peer.get_id()); + } + } + } + // Removes deleted peers from region state by proposing a `UpdateGcPeer` // command. 
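tombstone_message above now routes two cases: GC of a peer in the sender's own region is a plain tombstone, while GC of a peer left over from a merged-away source region also carries a MsgGcPeerRequest so the target region can confirm the destruction. A simplified mirror with stand-in types:

struct TombstoneMsg {
    region_id: u64,
    is_tombstone: bool,
    gc_check_from_region: Option<u64>,
}

fn tombstone_message(self_region_id: u64, target_region_id: u64) -> TombstoneMsg {
    TombstoneMsg {
        region_id: target_region_id,
        is_tombstone: true,
        // Only set when the peer being GC-ed belongs to another (source) region.
        gc_check_from_region: (self_region_id != target_region_id).then_some(self_region_id),
    }
}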
pub fn on_gc_peer_tick(&mut self, ctx: &mut StoreContext) { @@ -695,27 +803,50 @@ impl Peer { } let mut need_gc_ids = Vec::with_capacity(5); let gc_context = self.gc_peer_context(); + let mut tombstone_removed_records = + |region_id, region_epoch: &metapb::RegionEpoch, peer: &metapb::Peer| { + need_gc_ids.push(peer.get_id()); + if gc_context.confirmed_ids.contains(&peer.get_id()) { + return; + } + + let msg = self.tombstone_message(region_id, region_epoch.clone(), peer.clone()); + // For leader, it's OK to send gc message immediately. + let _ = ctx.trans.send(msg); + }; for peer in state.get_removed_records() { - need_gc_ids.push(peer.get_id()); - if gc_context.confirmed_ids.contains(&peer.get_id()) { - continue; + tombstone_removed_records(self.region_id(), self.region().get_region_epoch(), peer); + } + // For merge, we need to + // 1. ask removed source peers to destroy themselves. + for record in state.get_merged_records() { + for peer in record.get_source_removed_records() { + tombstone_removed_records( + record.get_source_region_id(), + record.get_source_epoch(), + peer, + ); } - - let msg = self.tombstone_message_for_same_region(peer.clone()); - // For leader, it's OK to send gc message immediately. - let _ = ctx.trans.send(msg); } + // 2. ask target to check whether source should be deleted. for record in state.get_merged_records() { - // For merge, we ask target to check whether source should be deleted. - for (source, target) in record - .get_source_peers() - .iter() - .zip(record.get_target_peers()) - { + for source in record.get_source_peers() { need_gc_ids.push(source.get_id()); if gc_context.confirmed_ids.contains(&source.get_id()) { continue; } + let Some(target) = record + .get_target_peers() + .iter() + .find(|p| p.get_store_id() == source.get_store_id()) + else { + panic!( + "[region {}] {} target peer not found, {:?}", + self.region_id(), + self.peer_id(), + state + ); + }; let mut msg = RaftMessage::default(); msg.set_region_id(record.get_target_region_id()); @@ -752,6 +883,266 @@ impl Peer { self.maybe_schedule_gc_peer_tick(); } + pub fn adjust_peers_max_inflight_msgs(&mut self, peers: &[u64], raft_max_inflight_msgs: usize) { + peers.iter().for_each(|id| { + self.raft_group_mut() + .raft + .adjust_max_inflight_msgs(*id, raft_max_inflight_msgs); + debug!( + self.logger, + "adjust max inflight msgs"; + "raft_max_inflight_msgs" => raft_max_inflight_msgs, + "peer_id" => id + ); + }); + } + + // Check disk usages for the peer itself and other peers in the raft group. + // The return value indicates whether the proposal is allowed or not. + pub fn check_proposal_with_disk_full_opt( + &mut self, + ctx: &StoreContext, + disk_full_opt: DiskFullOpt, + ) -> Result<()> { + let leader_allowed = match ctx.self_disk_usage { + DiskUsage::Normal => true, + DiskUsage::AlmostFull => !matches!(disk_full_opt, DiskFullOpt::NotAllowedOnFull), + DiskUsage::AlreadyFull => false, + }; + let mut disk_full_stores = Vec::new(); + let abnormal_peer_context = self.abnormal_peer_context(); + let disk_full_peers = abnormal_peer_context.disk_full_peers(); + if !leader_allowed { + disk_full_stores.push(ctx.store_id); + // Try to transfer leadership to a node with normal disk usage to maintain write + // availability. If a majority of nodes are disk full, transferring the leader + // will not help. Note: learner nodes must be excluded.
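A condensed version of the transfer-target filter implemented just below, using plain tuples in place of metapb::Peer:

// Pick any voter other than ourselves whose disk is not full and which
// is not marked down. Tuples are (id, disk_full, down, learner).
fn pick_transfer_target(peers: &[(u64, bool, bool, bool)], self_id: u64) -> Option<u64> {
    peers
        .iter()
        .find(|&&(id, disk_full, down, learner)| {
            !disk_full && !down && !learner && id != self_id
        })
        .map(|&(id, ..)| id)
}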
+ // Try to transfer leadership to a node with normal disk usage to maintain write + // availability. If a majority of nodes are disk full, transferring the leader is + // unnecessary. Note: learner nodes need to be excluded. + if !disk_full_peers.majority() { + let target_peer = self + .region() + .get_peers() + .iter() + .find(|x| { + !disk_full_peers.has(x.get_id()) + && x.get_id() != self.peer_id() + && !self + .abnormal_peer_context() + .down_peers() + .contains(&x.get_id()) + && !matches!(x.get_role(), PeerRole::Learner) + }) + .cloned(); + if let Some(p) = target_peer { + debug!( + self.logger, + "try to transfer leader because of current leader disk full"; + "region_id" => self.region().get_id(), + "peer_id" => self.peer_id(), + "target_peer_id" => p.get_id(), + ); + self.pre_transfer_leader(&p); + } + } + } else { + // Check followers. + if disk_full_peers.is_empty() { + return Ok(()); + } + if !abnormal_peer_context.is_dangerous_majority_set() { + if !disk_full_peers.majority() { + return Ok(()); + } + // A majority of peers are in disk full status, but the request carries a special + // flag. + if matches!(disk_full_opt, DiskFullOpt::AllowedOnAlmostFull) + && disk_full_peers.peers().values().any(|x| x.1) + { + return Ok(()); + } + } + for peer in self.region().get_peers() { + let (peer_id, store_id) = (peer.get_id(), peer.get_store_id()); + if disk_full_peers.peers().get(&peer_id).is_some() { + disk_full_stores.push(store_id); + } + } + } + let errmsg = format!( + "propose failed: tikv disk full, cmd diskFullOpt={:?}, leader diskUsage={:?}", + disk_full_opt, ctx.self_disk_usage + ); + Err(Error::DiskFull(disk_full_stores, errmsg)) + } + + pub fn clear_disk_full_peers(&mut self, ctx: &StoreContext) { + let disk_full_peers = mem::take(self.abnormal_peer_context_mut().disk_full_peers_mut()); + let raft = &mut self.raft_group_mut().raft; + for peer in disk_full_peers.peers().iter() { + raft.adjust_max_inflight_msgs(*peer.0, ctx.cfg.raft_max_inflight_msgs); + } + } + + pub fn refill_disk_full_peers(&mut self, ctx: &StoreContext) { + self.clear_disk_full_peers(ctx); + debug!( + self.logger, + "region id {}, peer id {}, store id {}: refill disk full peers when peer disk usage status changed or merge triggered", + self.region().get_id(), + self.peer_id(), + ctx.store_id, + ); + + // Collect disk full peers and all peers' `next_idx` to find a potential quorum. + let peers_len = self.region().get_peers().len(); + let mut normal_peers = HashSet::default(); + let mut next_idxs = Vec::with_capacity(peers_len); + let mut min_peer_index = u64::MAX; + for peer in self.region().get_peers() { + let (peer_id, store_id) = (peer.get_id(), peer.get_store_id()); + let usage = ctx.store_disk_usages.get(&store_id); + if usage.is_none() { + // Always treat the leader itself as normal. + normal_peers.insert(peer_id); + } + if let Some(pr) = self.raft_group().raft.prs().get(peer_id) { + // status 3-normal, 2-almostfull, 1-alreadyfull, only for simplifying the sort + // func below.
+ let mut status = 3; + if let Some(usg) = usage { + status = match usg { + DiskUsage::Normal => 3, + DiskUsage::AlmostFull => 2, + DiskUsage::AlreadyFull => 1, + }; + } + + if !self.abnormal_peer_context().down_peers().contains(&peer_id) { + next_idxs.push((peer_id, pr.next_idx, usage, status)); + if min_peer_index > pr.next_idx { + min_peer_index = pr.next_idx; + } + } + } + } + if self.has_region_merge_proposal { + debug!( + self.logger, + "region id {}, peer id {}, store id {} has a merge request, with region_merge_proposal_index {}", + self.region_id(), + self.peer_id(), + ctx.store_id, + self.region_merge_proposal_index + ); + if min_peer_index > self.region_merge_proposal_index { + self.has_region_merge_proposal = false; + } + } + + if normal_peers.len() == peers_len { + return; + } + + // Reverse sort peers based on `next_idx`, `usage` and `store health status`, + // then try to get a potential quorum. + next_idxs.sort_by(|x, y| { + if x.3 == y.3 { + y.1.cmp(&x.1) + } else { + y.3.cmp(&x.3) + } + }); + + let majority = !self.raft_group().raft.prs().has_quorum(&normal_peers); + self.abnormal_peer_context_mut() + .disk_full_peers_mut() + .set_majority(majority); + // When merging, mark all peers as allowed to be sent to. + for &(peer, _, usage, ..) in &next_idxs { + if let Some(usage) = usage { + if self.has_region_merge_proposal && !matches!(*usage, DiskUsage::AlreadyFull) { + self.abnormal_peer_context_mut() + .disk_full_peers_mut() + .peers_mut() + .insert(peer, (*usage, true)); + self.raft_group_mut() + .raft + .adjust_max_inflight_msgs(peer, ctx.cfg.raft_max_inflight_msgs); + debug!( + self.logger, + "refill disk full peer max inflight to {} on a merging region: region id {}, peer id {}", + ctx.cfg.raft_max_inflight_msgs, + self.region_id(), + peer + ); + } else { + self.abnormal_peer_context_mut() + .disk_full_peers_mut() + .peers_mut() + .insert(peer, (*usage, false)); + self.raft_group_mut().raft.adjust_max_inflight_msgs(peer, 0); + debug!( + self.logger, + "refill disk full peer max inflight to {} on region without merging: region id {}, peer id {}", + 0, + self.region_id(), + peer + ); + } + } + } + + if !self.abnormal_peer_context().disk_full_peers().majority() { + // Fewer than a majority of peers are in disk full status. + return; + } + + let (mut potential_quorum, mut quorum_ok) = (HashSet::default(), false); + let mut is_dangerous_set = false; + for &(peer_id, _, _, status) in &next_idxs { + potential_quorum.insert(peer_id); + + if status == 1 { + // An already-full peer. + is_dangerous_set = true; + } + + if self.raft_group().raft.prs().has_quorum(&potential_quorum) { + quorum_ok = true; + break; + } + } + + self.abnormal_peer_context_mut() + .setup_dangerous_majority_set(is_dangerous_set); + + // For a peer with AlreadyFull status in the potential quorum set, we still need + // to send logs to it, to support incoming configuration changes. + if quorum_ok { + let has_region_merge_proposal = self.has_region_merge_proposal; + let peers = self + .abnormal_peer_context_mut() + .disk_full_peers_mut() + .peers_mut(); + let mut inflight_peers = vec![]; + for peer in potential_quorum { + if let Some(x) = peers.get_mut(&peer) { + // It can help to establish a quorum. + x.1 = true; + // For a merging region, all peers have been set to the max.
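The reverse sort a few lines above orders candidates by disk-health status first and raft `next_idx` second, so the healthiest, most caught-up peers are considered first when assembling a potential quorum. A runnable sketch of just that comparator, with plain tuples standing in for the real entries:

```rust
fn main() {
    // (peer_id, next_idx, status): status 3 = normal, 2 = almost full, 1 = already full.
    let mut next_idxs = vec![(1u64, 10u64, 2u8), (2, 15, 3), (3, 20, 1), (4, 12, 3)];
    // Descending by status, then descending by next_idx, mirroring the sort above.
    next_idxs.sort_by(|x, y| if x.2 == y.2 { y.1.cmp(&x.1) } else { y.2.cmp(&x.2) });
    let order: Vec<u64> = next_idxs.iter().map(|t| t.0).collect();
    // Normal peers first (larger next_idx first), then almost full, then already full.
    assert_eq!(order, vec![2, 4, 1, 3]);
}
```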
+ if !has_region_merge_proposal { + inflight_peers.push(peer); + } + } + } + debug!( + self.logger, + "refill disk full peer max inflight to 1 in potential quorum set: region id {}", + self.region_id(), + ); + self.adjust_peers_max_inflight_msgs(&inflight_peers, 1); + } + } + /// A peer can be destroyed in four cases: /// /// 1. Received a gc message; @@ -795,9 +1186,17 @@ impl Peer { } // Wait for critical commands like split. if self.has_pending_tombstone_tablets() { + let applied_index = self.entry_storage().applied_index(); + let last_index = self.entry_storage().last_index(); + let persisted = self + .remember_persisted_tablet_index() + .load(std::sync::atomic::Ordering::Relaxed); info!( self.logger, - "postpone destroy because there're pending tombstone tablets" + "postpone destroy because there're pending tombstone tablets"; + "applied_index" => applied_index, + "last_index" => last_index, + "persisted_applied" => persisted, ); return true; } diff --git a/components/raftstore-v2/src/operation/misc.rs b/components/raftstore-v2/src/operation/misc.rs index 867b4192dac..fafca29ea85 100644 --- a/components/raftstore-v2/src/operation/misc.rs +++ b/components/raftstore-v2/src/operation/misc.rs @@ -102,7 +102,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T> StoreFsmDelegate<'a, EK, ER, T> { self.store_ctx.cfg.region_compact_min_tombstones, self.store_ctx.cfg.region_compact_tombstones_percent, self.store_ctx.cfg.region_compact_min_redundant_rows, - self.store_ctx.cfg.region_compact_redundant_rows_percent, + self.store_ctx.cfg.region_compact_redundant_rows_percent(), ), })) { diff --git a/components/raftstore-v2/src/operation/mod.rs b/components/raftstore-v2/src/operation/mod.rs index 8ce592dd753..6c43fcdaa3b 100644 --- a/components/raftstore-v2/src/operation/mod.rs +++ b/components/raftstore-v2/src/operation/mod.rs @@ -2,6 +2,7 @@ mod bucket; mod command; +mod disk_snapshot_backup; mod life; mod misc; mod pd; @@ -17,6 +18,7 @@ pub use command::{ SplitFlowControl, SplitPendingAppend, MERGE_IN_PROGRESS_PREFIX, MERGE_SOURCE_PREFIX, SPLIT_PREFIX, }; +pub use disk_snapshot_backup::UnimplementedHandle as DiskSnapBackupHandle; pub use life::{AbnormalPeerContext, DestroyProgress, GcPeerContext}; pub use ready::{ write_initial_states, ApplyTrace, AsyncWriter, DataTrace, GenSnapTask, ReplayWatch, SnapState, @@ -24,7 +26,6 @@ pub use ready::{ }; pub(crate) use self::{ - bucket::BucketStatsInfo, command::SplitInit, query::{LocalReader, ReadDelegatePair, SharedReadTablet}, txn_ext::TxnContext, @@ -87,7 +88,7 @@ pub mod test_util { let mut header = Box::::default(); header.set_region_id(region_id); header.set_region_epoch(region_epoch); - let req_encoder = SimpleWriteReqEncoder::new(header, encoder.encode(), 512, false); + let req_encoder = SimpleWriteReqEncoder::new(header, encoder.encode(), 512); let (bin, _) = req_encoder.encode(); let mut e = Entry::default(); e.set_entry_type(EntryType::EntryNormal); @@ -112,7 +113,7 @@ pub mod test_util { let mut header = Box::::default(); header.set_region_id(region_id); header.set_region_epoch(region_epoch); - let req_encoder = SimpleWriteReqEncoder::new(header, encoder.encode(), 512, false); + let req_encoder = SimpleWriteReqEncoder::new(header, encoder.encode(), 512); let (bin, _) = req_encoder.encode(); let mut e = Entry::default(); e.set_entry_type(EntryType::EntryNormal); diff --git a/components/raftstore-v2/src/operation/pd.rs b/components/raftstore-v2/src/operation/pd.rs index 817b3aa6eb6..8e392755c5e 100644 --- a/components/raftstore-v2/src/operation/pd.rs 
+++ b/components/raftstore-v2/src/operation/pd.rs @@ -103,7 +103,7 @@ impl Peer { let task = pd::Task::RegionHeartbeat(pd::RegionHeartbeatTask { term: self.term(), region: self.region().clone(), - down_peers: self.collect_down_peers(ctx.cfg.max_peer_down_duration.0), + down_peers: self.collect_down_peers(ctx), peer: self.peer().clone(), pending_peers: self.collect_pending_peers(ctx), written_bytes: self.self_stat().written_bytes, @@ -215,6 +215,7 @@ impl Peer { &self, ctx: &StoreContext, split_keys: Vec>, + share_source_region_size: bool, ch: CmdResChannel, ) { let task = pd::Task::AskBatchSplit { @@ -222,6 +223,7 @@ impl Peer { split_keys, peer: self.peer().clone(), right_derive: ctx.cfg.right_derive_when_split, + share_source_region_size, ch, }; if let Err(e) = ctx.schedulers.pd.schedule(task) { diff --git a/components/raftstore-v2/src/operation/query/lease.rs b/components/raftstore-v2/src/operation/query/lease.rs index 84a8ad09ed3..189986f93d2 100644 --- a/components/raftstore-v2/src/operation/query/lease.rs +++ b/components/raftstore-v2/src/operation/query/lease.rs @@ -168,7 +168,7 @@ impl Peer { header.set_term(self.term()); let empty_data = SimpleWriteEncoder::with_capacity(0).encode(); let (ch, _) = CmdResChannel::pair(); - self.on_simple_write(ctx, header, empty_data, ch); + self.on_simple_write(ctx, header, empty_data, ch, None); } /// respond to the read index request diff --git a/components/raftstore-v2/src/operation/query/local.rs b/components/raftstore-v2/src/operation/query/local.rs index 36dbb26e4c7..ea802650f3d 100644 --- a/components/raftstore-v2/src/operation/query/local.rs +++ b/components/raftstore-v2/src/operation/query/local.rs @@ -28,6 +28,7 @@ use raftstore::{ use slog::{debug, Logger}; use tikv_util::{box_err, codec::number::decode_u64, time::monotonic_raw_now, Either}; use time::Timespec; +use tracker::{get_tls_tracker_token, GLOBAL_TRACKERS}; use txn_types::WriteBatchFlags; use crate::{ @@ -335,7 +336,12 @@ where async move { let (mut fut, mut reader) = match res { - Either::Left(Ok(snap)) => return Ok(snap), + Either::Left(Ok(snap)) => { + GLOBAL_TRACKERS.with_tracker(get_tls_tracker_token(), |t| { + t.metrics.local_read = true; + }); + return Ok(snap); + } Either::Left(Err(e)) => return Err(e), Either::Right((fut, reader)) => (fut, reader), }; @@ -580,6 +586,10 @@ impl<'r> SnapRequestInspector<'r> { )); } + fail::fail_point!("perform_read_index", |_| Ok(ReadRequestPolicy::ReadIndex)); + + fail::fail_point!("perform_read_local", |_| Ok(ReadRequestPolicy::ReadLocal)); + let flags = WriteBatchFlags::from_bits_check(req.get_header().get_flags()); if flags.contains(WriteBatchFlags::STALE_READ) { return Ok(ReadRequestPolicy::StaleRead); diff --git a/components/raftstore-v2/src/operation/query/mod.rs b/components/raftstore-v2/src/operation/query/mod.rs index 2f1b1cd0138..10f6e3279c3 100644 --- a/components/raftstore-v2/src/operation/query/mod.rs +++ b/components/raftstore-v2/src/operation/query/mod.rs @@ -471,6 +471,7 @@ impl Peer { // Only leaders need to update applied_term.
if progress_to_be_updated && self.is_leader() { if applied_term == self.term() { + fail::fail_point!("on_applied_current_term"); ctx.coprocessor_host .on_applied_current_term(StateRole::Leader, self.region()); } diff --git a/components/raftstore-v2/src/operation/ready/apply_trace.rs b/components/raftstore-v2/src/operation/ready/apply_trace.rs index d4743448d07..e839089837d 100644 --- a/components/raftstore-v2/src/operation/ready/apply_trace.rs +++ b/components/raftstore-v2/src/operation/ready/apply_trace.rs @@ -29,6 +29,7 @@ use std::{ cmp, + collections::VecDeque, path::Path, sync::{atomic::Ordering, mpsc::SyncSender, Mutex}, }; @@ -46,7 +47,7 @@ use kvproto::{ use raftstore::store::{ util, ReadTask, TabletSnapManager, WriteTask, RAFT_INIT_LOG_INDEX, RAFT_INIT_LOG_TERM, }; -use slog::{info, trace, Logger}; +use slog::{info, trace, warn, Logger}; use tikv_util::{box_err, slog_panic, worker::Scheduler}; use crate::{ @@ -56,7 +57,7 @@ use crate::{ ready::snapshot::{install_tablet, recv_snap_path}, }, raft::{Peer, Storage}, - router::PeerMsg, + router::{PeerMsg, SstApplyIndex}, worker::tablet, Result, StoreRouter, }; @@ -138,7 +139,7 @@ impl engine_traits::StateStorage for StateStorage< /// Mapping from data cf to a u64 index. pub type DataTrace = [u64; DATA_CFS_LEN]; -#[derive(Clone, Copy, Default, Debug)] +#[derive(Clone, Default, Debug)] struct Progress { flushed: u64, /// The index of last entry that has modification to the CF. The value /// /// If `flushed` == `last_modified`, then all data in the CF is persisted. last_modified: u64, + // Ranges of applied indexes whose SSTs have been ingested but not yet flushed. + pending_sst_ranges: VecDeque, } + +// A range representing [start, end], upper bound inclusive for handling +// convenience. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +struct IndexRange(u64, u64); + +#[derive(Debug)] +// Tracks the global flushed index related to the write task. +struct ReadyFlushedIndex { + ready_number: u64, + flushed_index: u64, } /// `ApplyTrace` is used to track the indexes of modifications and flushes. @@ -178,6 +193,9 @@ pub struct ApplyTrace { last_flush_trigger: u64, /// `true` means the raft cf record should be persisted in next ready. try_persist: bool, + // Because we persist the global flushed index in the write task, we should track + // the task and handle sst cleanup after the write task finishes. + flushed_index_queue: VecDeque, } impl ApplyTrace { @@ -230,6 +248,25 @@ impl ApplyTrace { self.admin.last_modified = index; } + pub fn on_sst_ingested(&mut self, sst_applied_index: &[SstApplyIndex]) { + use std::cmp::Ordering; + for &SstApplyIndex { cf_index, index } in sst_applied_index { + let p = &mut self.data_cfs[cf_index]; + if p.flushed < index { + let max_idx = p.pending_sst_ranges.iter().last().map(|r| r.1).unwrap_or(0) + 1; + match max_idx.cmp(&index) { + Ordering::Less => { + p.pending_sst_ranges.push_back(IndexRange(index, index)); + } + Ordering::Equal => { + p.pending_sst_ranges.iter_mut().last().unwrap().1 = index; + } + _ => {} + } + } + } + } + pub fn persisted_apply_index(&self) -> u64 { self.persisted_applied } @@ -283,17 +320,45 @@ } }) .min(); + // At best, we can only advance the index to `mem_index`. let candidate = cmp::min(mem_index, min_flushed.unwrap_or(u64::MAX));
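Two pieces of this bookkeeping are worth spelling out. First, `pending_sst_ranges` keeps per-CF ranges of ingested-but-unflushed apply indexes: a new index either extends the last `[start, end]` range by one or opens a new range. A minimal sketch of that maintenance step, with the `flushed` check omitted for brevity:

```rust
use std::collections::VecDeque;

// Local stand-in for the IndexRange in apply_trace.rs.
#[derive(Debug, PartialEq)]
struct IndexRange(u64, u64);

// A new ingested index either extends the last range by one or starts a new
// range; indexes at or below the current end are already covered and ignored.
fn record_ingest(ranges: &mut VecDeque<IndexRange>, index: u64) {
    let next = ranges.back().map(|r| r.1).unwrap_or(0) + 1;
    if next < index {
        ranges.push_back(IndexRange(index, index));
    } else if next == index {
        ranges.back_mut().unwrap().1 = index;
    }
}

fn main() {
    let mut ranges = VecDeque::new();
    for idx in [6, 7, 10] {
        record_ingest(&mut ranges, idx);
    }
    assert_eq!(ranges, VecDeque::from(vec![IndexRange(6, 7), IndexRange(10, 10)]));
}
```

Second, `advance_flushed_index_for_ingest` (defined just below) repeatedly consumes pending ranges that touch the candidate index, because consuming a range in one CF can make a range in another CF contiguous. A standalone sketch with plain `(start, end)` tuples standing in for `IndexRange`:

```rust
use std::collections::VecDeque;

// Pending ingest ranges contiguous with the candidate index extend it; the
// outer loop repeats until a full pass over all CFs makes no progress.
fn advance(cfs: &mut [VecDeque<(u64, u64)>], mut max_index: u64) -> (u64, bool) {
    let mut has_ingest = false;
    loop {
        let mut has_change = false;
        for pending in cfs.iter_mut() {
            while let Some(&(start, end)) = pending.front() {
                if start > max_index + 1 {
                    break;
                }
                if end > max_index {
                    max_index = end;
                    has_change = true;
                }
                pending.pop_front();
                has_ingest = true;
            }
        }
        if !has_change {
            break;
        }
    }
    (max_index, has_ingest)
}

fn main() {
    // CF 0 holds [6, 7], CF 1 holds [8, 9]; starting from 5, both become reachable.
    let mut cfs = vec![VecDeque::from(vec![(6, 7)]), VecDeque::from(vec![(8, 9)])];
    assert_eq!(advance(&mut cfs, 5), (9, true));
}
```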
+ // Try to advance the index if any sst ingestion is adjacent to the flushed + // index, and always trigger a flush if there is any sst ingestion. + let (candidate, has_ingested_sst) = self.advance_flushed_index_for_ingest(candidate); if candidate > self.admin.flushed { self.admin.flushed = candidate; - if self.admin.flushed > self.persisted_applied + 100 { + if has_ingested_sst || (self.admin.flushed > self.persisted_applied + 100) { self.try_persist = true; } } // TODO: persist admin.flushed every 10 minutes. } + fn advance_flushed_index_for_ingest(&mut self, mut max_index: u64) -> (u64, bool) { + let mut has_ingest = false; + loop { + let mut has_change = false; + for p in self.data_cfs.iter_mut() { + while let Some(r) = p.pending_sst_ranges.front_mut() { + if r.0 > max_index + 1 { + break; + } else if r.1 > max_index { + max_index = r.1; + has_change = true; + } + p.pending_sst_ranges.pop_front(); + has_ingest = true; + } + } + if !has_change { + break; + } + } + + (max_index, has_ingest) + } + /// Get the flushed indexes of all data CFs that are needed when recovering /// logs. /// @@ -348,6 +413,38 @@ impl ApplyTrace { fail_point!("should_persist_apply_trace", |_| true); self.try_persist } + + #[inline] + pub fn register_flush_task(&mut self, ready_number: u64, flushed_index: u64) { + assert!( + self.flushed_index_queue + .iter() + .last() + .map(|f| f.ready_number) + .unwrap_or(0) + < ready_number + ); + self.flushed_index_queue.push_back(ReadyFlushedIndex { + ready_number, + flushed_index, + }); + } + + #[inline] + pub fn take_flush_index(&mut self, ready_number: u64) -> Option { + use std::cmp::Ordering; + while let Some(r) = self.flushed_index_queue.pop_front() { + match r.ready_number.cmp(&ready_number) { + Ordering::Equal => return Some(r.flushed_index), + Ordering::Greater => { + self.flushed_index_queue.push_front(r); + break; + } + _ => {} + } + } + None + } } impl Storage { @@ -546,6 +643,7 @@ impl Storage { .unwrap(); trace.try_persist = false; trace.persisted_applied = trace.admin.flushed; + trace.register_flush_task(write_task.ready_number(), trace.admin.flushed); } } @@ -566,24 +664,7 @@ impl Peer { let apply_trace = self.storage_mut().apply_trace_mut(); apply_trace.on_flush(cf, index); apply_trace.maybe_advance_admin_flushed(apply_index); - let stale_ssts = self.sst_apply_state().stale_ssts(cf, index); - if stale_ssts.is_empty() { - return; - } - info!( - self.logger, - "schedule delete stale ssts after flush"; - "stale_ssts" => ?stale_ssts, - "apply_index" => apply_index, - "cf" => cf, - "flushed_index" => index, - ); - let _ = ctx - .schedulers - .tablet - .schedule(tablet::Task::CleanupImportSst( - stale_ssts.into_boxed_slice(), - )); + self.cleanup_stale_ssts(ctx, &[cf], index, apply_index); } pub fn on_data_modified(&mut self, modification: DataTrace) { @@ -598,6 +679,38 @@ impl Peer { apply_trace.maybe_advance_admin_flushed(apply_index); } + pub fn cleanup_stale_ssts( + &mut self, + ctx: &mut StoreContext, + cfs: &[&str], + index: u64, + apply_index: u64, + ) { + let mut stale_ssts = vec![]; + for cf in cfs { + let ssts = self.sst_apply_state().stale_ssts(cf, index); + if !ssts.is_empty() { + info!( + self.logger, + "schedule delete stale ssts after flush"; + "stale_ssts" => ?ssts, + "apply_index" => apply_index, + "cf" => cf, + "flushed_index" => index, + ); + stale_ssts.extend(ssts); + } + } + if !stale_ssts.is_empty() { + _ = ctx + .schedulers + .tablet + .schedule(tablet::Task::CleanupImportSst( + stale_ssts.into_boxed_slice(), + )); + } + } + pub fn flush_before_close(&mut self, ctx: &StoreContext, tx: SyncSender<()>) { info!( self.logger, ); let region_id =
self.region_id(); let flush_threshold: u64 = (|| { - fail_point!("flush_before_cluse_threshold", |t| { + fail_point!("flush_before_close_threshold", |t| { t.unwrap().parse::().unwrap() }); 50 @@ -619,7 +732,18 @@ impl Peer { // flush the oldest cf one by one until we are under the replay count threshold loop { let replay_count = self.storage().estimate_replay_count(); - if replay_count < flush_threshold { + if replay_count < flush_threshold || tried_count == 3 { + // Ideally, the replay count should be 0 after three flush_oldest_cf calls. If not, + // there may be a bug, but it's not desirable to block here, so we try at most + // three times. + if replay_count >= flush_threshold && tried_count == 3 { + warn!( + self.logger, + "after three flush_oldest_cf, the expected replay count still exceeds the threshold"; + "replay_count" => replay_count, + "threshold" => flush_threshold, + ); + } if flushed { let admin_flush = self.storage_mut().apply_trace_mut().admin.flushed; let (_, _, tablet_index) = ctx @@ -678,7 +802,7 @@ impl Peer { #[cfg(test)] mod tests { - use engine_traits::RaftEngineReadOnly; + use engine_traits::{CfName, RaftEngineReadOnly}; use kvproto::metapb::Peer; use tempfile::TempDir; @@ -798,6 +922,93 @@ mod tests { // Because the modification is recorded, we know there should be no admin // modification and the index can be advanced. assert_eq!(5, trace.admin.flushed); + + fn range_equals(trace: &ApplyTrace, cf: &str, expected: Vec) { + let pending_ranges = &trace.data_cfs[data_cf_offset(cf)].pending_sst_ranges; + assert_eq!( + pending_ranges.len(), + expected.len(), + "actual: {:?}, expected: {:?}", + pending_ranges, + &expected + ); + pending_ranges + .iter() + .zip(expected.iter()) + .for_each(|(r, e)| { + assert_eq!(r, e); + }); + } + + trace.on_modify(CF_DEFAULT, 8); + let ingested_ssts_idx = + make_sst_apply_index(vec![(CF_DEFAULT, 6), (CF_WRITE, 6), (CF_WRITE, 7)]); + trace.on_sst_ingested(&ingested_ssts_idx); + range_equals(&trace, CF_DEFAULT, vec![IndexRange(6, 6)]); + range_equals(&trace, CF_WRITE, vec![IndexRange(6, 7)]); + trace.maybe_advance_admin_flushed(8); + assert_eq!(7, trace.admin.flushed); + for cf in [CF_DEFAULT, CF_WRITE] { + assert_eq!( + trace.data_cfs[data_cf_offset(cf)].pending_sst_ranges.len(), + 0 + ); + } + trace.on_modify(CF_DEFAULT, 10); + let ingested_ssts_idx = make_sst_apply_index(vec![(CF_DEFAULT, 10)]); + trace.on_sst_ingested(&ingested_ssts_idx); + trace.on_flush(CF_DEFAULT, 8); + trace.maybe_advance_admin_flushed(10); + assert_eq!(8, trace.admin.flushed); + range_equals(&trace, CF_DEFAULT, vec![IndexRange(10, 10)]); + + trace.on_modify(CF_DEFAULT, 16); + let ingested_ssts_idx = make_sst_apply_index(vec![ + (CF_DEFAULT, 11), + (CF_WRITE, 12), + (CF_LOCK, 13), + (CF_DEFAULT, 14), + (CF_WRITE, 14), + (CF_WRITE, 15), + (CF_LOCK, 16), + ]); + trace.on_sst_ingested(&ingested_ssts_idx); + range_equals( + &trace, + CF_DEFAULT, + vec![IndexRange(10, 11), IndexRange(14, 14)], + ); + range_equals( + &trace, + CF_WRITE, + vec![IndexRange(12, 12), IndexRange(14, 15)], + ); + range_equals( + &trace, + CF_LOCK, + vec![IndexRange(13, 13), IndexRange(16, 16)], + ); + trace.maybe_advance_admin_flushed(16); + assert_eq!(8, trace.admin.flushed); + + trace.on_flush(CF_DEFAULT, 9); + trace.maybe_advance_admin_flushed(16); + assert_eq!(16, trace.admin.flushed); + for cf in DATA_CFS { + assert_eq!( + trace.data_cfs[data_cf_offset(cf)].pending_sst_ranges.len(), + 0 + ); + } + } + + fn make_sst_apply_index(data: Vec<(CfName, u64)>) -> Vec { + data.into_iter() + .map(|d| SstApplyIndex { + cf_index: data_cf_offset(d.0), + index: d.1, + }) + .collect() } #[test] diff --git a/components/raftstore-v2/src/operation/ready/mod.rs b/components/raftstore-v2/src/operation/ready/mod.rs index b985fd69c27..95eee272a80 100644 --- a/components/raftstore-v2/src/operation/ready/mod.rs +++ b/components/raftstore-v2/src/operation/ready/mod.rs @@ -31,7 +31,7 @@ use std::{ time::Instant, }; -use engine_traits::{KvEngine, RaftEngine}; +use engine_traits::{KvEngine, RaftEngine, DATA_CFS}; use error_code::ErrorCodeExt; use kvproto::{ raft_cmdpb::AdminCmdType, @@ -43,6 +43,7 @@ use raftstore::{ coprocessor::{RegionChangeEvent, RoleChange}, store::{ fsm::store::StoreRegionMeta, + local_metrics::IoType, needs_evict_entry_cache, util::{self, is_first_append_entry, is_initial_msg}, worker_metrics::SNAP_COUNTER, @@ -54,7 +55,8 @@ use tikv_util::{ log::SlogFormat, slog_panic, store::find_peer, - time::{duration_to_sec, monotonic_raw_now, Duration}, + sys::disk::DiskUsage, + time::{duration_to_sec, monotonic_raw_now, Duration, Instant as TiInstant}, }; pub use self::{ @@ -247,10 +249,18 @@ impl Peer { } } + pub fn on_store_maybe_tombstone(&mut self, store_id: u64) { + if !self.is_leader() { + return; + } + self.on_store_maybe_tombstone_gc_peer(store_id); + } + pub fn on_raft_message( &mut self, ctx: &mut StoreContext, mut msg: Box, + send_time: Option, ) { debug!( self.logger, "message_type" => %util::MsgType(&msg), "from_peer_id" => msg.get_from_peer().get_id(), "to_peer_id" => msg.get_to_peer().get_id(), + "disk_usage" => ?msg.disk_usage, ); + if let Some(send_time) = send_time { + let process_wait_time = send_time.saturating_elapsed(); + ctx.raft_metrics + .process_wait_time + .observe(duration_to_sec(process_wait_time)); + } + if self.pause_for_replay() && msg.get_message().get_msg_type() == MessageType::MsgAppend { ctx.raft_metrics.message_dropped.recovery.inc(); return; @@ -280,6 +298,9 @@ impl Peer { return; } } + + self.handle_reported_disk_usage(ctx, &msg); + if msg.get_to_peer().get_store_id() != self.peer().get_store_id() { ctx.raft_metrics.message_dropped.mismatch_store_id.inc(); return; @@ -418,9 +439,10 @@ impl Peer { return; } + let msg_type = msg.get_message().get_msg_type(); // This can be a message that was sent when it was still a follower. Nevertheless, // it's meaningless to continue to handle the request as callbacks are cleared. - if msg.get_message().get_msg_type() == MessageType::MsgReadIndex + if msg_type == MessageType::MsgReadIndex && self.is_leader() && (msg.get_message().get_from() == raft::INVALID_ID || msg.get_message().get_from() == self.peer_id()) @@ -429,14 +451,18 @@ impl Peer { return; } - if msg.get_message().get_msg_type() == MessageType::MsgReadIndex + if msg_type == MessageType::MsgReadIndex && self.is_leader() && self.on_step_read_index(ctx, msg.mut_message()) { // The read index has been responded in `on_step_read_index`; // no need to step again. } else if let Err(e) = self.raft_group_mut().step(msg.take_message()) { - error!(self.logger, "raft step error"; "err" => ?e); + error!(self.logger, "raft step error"; + "from_peer" => ?msg.get_from_peer(), + "region_epoch" => ?msg.get_region_epoch(), + "message_type" => ?msg_type, + "err" => ?e); } else { let committed_index = self.raft_group().raft.raft_log.committed; self.report_commit_log_duration(ctx, pre_committed_index, committed_index); @@ -503,7 +529,11 @@ impl Peer { /// /// If the recipient can't be found, `None` is returned.
#[inline] - fn build_raft_message(&mut self, msg: eraftpb::Message) -> Option { + fn build_raft_message( + &mut self, + msg: eraftpb::Message, + disk_usage: DiskUsage, + ) -> Option { let to_peer = match self.peer_from_cache(msg.to) { Some(p) => p, None => { @@ -518,6 +548,8 @@ impl Peer { }; let mut raft_msg = self.prepare_raft_message(); + // Fill in the disk usage. + raft_msg.set_disk_usage(disk_usage); raft_msg.set_to_peer(to_peer); if msg.from != self.peer().id { @@ -760,8 +792,9 @@ impl Peer { if !ready.messages().is_empty() { debug_assert!(self.is_leader()); + let disk_usage = ctx.self_disk_usage; for msg in ready.take_messages() { - if let Some(msg) = self.build_raft_message(msg) { + if let Some(msg) = self.build_raft_message(msg, disk_usage) { self.send_raft_message_on_leader(ctx, msg); } } @@ -786,14 +819,15 @@ impl Peer { self.merge_state_changes_to(&mut write_task); self.storage_mut() .handle_raft_ready(ctx, &mut ready, &mut write_task); - self.try_compelete_recovery(); + self.try_complete_recovery(); self.on_advance_persisted_apply_index(ctx, prev_persisted, &mut write_task); if !ready.persisted_messages().is_empty() { + let disk_usage = ctx.self_disk_usage; write_task.messages = ready .take_persisted_messages() .into_iter() - .flat_map(|m| self.build_raft_message(m)) + .flat_map(|m| self.build_raft_message(m, disk_usage)) .collect(); } if self.has_pending_messages() { @@ -896,6 +930,14 @@ impl Peer { self.storage_mut() .entry_storage_mut() .update_cache_persisted(persisted_index); + if let Some(idx) = self + .storage_mut() + .apply_trace_mut() + .take_flush_index(ready_number) + { + let apply_index = self.flush_state().applied_index(); + self.cleanup_stale_ssts(ctx, DATA_CFS, idx, apply_index); + } if self.is_in_force_leader() { // forward commit index, the committed entries will be applied in @@ -948,7 +990,7 @@ impl Peer { return; } let now = Instant::now(); - let stat_raft_commit_log = &mut ctx.raft_metrics.stat_commit_log; + let health_stats = &mut ctx.raft_metrics.health_stats; for i in old_index + 1..=new_index { if let Some((term, trackers)) = self.proposals().find_trackers(i) { if self.entry_storage().term(i).map_or(false, |t| t == term) { @@ -961,14 +1003,11 @@ impl Peer { for tracker in trackers { // Collect the metrics related to commit_log // durations. - stat_raft_commit_log.record(Duration::from_nanos(tracker.observe( - now, - hist, - |t| { - t.metrics.commit_not_persisted = !commit_persisted; - &mut t.metrics.wf_commit_log_nanos - }, - ))); + let duration = tracker.observe(now, hist, |t| { + t.metrics.commit_not_persisted = !commit_persisted; + &mut t.metrics.wf_commit_log_nanos + }); + health_stats.observe(Duration::from_nanos(duration), IoType::Network); } } } @@ -1049,6 +1088,16 @@ impl Peer { // Exit entry cache warmup state when the peer becomes leader. 
self.entry_storage_mut().clear_entry_cache_warmup_state(); + if !ctx.store_disk_usages.is_empty() { + self.refill_disk_full_peers(ctx); + debug!( + self.logger, + "become leader refills disk full peers to {:?}", + self.abnormal_peer_context().disk_full_peers(); + "region_id" => self.region_id(), + ); + } + self.region_heartbeat_pd(ctx); self.add_pending_tick(PeerTick::CompactLog); self.add_pending_tick(PeerTick::SplitRegionCheck); @@ -1189,6 +1238,52 @@ impl Peer { ); } } + + fn handle_reported_disk_usage( + &mut self, + ctx: &mut StoreContext, + msg: &RaftMessage, + ) { + let store_id = msg.get_from_peer().get_store_id(); + let peer_id = msg.get_from_peer().get_id(); + let disk_full_peers = self.abnormal_peer_context().disk_full_peers(); + let refill_disk_usages = if matches!(msg.disk_usage, DiskUsage::Normal) { + ctx.store_disk_usages.remove(&store_id); + if !self.is_leader() { + return; + } + disk_full_peers.has(peer_id) + } else { + ctx.store_disk_usages.insert(store_id, msg.disk_usage); + if !self.is_leader() { + return; + } + + disk_full_peers.is_empty() + || disk_full_peers + .get(peer_id) + .map_or(true, |x| x != msg.disk_usage) + }; + + if refill_disk_usages || self.has_region_merge_proposal { + let prev = disk_full_peers.get(peer_id); + if Some(msg.disk_usage) != prev { + info!( + self.logger, + "reported disk usage changes {:?} -> {:?}", prev, msg.disk_usage; + "region_id" => self.region_id(), + "peer_id" => peer_id, + ); + } + self.refill_disk_full_peers(ctx); + debug!( + self.logger, + "raft message refills disk full peers to {:?}", + self.abnormal_peer_context().disk_full_peers(); + "region_id" => self.region_id(), + ); + } + } } impl Storage { diff --git a/components/raftstore-v2/src/operation/ready/snapshot.rs b/components/raftstore-v2/src/operation/ready/snapshot.rs index 17deed333c1..9e0ed449cef 100644 --- a/components/raftstore-v2/src/operation/ready/snapshot.rs +++ b/components/raftstore-v2/src/operation/ready/snapshot.rs @@ -343,10 +343,12 @@ impl Peer { } self.schedule_apply_fsm(ctx); if self.remove_tombstone_tablets(snapshot_index) { + let counter = self.remember_persisted_tablet_index(); let _ = ctx .schedulers .tablet .schedule(tablet::Task::destroy(region_id, snapshot_index)); + counter.store(snapshot_index, Ordering::Relaxed); } if let Some(msg) = self.split_pending_append_mut().take_append_message() { let _ = ctx.router.send_raft_message(msg); diff --git a/components/raftstore-v2/src/operation/txn_ext.rs b/components/raftstore-v2/src/operation/txn_ext.rs index 272b2526b39..6a379b9a1a2 100644 --- a/components/raftstore-v2/src/operation/txn_ext.rs +++ b/components/raftstore-v2/src/operation/txn_ext.rs @@ -9,11 +9,15 @@ use std::sync::{atomic::Ordering, Arc}; use crossbeam::atomic::AtomicCell; use engine_traits::{KvEngine, RaftEngine, CF_LOCK}; -use kvproto::{kvrpcpb::ExtraOp, metapb::Region, raft_cmdpb::RaftRequestHeader}; +use kvproto::{ + kvrpcpb::{DiskFullOpt, ExtraOp}, + metapb::Region, + raft_cmdpb::RaftRequestHeader, +}; use parking_lot::RwLockWriteGuard; use raft::eraftpb; use raftstore::store::{ - LocksStatus, PeerPessimisticLocks, TxnExt, TRANSFER_LEADER_COMMAND_REPLY_CTX, + LocksStatus, PeerPessimisticLocks, RaftCmdExtraOpts, TxnExt, TRANSFER_LEADER_COMMAND_REPLY_CTX, }; use slog::{error, info, Logger}; @@ -266,8 +270,17 @@ impl Peer { self.logger, "propose {} locks before transferring leader", lock_count; ); - let PeerMsg::SimpleWrite(write) = PeerMsg::simple_write(header, encoder.encode()).0 else {unreachable!()}; - self.on_simple_write(ctx, 
write.header, write.data, write.ch); + let PeerMsg::SimpleWrite(write) = PeerMsg::simple_write_with_opt(header, encoder.encode(), RaftCmdExtraOpts { + disk_full_opt: DiskFullOpt::AllowedOnAlmostFull, + ..Default::default() + }).0 else {unreachable!()}; + self.on_simple_write( + ctx, + write.header, + write.data, + write.ch, + Some(write.extra_opts), + ); true } } diff --git a/components/raftstore-v2/src/operation/unsafe_recovery/demote.rs b/components/raftstore-v2/src/operation/unsafe_recovery/demote.rs index 37962a45452..20a42b9f978 100644 --- a/components/raftstore-v2/src/operation/unsafe_recovery/demote.rs +++ b/components/raftstore-v2/src/operation/unsafe_recovery/demote.rs @@ -75,6 +75,7 @@ impl Peer { "Unsafe recovery, fail to finish demotion"; "err" => ?resp.get_header().get_error(), ); + *self.unsafe_recovery_state_mut() = Some(UnsafeRecoveryState::Failed); return; } *self.unsafe_recovery_state_mut() = Some(UnsafeRecoveryState::DemoteFailedVoters { @@ -129,6 +130,7 @@ impl Peer { "Unsafe recovery, fail to exit joint state"; "err" => ?resp.get_header().get_error(), ); + *self.unsafe_recovery_state_mut() = Some(UnsafeRecoveryState::Failed); } } else { error!(self.logger, diff --git a/components/raftstore-v2/src/operation/unsafe_recovery/force_leader.rs b/components/raftstore-v2/src/operation/unsafe_recovery/force_leader.rs index ba7e391dbef..e6af0fddb7b 100644 --- a/components/raftstore-v2/src/operation/unsafe_recovery/force_leader.rs +++ b/components/raftstore-v2/src/operation/unsafe_recovery/force_leader.rs @@ -5,7 +5,9 @@ use std::mem; use collections::HashSet; use engine_traits::{KvEngine, RaftEngine}; use raft::{eraftpb::MessageType, StateRole, Storage}; -use raftstore::store::{util::LeaseState, ForceLeaderState, UnsafeRecoveryForceLeaderSyncer}; +use raftstore::store::{ + util::LeaseState, ForceLeaderState, UnsafeRecoveryForceLeaderSyncer, UnsafeRecoveryState, +}; use slog::{info, warn}; use tikv_util::time::Instant as TiInstant; @@ -182,11 +184,20 @@ impl Peer { self.set_has_ready(); } - pub fn on_exit_force_leader(&mut self, ctx: &StoreContext) { + // TODO: add exit force leader check tick for raftstore v2 + pub fn on_exit_force_leader(&mut self, ctx: &StoreContext, force: bool) { if !self.has_force_leader() { return; } + if let Some(UnsafeRecoveryState::Failed) = self.unsafe_recovery_state() && !force { + // The plan failed, so skip exiting force leader and keep the force leader + // state for the next retry of the plan. + info!( + self.logger, "skip exiting force leader state" + ); + return; + } + info!(self.logger, "exit force leader state"); *self.force_leader_mut() = None; // leader lease shouldn't be renewed in force leader state.
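Back in `handle_reported_disk_usage` above, the refill decision reduces to: a peer reporting Normal triggers a refill only if it was previously tracked as full, while a peer reporting full triggers one if it is newly tracked or its level changed. A runnable sketch of that predicate with local stub types (the real code also consults leadership and pending merge proposals):

```rust
use std::collections::HashMap;

// Local stand-in for the kvproto DiskUsage enum.
#[derive(Clone, Copy, PartialEq, Debug)]
enum DiskUsage { Normal, AlmostFull, AlreadyFull }

// Whether a disk-usage report from `peer_id` should trigger refill_disk_full_peers.
fn needs_refill(
    reported: DiskUsage,
    peer_id: u64,
    disk_full_peers: &HashMap<u64, DiskUsage>,
) -> bool {
    if reported == DiskUsage::Normal {
        // Peer recovered: refill only if we previously tracked it as full.
        disk_full_peers.contains_key(&peer_id)
    } else {
        // Peer reports full: refill if it is new or its usage level changed.
        disk_full_peers.is_empty()
            || disk_full_peers.get(&peer_id).map_or(true, |u| *u != reported)
    }
}

fn main() {
    let mut tracked = HashMap::new();
    assert!(needs_refill(DiskUsage::AlmostFull, 2, &tracked));
    tracked.insert(2, DiskUsage::AlmostFull);
    assert!(!needs_refill(DiskUsage::AlmostFull, 2, &tracked)); // unchanged level
    assert!(needs_refill(DiskUsage::AlreadyFull, 2, &tracked)); // level changed
    assert!(needs_refill(DiskUsage::Normal, 2, &tracked)); // recovered
}
```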
diff --git a/components/raftstore-v2/src/operation/unsafe_recovery/report.rs b/components/raftstore-v2/src/operation/unsafe_recovery/report.rs index 7173d00363a..90c8e3db34d 100644 --- a/components/raftstore-v2/src/operation/unsafe_recovery/report.rs +++ b/components/raftstore-v2/src/operation/unsafe_recovery/report.rs @@ -44,11 +44,19 @@ impl Peer { self.raft_group().raft.raft_log.committed }; - *self.unsafe_recovery_state_mut() = Some(UnsafeRecoveryState::WaitApply { - target_index, - syncer, - }); - self.unsafe_recovery_maybe_finish_wait_apply(!self.serving()); + if target_index > self.raft_group().raft.raft_log.applied { + info!( + self.logger, + "Unsafe recovery, start wait apply"; + "target_index" => target_index, + "applied" => self.raft_group().raft.raft_log.applied, + ); + *self.unsafe_recovery_state_mut() = Some(UnsafeRecoveryState::WaitApply { + target_index, + syncer, + }); + self.unsafe_recovery_maybe_finish_wait_apply(!self.serving()); + } } pub fn unsafe_recovery_maybe_finish_wait_apply(&mut self, force: bool) { @@ -113,7 +121,7 @@ impl Peer { Some(UnsafeRecoveryState::DemoteFailedVoters { .. }) => { self.unsafe_recovery_maybe_finish_demote_failed_voters(ctx) } - Some(UnsafeRecoveryState::Destroy(_)) | None => {} + Some(UnsafeRecoveryState::Destroy(_)) | Some(UnsafeRecoveryState::Failed) | None => {} } } } diff --git a/components/raftstore-v2/src/raft/apply.rs b/components/raftstore-v2/src/raft/apply.rs index 3e660c4549c..f3aa5a541c1 100644 --- a/components/raftstore-v2/src/raft/apply.rs +++ b/components/raftstore-v2/src/raft/apply.rs @@ -20,7 +20,7 @@ use tikv_util::{log::SlogFormat, worker::Scheduler, yatp_pool::FuturePool}; use crate::{ operation::{AdminCmdResult, ApplyFlowControl, DataTrace}, - router::CmdResChannel, + router::{CmdResChannel, SstApplyIndex}, TabletTask, }; @@ -64,6 +64,7 @@ pub struct Apply { admin_cmd_result: Vec, flush_state: Arc, sst_apply_state: SstApplyState, + sst_applied_index: Vec, /// The flushed indexes of each column family before being restarted. 
/// /// If an apply index is less than the flushed index, the log can be @@ -138,6 +139,7 @@ impl Apply { res_reporter, flush_state, sst_apply_state, + sst_applied_index: vec![], log_recovery, metrics: ApplyMetrics::default(), buckets, @@ -308,6 +310,16 @@ impl Apply { &self.sst_apply_state } + #[inline] + pub fn push_sst_applied_index(&mut self, sst_index: SstApplyIndex) { + self.sst_applied_index.push(sst_index); + } + + #[inline] + pub fn take_sst_applied_index(&mut self) -> Vec { + mem::take(&mut self.sst_applied_index) + } + #[inline] pub fn log_recovery(&self) -> &Option> { &self.log_recovery diff --git a/components/raftstore-v2/src/raft/peer.rs b/components/raftstore-v2/src/raft/peer.rs index 2f3a3376fe9..f4c3e1d3d6e 100644 --- a/components/raftstore-v2/src/raft/peer.rs +++ b/components/raftstore-v2/src/raft/peer.rs @@ -23,8 +23,9 @@ use raftstore::{ fsm::ApplyMetrics, metrics::RAFT_PEER_PENDING_DURATION, util::{Lease, RegionReadProgress}, - Config, EntryStorage, ForceLeaderState, PeerStat, ProposalQueue, ReadDelegate, - ReadIndexQueue, ReadProgress, TabletSnapManager, UnsafeRecoveryState, WriteTask, + BucketStatsInfo, Config, EntryStorage, ForceLeaderState, PeerStat, ProposalQueue, + ReadDelegate, ReadIndexQueue, ReadProgress, TabletSnapManager, UnsafeRecoveryState, + WriteTask, }, }; use slog::{debug, info, Logger}; @@ -32,11 +33,12 @@ use tikv_util::{slog_panic, time::duration_to_sec}; use super::storage::Storage; use crate::{ + batch::StoreContext, fsm::ApplyScheduler, operation::{ - AbnormalPeerContext, AsyncWriter, BucketStatsInfo, CompactLogContext, DestroyProgress, - GcPeerContext, MergeContext, ProposalControl, ReplayWatch, SimpleWriteReqEncoder, - SplitFlowControl, SplitPendingAppend, TxnContext, + AbnormalPeerContext, AsyncWriter, CompactLogContext, DestroyProgress, GcPeerContext, + MergeContext, ProposalControl, ReplayWatch, SimpleWriteReqEncoder, SplitFlowControl, + SplitPendingAppend, TxnContext, }, router::{ApplyTask, CmdResChannel, PeerTick, QueryResChannel}, Result, @@ -126,6 +128,10 @@ pub struct Peer { abnormal_peer_context: AbnormalPeerContext, + // Region merge logic needs to be broadcast to all followers when disk full happens. + pub has_region_merge_proposal: bool, + pub region_merge_proposal_index: u64, + /// Force leader state is only used in online recovery when the majority of /// peers are missing. In this state, it forces one peer to become leader in violation of the Raft election rule, and forbids any @@ -158,6 +164,7 @@ impl Peer { let region_id = storage.region().get_id(); let tablet_index = storage.region_state().get_tablet_index(); let merge_context = MergeContext::from_region_state(&logger, storage.region_state()); + let persisted_applied = storage.apply_trace().persisted_apply_index(); let raft_group = RawNode::new(&raft_cfg, storage, &logger)?; let region = raft_group.store().region_state().get_region().clone(); @@ -184,7 +191,7 @@ impl Peer { self_stat: PeerStat::default(), peer_cache: vec![], peer_heartbeats: HashMap::default(), - compact_log_context: CompactLogContext::new(applied_index), + compact_log_context: CompactLogContext::new(applied_index, persisted_applied), merge_context: merge_context.map(|c| Box::new(c)), last_sent_snapshot_index: 0, raw_write_encoder: None, @@ -226,10 +233,20 @@ impl Peer { pending_messages: vec![], gc_peer_context: GcPeerContext::default(), abnormal_peer_context: AbnormalPeerContext::default(), + has_region_merge_proposal: false, + region_merge_proposal_index: 0_u64, force_leader_state: None, unsafe_recovery_state: None, }; + // If merge_context is not None, it means the PrepareMerge was applied before + // restart. So we have to enter prepare merge again to block all proposals + // except for RollbackMerge. + if let Some(ref state) = peer.merge_context { + peer.proposal_control + .enter_prepare_merge(state.prepare_merge_index().unwrap()); + } + // If this region has only one peer and I am the one, campaign directly. let region = peer.region(); if region.get_peers().len() == 1 @@ -264,9 +281,6 @@ impl Peer { } /// Set the region of a peer. - /// - /// This will update the region of the peer, caller must ensure the region - /// has been preserved in a durable device. pub fn set_region( &mut self, host: &CoprocessorHost, @@ -493,7 +507,7 @@ impl Peer { // We may have skipped scheduling raft ticks at start due to a noticeable gap // between commit index and apply index. We should schedule them when raft log // apply catches up.
- pub fn try_compelete_recovery(&mut self) { + pub fn try_complete_recovery(&mut self) { if self.pause_for_replay() && self.storage().entry_storage().commit_index() <= self.storage().entry_storage().applied_index() @@ -576,6 +590,11 @@ impl Peer { self.peer_heartbeats.remove(&peer_id); } + #[inline] + pub fn get_peer_heartbeats(&self) -> &HashMap { + &self.peer_heartbeats + } + #[inline] pub fn has_peer(&self, peer_id: u64) -> bool { self.region() @@ -594,7 +613,7 @@ impl Peer { ) } - pub fn collect_down_peers(&mut self, max_duration: Duration) -> Vec { + pub fn collect_down_peers(&mut self, ctx: &StoreContext) -> Vec { let mut down_peers = Vec::new(); let mut down_peer_ids = Vec::new(); let now = Instant::now(); @@ -604,7 +623,7 @@ impl Peer { } if let Some(instant) = self.peer_heartbeats.get(&p.get_id()) { let elapsed = now.saturating_duration_since(*instant); - if elapsed >= max_duration { + if elapsed >= ctx.cfg.max_peer_down_duration.0 { let mut stats = pdpb::PeerStats::default(); stats.set_peer(p.clone()); stats.set_down_seconds(elapsed.as_secs()); @@ -613,8 +632,11 @@ impl Peer { } } } + let exist_down_peers = !down_peer_ids.is_empty(); *self.abnormal_peer_context_mut().down_peers_mut() = down_peer_ids; - // TODO: `refill_disk_full_peers` + if exist_down_peers { + self.refill_disk_full_peers(ctx); + } down_peers } @@ -861,6 +883,16 @@ impl Peer { ) } + #[inline] + pub fn leader_transferee(&self) -> u64 { + self.leader_transferee + } + + #[inline] + pub fn leader_transferring(&self) -> bool { + self.leader_transferee != raft::INVALID_ID + } + #[inline] pub fn long_uncommitted_threshold(&self) -> Duration { Duration::from_secs(self.long_uncommitted_threshold) @@ -909,6 +941,11 @@ impl Peer { self.last_sent_snapshot_index } + #[inline] + pub fn next_proposal_index(&self) -> u64 { + self.raft_group.raft.raft_log.last_index() + 1 + } + #[inline] pub fn index_term(&self, idx: u64) -> u64 { match self.raft_group.raft.raft_log.term(idx) { diff --git a/components/raftstore-v2/src/router/imp.rs b/components/raftstore-v2/src/router/imp.rs index 9c6cca96ae4..e7a63f6d48f 100644 --- a/components/raftstore-v2/src/router/imp.rs +++ b/components/raftstore-v2/src/router/imp.rs @@ -43,12 +43,18 @@ impl AsyncReadNotifier for StoreRouter { } impl raftstore::coprocessor::StoreHandle for StoreRouter { - fn update_approximate_size(&self, region_id: u64, size: u64) { - let _ = self.send(region_id, PeerMsg::UpdateRegionSize { size }); + // TODO: add splitable logic in raftstore-v2 + fn update_approximate_size(&self, region_id: u64, size: Option, _may_split: Option) { + if let Some(size) = size { + let _ = self.send(region_id, PeerMsg::UpdateRegionSize { size }); + } } - fn update_approximate_keys(&self, region_id: u64, keys: u64) { - let _ = self.send(region_id, PeerMsg::UpdateRegionKeys { keys }); + // TODO: add splitable logic in raftstore-v2 + fn update_approximate_keys(&self, region_id: u64, keys: Option, _may_split: Option) { + if let Some(keys) = keys { + let _ = self.send(region_id, PeerMsg::UpdateRegionKeys { keys }); + } } fn ask_split( @@ -58,7 +64,7 @@ impl raftstore::coprocessor::StoreHandle for Store split_keys: Vec>, source: Cow<'static, str>, ) { - let (msg, _) = PeerMsg::request_split(region_epoch, split_keys, source.to_string()); + let (msg, _) = PeerMsg::request_split(region_epoch, split_keys, source.to_string(), true); let res = self.send(region_id, msg); if let Err(e) = res { warn!( diff --git a/components/raftstore-v2/src/router/internal_message.rs 
b/components/raftstore-v2/src/router/internal_message.rs index 6c8d1136b3a..7ac86c3f8c7 100644 --- a/components/raftstore-v2/src/router/internal_message.rs +++ b/components/raftstore-v2/src/router/internal_message.rs @@ -25,4 +25,11 @@ pub struct ApplyRes { pub modifications: DataTrace, pub metrics: ApplyMetrics, pub bucket_stat: Option, + pub sst_applied_index: Vec, +} + +#[derive(Copy, Clone, Debug)] +pub struct SstApplyIndex { + pub cf_index: usize, + pub index: u64, } diff --git a/components/raftstore-v2/src/router/message.rs b/components/raftstore-v2/src/router/message.rs index f09314b4f17..59d1edd8198 100644 --- a/components/raftstore-v2/src/router/message.rs +++ b/components/raftstore-v2/src/router/message.rs @@ -14,7 +14,7 @@ use kvproto::{ }; use raftstore::store::{ fsm::ChangeObserver, metrics::RaftEventDurationType, simple_write::SimpleWriteBinary, - util::LatencyInspector, FetchedLogs, GenSnapRes, TabletSnapKey, + util::LatencyInspector, FetchedLogs, GenSnapRes, RaftCmdExtraOpts, TabletSnapKey, UnsafeRecoveryExecutePlanSyncer, UnsafeRecoveryFillOutReportSyncer, UnsafeRecoveryForceLeaderSyncer, UnsafeRecoveryWaitApplySyncer, }; @@ -134,6 +134,7 @@ pub struct SimpleWrite { pub header: Box, pub data: SimpleWriteBinary, pub ch: CmdResChannel, + pub extra_opts: RaftCmdExtraOpts, } #[derive(Debug)] @@ -156,7 +157,7 @@ pub enum PeerMsg { /// Raft message is the message sent between raft nodes in the same /// raft group. Messages need to be redirected to raftstore if target /// peer doesn't exist. - RaftMessage(Box), + RaftMessage(Box, Option), /// Query won't change any state. A typical query is KV read. In most cases, /// it will be processed using lease or read index. RaftQuery(RaftRequest), @@ -197,6 +198,11 @@ pub enum PeerMsg { StoreUnreachable { to_store_id: u64, }, + // A store may be a tombstone. Use it with caution: it can also mean the store + // is not found, and PD cannot distinguish the two cases now, as PD may delete + // tombstone stores. + StoreMaybeTombstone { + store_id: u64, + }, /// Reports whether the snapshot sending is successful or not.
SnapshotSent { to_peer_id: u64, @@ -291,6 +297,14 @@ impl PeerMsg { pub fn simple_write( header: Box, data: SimpleWriteBinary, + ) -> (Self, CmdResSubscriber) { + PeerMsg::simple_write_with_opt(header, data, RaftCmdExtraOpts::default()) + } + + pub fn simple_write_with_opt( + header: Box, + data: SimpleWriteBinary, + extra_opts: RaftCmdExtraOpts, ) -> (Self, CmdResSubscriber) { let (ch, sub) = CmdResChannel::pair(); ( @@ -299,6 +313,7 @@ impl PeerMsg { header, data, ch, + extra_opts, }), sub, ) @@ -315,6 +330,7 @@ impl PeerMsg { epoch: metapb::RegionEpoch, split_keys: Vec>, source: String, + share_source_region_size: bool, ) -> (Self, CmdResSubscriber) { let (ch, sub) = CmdResChannel::pair(); ( @@ -323,6 +339,7 @@ impl PeerMsg { epoch, split_keys, source: source.into(), + share_source_region_size, }, ch, }, @@ -344,6 +361,7 @@ impl PeerMsg { epoch, split_keys, source: source.into(), + share_source_region_size: false, }, ch, }, diff --git a/components/raftstore-v2/src/router/mod.rs b/components/raftstore-v2/src/router/mod.rs index 7630e35c2a5..d63e1abc733 100644 --- a/components/raftstore-v2/src/router/mod.rs +++ b/components/raftstore-v2/src/router/mod.rs @@ -12,7 +12,7 @@ pub use self::response_channel::FlushChannel; pub use self::response_channel::FlushSubscriber; pub use self::{ imp::{RaftRouter, UnsafeRecoveryRouter}, - internal_message::ApplyRes, + internal_message::{ApplyRes, SstApplyIndex}, message::{PeerMsg, PeerTick, RaftRequest, StoreMsg, StoreTick}, response_channel::{ build_any_channel, AnyResChannel, AnyResSubscriber, BaseSubscriber, CmdResChannel, @@ -20,3 +20,4 @@ pub use self::{ DebugInfoSubscriber, QueryResChannel, QueryResult, ReadResponse, }, }; +pub use super::operation::DiskSnapBackupHandle; diff --git a/components/raftstore-v2/src/worker/pd/mod.rs b/components/raftstore-v2/src/worker/pd/mod.rs index f89ea75b604..520403b6ce3 100644 --- a/components/raftstore-v2/src/worker/pd/mod.rs +++ b/components/raftstore-v2/src/worker/pd/mod.rs @@ -25,6 +25,7 @@ use tikv_util::{ config::VersionTrack, time::{Instant as TiInstant, UnixSecs}, worker::{Runnable, Scheduler}, + InspectFactor, }; use yatp::{task::future::TaskCell, Remote}; @@ -57,7 +58,6 @@ pub enum Task { }, // In region.rs. RegionHeartbeat(RegionHeartbeatTask), - ReportRegionBuckets(BucketStat), UpdateReadStats(ReadStats), UpdateWriteStats(WriteStats), UpdateRegionCpuRecords(Arc), @@ -70,6 +70,7 @@ pub enum Task { split_keys: Vec>, peer: metapb::Peer, right_derive: bool, + share_source_region_size: bool, ch: CmdResChannel, }, ReportBatchSplit { @@ -84,6 +85,7 @@ pub enum Task { initial_status: u64, txn_ext: Arc, }, + // BucketStat is the delta write flow of the bucket. 
ReportBuckets(BucketStat), ReportMinResolvedTs { store_id: u64, @@ -122,7 +124,6 @@ impl Display for Task { hb_task.region, hb_task.peer.get_id(), ), - Task::ReportRegionBuckets(ref buckets) => write!(f, "report buckets: {:?}", buckets), Task::UpdateReadStats(ref stats) => { write!(f, "update read stats: {stats:?}") } @@ -257,6 +258,7 @@ where store_heartbeat_interval / NUM_COLLECT_STORE_INFOS_PER_HEARTBEAT, cfg.value().report_min_resolved_ts_interval.0, cfg.value().inspect_interval.0, + std::time::Duration::default(), PdReporter::new(pd_scheduler, logger.clone()), ); stats_monitor.start( @@ -313,7 +315,6 @@ where write_io_rates, } => self.handle_update_store_infos(cpu_usages, read_io_rates, write_io_rates), Task::RegionHeartbeat(task) => self.handle_region_heartbeat(task), - Task::ReportRegionBuckets(buckets) => self.handle_report_region_buckets(buckets), Task::UpdateReadStats(stats) => self.handle_update_read_stats(stats), Task::UpdateWriteStats(stats) => self.handle_update_write_stats(stats), Task::UpdateRegionCpuRecords(records) => self.handle_update_region_cpu_records(records), @@ -324,7 +325,15 @@ where peer, right_derive, ch, - } => self.handle_ask_batch_split(region, split_keys, peer, right_derive, ch), + share_source_region_size, + } => self.handle_ask_batch_split( + region, + split_keys, + peer, + right_derive, + share_source_region_size, + ch, + ), Task::ReportBatchSplit { regions } => self.handle_report_batch_split(regions), Task::AutoSplit { split_infos } => self.handle_auto_split(split_infos), Task::UpdateMaxTimestamp { @@ -332,7 +341,7 @@ where initial_status, txn_ext, } => self.handle_update_max_timestamp(region_id, initial_status, txn_ext), - Task::ReportBuckets(buckets) => self.handle_report_region_buckets(buckets), + Task::ReportBuckets(delta_buckets) => self.handle_report_region_buckets(delta_buckets), Task::ReportMinResolvedTs { store_id, min_resolved_ts, @@ -429,7 +438,7 @@ impl StoreStatsReporter for PdReporter { } } - fn update_latency_stats(&self, timer_tick: u64) { + fn update_latency_stats(&self, timer_tick: u64, _factor: InspectFactor) { // Tick slowness statistics. 
{ if let Err(e) = self.scheduler.schedule(Task::TickSlownessStats) { diff --git a/components/raftstore-v2/src/worker/pd/region.rs b/components/raftstore-v2/src/worker/pd/region.rs index e825dd54c32..d3ef54bd75a 100644 --- a/components/raftstore-v2/src/worker/pd/region.rs +++ b/components/raftstore-v2/src/worker/pd/region.rs @@ -288,6 +288,7 @@ where epoch, split_keys: split_region.take_keys().into(), source: "pd".into(), + share_source_region_size: false, }, ch, } @@ -338,9 +339,9 @@ where self.is_hb_receiver_scheduled = true; } - pub fn handle_report_region_buckets(&mut self, region_buckets: BucketStat) { - let region_id = region_buckets.meta.region_id; - self.merge_buckets(region_buckets); + pub fn handle_report_region_buckets(&mut self, delta_buckets: BucketStat) { + let region_id = delta_buckets.meta.region_id; + self.merge_buckets(delta_buckets); let report_buckets = self.region_buckets.get_mut(®ion_id).unwrap(); let last_report_ts = if report_buckets.last_report_ts.is_zero() { self.start_ts @@ -387,8 +388,8 @@ where .engine_total_query_num .add_query_stats(®ion_info.query_stats.0); } - for (_, region_buckets) in std::mem::take(&mut stats.region_buckets) { - self.merge_buckets(region_buckets); + for (_, delta_buckets) in std::mem::take(&mut stats.region_buckets) { + self.merge_buckets(delta_buckets); } if !stats.region_infos.is_empty() { self.stats_monitor.maybe_send_read_stats(stats); @@ -423,18 +424,18 @@ where } } - fn merge_buckets(&mut self, mut buckets: BucketStat) { - let region_id = buckets.meta.region_id; + fn merge_buckets(&mut self, mut delta: BucketStat) { + let region_id = delta.meta.region_id; self.region_buckets .entry(region_id) .and_modify(|report_bucket| { let current = &mut report_bucket.current_stat; - if current.meta < buckets.meta { - std::mem::swap(current, &mut buckets); + if current.meta < delta.meta { + std::mem::swap(current, &mut delta); } - current.merge(&buckets); + current.merge(&delta); }) - .or_insert_with(|| ReportBucket::new(buckets)); + .or_insert_with(|| ReportBucket::new(delta)); } fn calculate_region_cpu_records( diff --git a/components/raftstore-v2/src/worker/pd/split.rs b/components/raftstore-v2/src/worker/pd/split.rs index bf13e01120a..7fec5a31bb6 100644 --- a/components/raftstore-v2/src/worker/pd/split.rs +++ b/components/raftstore-v2/src/worker/pd/split.rs @@ -17,10 +17,13 @@ fn new_batch_split_region_request( split_keys: Vec>, ids: Vec, right_derive: bool, + share_source_region_size: bool, ) -> AdminRequest { let mut req = AdminRequest::default(); req.set_cmd_type(AdminCmdType::BatchSplit); req.mut_splits().set_right_derive(right_derive); + req.mut_splits() + .set_share_source_region_size(share_source_region_size); let mut requests = Vec::with_capacity(ids.len()); for (mut id, key) in ids.into_iter().zip(split_keys) { let mut split = SplitRequest::default(); @@ -46,6 +49,7 @@ where split_keys: Vec>, peer: metapb::Peer, right_derive: bool, + share_source_region_size: bool, ch: CmdResChannel, ) { Self::ask_batch_split_imp( @@ -57,6 +61,7 @@ where split_keys, peer, right_derive, + share_source_region_size, Some(ch), ); } @@ -70,6 +75,7 @@ where split_keys: Vec>, peer: metapb::Peer, right_derive: bool, + share_source_region_size: bool, ch: Option, ) { if split_keys.is_empty() { @@ -98,6 +104,7 @@ where split_keys, resp.take_ids().into(), right_derive, + share_source_region_size, ); let region_id = region.get_id(); let epoch = region.take_region_epoch(); @@ -148,6 +155,7 @@ where vec![split_key], split_info.peer, true, + false, None, ); // Try 
to split the region in half within the given key diff --git a/components/raftstore-v2/src/worker/pd/store.rs b/components/raftstore-v2/src/worker/pd/store.rs index a5aad42d85c..75e20a06abd 100644 --- a/components/raftstore-v2/src/worker/pd/store.rs +++ b/components/raftstore-v2/src/worker/pd/store.rs @@ -9,7 +9,7 @@ use kvproto::pdpb; use pd_client::{ metrics::{ REGION_READ_BYTES_HISTOGRAM, REGION_READ_KEYS_HISTOGRAM, REGION_WRITTEN_BYTES_HISTOGRAM, - REGION_WRITTEN_KEYS_HISTOGRAM, STORE_SIZE_GAUGE_VEC, + REGION_WRITTEN_KEYS_HISTOGRAM, STORE_SIZE_EVENT_INT_VEC, }, PdClient, }; @@ -22,6 +22,7 @@ use slog::{error, info, warn}; use tikv_util::{ metrics::RecordPairVec, store::QueryStats, + sys::disk::get_disk_space_stats, time::{Duration, Instant as TiInstant, UnixSecs}, topn::TopN, }; @@ -263,15 +264,9 @@ where self.store_stat.region_bytes_read.flush(); self.store_stat.region_keys_read.flush(); - STORE_SIZE_GAUGE_VEC - .with_label_values(&["capacity"]) - .set(capacity as i64); - STORE_SIZE_GAUGE_VEC - .with_label_values(&["available"]) - .set(available as i64); - STORE_SIZE_GAUGE_VEC - .with_label_values(&["used"]) - .set(used_size as i64); + STORE_SIZE_EVENT_INT_VEC.capacity.set(capacity as i64); + STORE_SIZE_EVENT_INT_VEC.available.set(available as i64); + STORE_SIZE_EVENT_INT_VEC.used.set(used_size as i64); // Update slowness statistics self.update_slowness_in_store_stats(&mut stats, last_query_sum); @@ -447,7 +442,8 @@ where /// Returns (capacity, used, available). fn collect_engine_size(&self) -> Option<(u64, u64, u64)> { - let disk_stats = match fs2::statvfs(self.tablet_registry.tablet_root()) { + let (disk_cap, disk_avail) = match get_disk_space_stats(self.tablet_registry.tablet_root()) + { Err(e) => { error!( self.logger, ); return None; } - Ok(stats) => stats, + Ok((total_size, available_size)) => (total_size, available_size), }; - let disk_cap = disk_stats.total_space(); let capacity = if self.cfg.value().capacity.0 == 0 { disk_cap } else { @@ -473,16 +468,20 @@ where true }); let snap_size = self.snap_mgr.total_snap_size().unwrap(); - let used_size = snap_size - + kv_size - + self - .raft_engine - .get_engine_size() - .expect("raft engine used size"); + let raft_size = self + .raft_engine + .get_engine_size() + .expect("engine used size"); + + STORE_SIZE_EVENT_INT_VEC.kv_size.set(kv_size as i64); + STORE_SIZE_EVENT_INT_VEC.raft_size.set(raft_size as i64); + STORE_SIZE_EVENT_INT_VEC.snap_size.set(snap_size as i64); + + let used_size = snap_size + kv_size + raft_size; let mut available = capacity.checked_sub(used_size).unwrap_or_default(); // We only care about rocksdb SST file size, so we should check disk available // here. - available = cmp::min(available, disk_stats.available_space()); + available = cmp::min(available, disk_avail); Some((capacity, used_size, available)) } } diff --git a/components/raftstore-v2/src/worker/tablet.rs b/components/raftstore-v2/src/worker/tablet.rs index 183bb33cd34..206e87b3a8e 100644 --- a/components/raftstore-v2/src/worker/tablet.rs +++ b/components/raftstore-v2/src/worker/tablet.rs @@ -298,6 +298,8 @@ impl Runner { .spawn(async move { let range1 = Range::new(&[], &start_key); let range2 = Range::new(&end_key, keys::DATA_MAX_KEY); + // Note: Refer to https://github.com/facebook/rocksdb/pull/11468. There could be + // some files missing from compaction if dynamic_level_bytes is off. for r in [range1, range2] { // When compaction filter is present, trivial move is disallowed.
if let Err(e) = @@ -323,6 +325,16 @@ impl Runner { return; } } + if let Err(e) = tablet.check_in_range(Some(&start_key), Some(&end_key)) { + debug_assert!(false, "check_in_range failed {:?}, is titan enabled?", e); + error!( + logger, + "trim did not remove all dirty data"; + "path" => tablet.path(), + "err" => %e, + ); + return; + } // drop before callback. drop(tablet); fail_point!("tablet_trimmed_finished"); @@ -581,6 +593,13 @@ impl Runner { } } +#[cfg(test)] +impl Runner { + pub fn get_running_task_count(&self) -> usize { + self.low_pri_pool.get_running_task_count() + } +} + impl Runnable for Runner where EK: KvEngine, @@ -801,6 +820,14 @@ mod tests { runner.run(Task::destroy(r_1, 100)); assert!(path.exists()); registry.remove(r_1); + // waiting for async `pause_background_work` to be finished, + // this task can block tablet's destroy. + for _i in 0..100 { + if runner.get_running_task_count() == 0 { + break; + } + std::thread::sleep(Duration::from_millis(5)); + } runner.on_timeout(); assert!(!path.exists()); assert!(runner.pending_destroy_tasks.is_empty()); diff --git a/components/raftstore-v2/tests/failpoints/test_merge.rs b/components/raftstore-v2/tests/failpoints/test_merge.rs index 890b8c5e27a..11fe666b49b 100644 --- a/components/raftstore-v2/tests/failpoints/test_merge.rs +++ b/components/raftstore-v2/tests/failpoints/test_merge.rs @@ -7,7 +7,7 @@ use std::{ use engine_traits::Peekable; use raftstore_v2::router::{PeerMsg, PeerTick}; -use tikv_util::store::new_peer; +use tikv_util::{config::ReadableDuration, info, store::new_peer}; use crate::cluster::{ life_helper::assert_peer_not_exist, @@ -179,7 +179,9 @@ fn test_rollback() { // Target is merging. #[test] fn test_merge_conflict_0() { - let mut cluster = Cluster::default(); + let mut cluster = Cluster::with_configs(1, None, None, |cfg| { + cfg.merge_check_tick_interval = ReadableDuration::millis(100); + }); let store_id = cluster.node(0).id(); let router = &mut cluster.routers[0]; @@ -216,6 +218,7 @@ fn test_merge_conflict_0() { format!("k{}", region_3_id).as_bytes(), false, ); + info!("regions: {:?}, {:?}, {:?}", region_1, region_2, region_3); // pause merge progress of 2+3. let fp = fail::FailGuard::new("apply_commit_merge", "pause"); @@ -236,9 +239,9 @@ fn test_merge_conflict_0() { .unwrap(); let region_2 = cluster.routers[0].region_detail(region_2.get_id()); merge_region(&cluster, 0, region_1, peer_1, region_2, false); + drop(fp); // wait for rollback. rx.recv_timeout(std::time::Duration::from_secs(1)).unwrap(); - drop(fp); fail::remove("apply_rollback_merge"); // Check region 1 is not merged and can serve writes. 
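The `collect_engine_size` rewrite in `components/raftstore-v2/src/worker/pd/store.rs` above swaps `fs2::statvfs` for `get_disk_space_stats` and exports the kv/raft/snap components as separate gauges, but the derivation of the reported triple is unchanged. A minimal sketch of that arithmetic, under stated assumptions: every parameter is a stand-in (the real code reads them from the config, the tablet registry, the snapshot manager, and the raft engine, and may additionally clamp the configured capacity to the disk capacity):

```rust
/// Sketch of the (capacity, used, available) computation in
/// `collect_engine_size`. `cfg_capacity` models the user-configured
/// capacity (0 = use the whole disk); `disk_cap`/`disk_avail` model
/// what `get_disk_space_stats` returns.
fn engine_size(
    cfg_capacity: u64,
    disk_cap: u64,
    disk_avail: u64,
    snap_size: u64,
    kv_size: u64,
    raft_size: u64,
) -> (u64, u64, u64) {
    // A non-zero configured capacity overrides the physical disk capacity.
    let capacity = if cfg_capacity == 0 { disk_cap } else { cfg_capacity };
    // Used space is everything TiKV itself accounts for.
    let used = snap_size + kv_size + raft_size;
    // `capacity - used` can over-report when other processes share the
    // disk, so clamp it by the free space the filesystem actually reports.
    let available = capacity.saturating_sub(used).min(disk_avail);
    (capacity, used, available)
}

fn main() {
    // 100 GiB disk, 40 GiB free, 55 GiB accounted to TiKV: "available" is
    // bounded by the real free space, not by capacity - used (45 GiB).
    let gib: u64 = 1 << 30;
    assert_eq!(
        engine_size(0, 100 * gib, 40 * gib, gib, 50 * gib, 4 * gib),
        (100 * gib, 55 * gib, 40 * gib)
    );
}
```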
diff --git a/components/raftstore/Cargo.toml b/components/raftstore/Cargo.toml index 27380a52882..d086b040f66 100644 --- a/components/raftstore/Cargo.toml +++ b/components/raftstore/Cargo.toml @@ -82,6 +82,8 @@ slog = { workspace = true } slog-global = { workspace = true } smallvec = "1.4" sst_importer = { workspace = true } +strum = { version = "0.20", features = ["derive"] } +strum_macros = "0.24" tempfile = "3.0" thiserror = "1.0" tidb_query_datatype = { workspace = true } diff --git a/components/raftstore/src/coprocessor/OWNERS b/components/raftstore/src/coprocessor/OWNERS new file mode 100644 index 00000000000..70df3de925d --- /dev/null +++ b/components/raftstore/src/coprocessor/OWNERS @@ -0,0 +1,7 @@ +# See the OWNERS docs at https://go.k8s.io/owners +options: + no_parent_owners: true +filters: + "(OWNERS|config\\.rs)$": + approvers: + - sig-critical-approvers-config-components diff --git a/components/raftstore/src/coprocessor/config.rs b/components/raftstore/src/coprocessor/config.rs index e1246e8d59d..b1dc3830bbb 100644 --- a/components/raftstore/src/coprocessor/config.rs +++ b/components/raftstore/src/coprocessor/config.rs @@ -168,7 +168,7 @@ impl Config { Ok(()) } - pub fn validate(&mut self) -> Result<()> { + pub fn validate(&mut self, raft_kv_v2: bool) -> Result<()> { if self.region_split_keys.is_none() { self.region_split_keys = Some((self.region_split_size().as_mb_f64() * 10000.0) as u64); } @@ -199,8 +199,9 @@ impl Config { None => self.region_max_keys = Some(self.region_split_keys() / 2 * 3), } let res = self.validate_bucket_size(); - // If it's OK to enable bucket, we will prefer to enable it if useful. - if let Ok(()) = res && self.enable_region_bucket.is_none() { + // If it's OK to enable bucket, we will prefer to enable it if useful for + // raftstore-v2. 
+ if let Ok(()) = res && self.enable_region_bucket.is_none() && raft_kv_v2 { let useful = self.region_split_size() >= self.region_bucket_size * 2; self.enable_region_bucket = Some(useful); } else if let Err(e) = res && self.enable_region_bucket() { @@ -237,39 +238,39 @@ mod tests { #[test] fn test_config_validate() { let mut cfg = Config::default(); - cfg.validate().unwrap(); + cfg.validate(false).unwrap(); cfg = Config::default(); cfg.region_max_size = Some(ReadableSize(10)); cfg.region_split_size = Some(ReadableSize(20)); - cfg.validate().unwrap_err(); + cfg.validate(false).unwrap_err(); cfg = Config::default(); cfg.region_max_size = None; cfg.region_split_size = Some(ReadableSize(20)); - cfg.validate().unwrap(); + cfg.validate(false).unwrap(); assert_eq!(cfg.region_max_size, Some(ReadableSize(30))); cfg = Config::default(); cfg.region_max_keys = Some(10); cfg.region_split_keys = Some(20); - cfg.validate().unwrap_err(); + cfg.validate(false).unwrap_err(); cfg = Config::default(); cfg.region_max_keys = None; cfg.region_split_keys = Some(20); - cfg.validate().unwrap(); + cfg.validate(false).unwrap(); assert_eq!(cfg.region_max_keys, Some(30)); cfg = Config::default(); cfg.enable_region_bucket = Some(false); cfg.region_split_size = Some(ReadableSize(20)); cfg.region_bucket_size = ReadableSize(30); - cfg.validate().unwrap(); + cfg.validate(false).unwrap(); cfg = Config::default(); cfg.region_split_size = Some(ReadableSize::mb(20)); - cfg.validate().unwrap(); + cfg.validate(false).unwrap(); assert_eq!(cfg.region_split_keys, Some(200000)); } } diff --git a/components/raftstore/src/coprocessor/dispatcher.rs b/components/raftstore/src/coprocessor/dispatcher.rs index d082013cd2c..5b06d92d8e0 100644 --- a/components/raftstore/src/coprocessor/dispatcher.rs +++ b/components/raftstore/src/coprocessor/dispatcher.rs @@ -19,8 +19,8 @@ use crate::store::BucketRange; /// A handle for coprocessor to schedule some command back to raftstore. 
pub trait StoreHandle: Clone + Send { - fn update_approximate_size(&self, region_id: u64, size: u64); - fn update_approximate_keys(&self, region_id: u64, keys: u64); + fn update_approximate_size(&self, region_id: u64, size: Option, splitable: Option); + fn update_approximate_keys(&self, region_id: u64, keys: Option, splitable: Option); fn ask_split( &self, region_id: u64, @@ -48,11 +48,13 @@ pub trait StoreHandle: Clone + Send { pub enum SchedTask { UpdateApproximateSize { region_id: u64, - size: u64, + splitable: Option, + size: Option, }, UpdateApproximateKeys { region_id: u64, - keys: u64, + splitable: Option, + keys: Option, }, AskSplit { region_id: u64, @@ -75,12 +77,20 @@ pub enum SchedTask { } impl StoreHandle for std::sync::mpsc::SyncSender { - fn update_approximate_size(&self, region_id: u64, size: u64) { - let _ = self.try_send(SchedTask::UpdateApproximateSize { region_id, size }); + fn update_approximate_size(&self, region_id: u64, size: Option, splitable: Option) { + let _ = self.try_send(SchedTask::UpdateApproximateSize { + region_id, + splitable, + size, + }); } - fn update_approximate_keys(&self, region_id: u64, keys: u64) { - let _ = self.try_send(SchedTask::UpdateApproximateKeys { region_id, keys }); + fn update_approximate_keys(&self, region_id: u64, keys: Option, splitable: Option) { + let _ = self.try_send(SchedTask::UpdateApproximateKeys { + region_id, + splitable, + keys, + }); } fn ask_split( @@ -662,6 +672,10 @@ impl CoprocessorHost { ); } + pub fn pre_transfer_leader(&self, r: &Region, tr: &TransferLeaderRequest) -> Result<()> { + try_loop_ob!(r, &self.registry.admin_observers, pre_transfer_leader, tr) + } + pub fn post_apply_snapshot( &self, region: &Region, diff --git a/components/raftstore/src/coprocessor/error.rs b/components/raftstore/src/coprocessor/error.rs index 233c7c4197a..d979cac98dd 100644 --- a/components/raftstore/src/coprocessor/error.rs +++ b/components/raftstore/src/coprocessor/error.rs @@ -1,12 +1,14 @@ // Copyright 2016 TiKV Project Authors. Licensed under Apache-2.0. 
-use std::{error::Error as StdError, result::Result as StdResult}; +use std::{error::Error as StdError, result::Result as StdResult, time::Duration}; use error_code::{self, ErrorCode, ErrorCodeExt}; use thiserror::Error; #[derive(Debug, Error)] pub enum Error { + #[error("required retry after {after:?}, hint: {reason:?}")] + RequireDelay { after: Duration, reason: String }, #[error("{0}")] Other(#[from] Box), } diff --git a/components/raftstore/src/coprocessor/mod.rs b/components/raftstore/src/coprocessor/mod.rs index 0592e23200b..2e05d01f905 100644 --- a/components/raftstore/src/coprocessor/mod.rs +++ b/components/raftstore/src/coprocessor/mod.rs @@ -13,7 +13,10 @@ use engine_traits::{CfName, SstMetaInfo}; use kvproto::{ metapb::Region, pdpb::CheckPolicy, - raft_cmdpb::{AdminRequest, AdminResponse, RaftCmdRequest, RaftCmdResponse, Request}, + raft_cmdpb::{ + AdminRequest, AdminResponse, RaftCmdRequest, RaftCmdResponse, Request, + TransferLeaderRequest, + }, raft_serverpb::RaftApplyState, }; use raft::{eraftpb, StateRole}; @@ -130,6 +133,14 @@ pub trait AdminObserver: Coprocessor { ) -> bool { false } + + fn pre_transfer_leader( + &self, + _ctx: &mut ObserverContext<'_>, + _tr: &TransferLeaderRequest, + ) -> Result<()> { + Ok(()) + } } pub trait QueryObserver: Coprocessor { diff --git a/components/raftstore/src/coprocessor/split_check/keys.rs b/components/raftstore/src/coprocessor/split_check/keys.rs index 2c0e71dd8cb..d6a49175441 100644 --- a/components/raftstore/src/coprocessor/split_check/keys.rs +++ b/components/raftstore/src/coprocessor/split_check/keys.rs @@ -157,9 +157,11 @@ impl SplitCheckObserver for KeysCheckObserver } }; - self.router.update_approximate_keys(region_id, region_keys); + self.router + .update_approximate_keys(region_id, Some(region_keys), None); REGION_KEYS_HISTOGRAM.observe(region_keys as f64); + // if bucket checker using scan is added, to utilize the scan, // add keys checker as well for free // It has the assumption that the size's checker is before the keys's check in @@ -299,12 +301,28 @@ mod tests { None, )); // keys has not reached the max_keys 100 yet. - match rx.try_recv() { - Ok(SchedTask::UpdateApproximateSize { region_id, .. }) - | Ok(SchedTask::UpdateApproximateKeys { region_id, .. }) => { - assert_eq!(region_id, region.get_id()); + let mut recv_cnt = 0; + loop { + match rx.try_recv() { + Ok(SchedTask::UpdateApproximateSize { + region_id, + splitable, + .. + }) + | Ok(SchedTask::UpdateApproximateKeys { + region_id, + splitable, + .. + }) => { + assert_eq!(region_id, region.get_id()); + assert!(splitable.is_none()); + recv_cnt += 1; + if recv_cnt == 2 { + break; + } + } + others => panic!("expect recv empty, but got {:?}", others), } - others => panic!("expect recv empty, but got {:?}", others), } put_data(&engine, 90, 160, true); @@ -403,12 +421,28 @@ mod tests { None, )); // keys has not reached the max_keys 100 yet. - match rx.try_recv() { - Ok(SchedTask::UpdateApproximateSize { region_id, .. }) - | Ok(SchedTask::UpdateApproximateKeys { region_id, .. }) => { - assert_eq!(region_id, region.get_id()); + let mut recv_cnt = 0; + loop { + match rx.try_recv() { + Ok(SchedTask::UpdateApproximateSize { + region_id, + splitable, + .. + }) + | Ok(SchedTask::UpdateApproximateKeys { + region_id, + splitable, + .. 
+ }) => { + assert_eq!(region_id, region.get_id()); + assert!(splitable.is_none()); + recv_cnt += 1; + if recv_cnt == 2 { + break; + } + } + others => panic!("expect recv empty, but got {:?}", others), } - others => panic!("expect recv empty, but got {:?}", others), } put_data(&engine, 90, 160, true); diff --git a/components/raftstore/src/coprocessor/split_check/size.rs b/components/raftstore/src/coprocessor/split_check/size.rs index 4b320bef1b6..e5048a83826 100644 --- a/components/raftstore/src/coprocessor/split_check/size.rs +++ b/components/raftstore/src/coprocessor/split_check/size.rs @@ -158,13 +158,14 @@ impl SplitCheckObserver for SizeCheckObserver }; // send it to raftstore to update region approximate size - self.router.update_approximate_size(region_id, region_size); + self.router + .update_approximate_size(region_id, Some(region_size), None); + let need_split_region = region_size >= host.cfg.region_max_size().0; let need_bucket_checker = host.cfg.enable_region_bucket() && region_size >= 2 * host.cfg.region_bucket_size.0; REGION_SIZE_HISTOGRAM.observe(region_size as f64); - let need_split_region = region_size >= host.cfg.region_max_size().0; if need_split_region || need_bucket_checker { // when it's a large region use approximate way to produce split keys if need_split_region { @@ -265,11 +266,23 @@ pub mod tests { exp_split_keys: Vec>, ignore_split_keys: bool, ) { + let mut split = false; loop { match rx.try_recv() { - Ok(SchedTask::UpdateApproximateKeys { region_id, .. }) - | Ok(SchedTask::UpdateApproximateSize { region_id, .. }) - | Ok(SchedTask::RefreshRegionBuckets { region_id, .. }) => { + Ok(SchedTask::UpdateApproximateKeys { + region_id, + splitable, + .. + }) + | Ok(SchedTask::UpdateApproximateSize { + region_id, + splitable, + .. + }) => { + assert_eq!(region_id, exp_region.get_id()); + split = split || splitable.unwrap_or(false); + } + Ok(SchedTask::RefreshRegionBuckets { region_id, .. }) => { assert_eq!(region_id, exp_region.get_id()); } Ok(SchedTask::AskSplit { @@ -283,6 +296,7 @@ pub mod tests { if !ignore_split_keys { assert_eq!(split_keys, exp_split_keys); } + assert!(split); break; } others => panic!("expect split check result, but got {:?}", others), @@ -303,11 +317,23 @@ pub mod tests { exp_region: &Region, exp_split_keys_count: usize, ) { + let mut split = false; loop { match rx.try_recv() { - Ok(SchedTask::UpdateApproximateSize { region_id, .. }) - | Ok(SchedTask::UpdateApproximateKeys { region_id, .. }) - | Ok(SchedTask::RefreshRegionBuckets { region_id, .. }) => { + Ok(SchedTask::UpdateApproximateSize { + region_id, + splitable, + .. + }) + | Ok(SchedTask::UpdateApproximateKeys { + region_id, + splitable, + .. + }) => { + assert_eq!(region_id, exp_region.get_id()); + split = split || splitable.unwrap_or(false); + } + Ok(SchedTask::RefreshRegionBuckets { region_id, .. 
}) => { assert_eq!(region_id, exp_region.get_id()); } Ok(SchedTask::AskSplit { @@ -319,6 +345,7 @@ pub mod tests { assert_eq!(region_id, exp_region.get_id()); assert_eq!(®ion_epoch, exp_region.get_region_epoch()); assert_eq!(split_keys.len(), exp_split_keys_count); + assert!(split); break; } others => panic!("expect split check result, but got {:?}", others), diff --git a/components/raftstore/src/errors.rs b/components/raftstore/src/errors.rs index d1597a77121..6d512e5bf80 100644 --- a/components/raftstore/src/errors.rs +++ b/components/raftstore/src/errors.rs @@ -7,7 +7,10 @@ use error_code::{self, ErrorCode, ErrorCodeExt}; use kvproto::{errorpb, metapb, raft_serverpb}; use protobuf::ProtobufError; use thiserror::Error; -use tikv_util::{codec, deadline::DeadlineError}; +use tikv_util::{ + codec, + deadline::{set_deadline_exceeded_busy_error, DeadlineError}, +}; use super::{coprocessor::Error as CopError, store::SnapError}; @@ -287,6 +290,18 @@ impl From for errorpb::Error { e.set_store_peer_id(store_peer_id); errorpb.set_mismatch_peer_id(e); } + Error::Coprocessor(CopError::RequireDelay { + after, + reason: hint, + }) => { + let mut e = errorpb::ServerIsBusy::new(); + e.set_backoff_ms(after.as_millis() as _); + e.set_reason(hint); + errorpb.set_server_is_busy(e); + } + Error::DeadlineExceeded => { + set_deadline_exceeded_busy_error(&mut errorpb); + } _ => {} }; @@ -350,3 +365,20 @@ impl ErrorCodeExt for Error { } } } + +#[cfg(test)] +mod tests { + use kvproto::errorpb; + + use crate::Error; + + #[test] + fn test_deadline_exceeded_error() { + let err: errorpb::Error = Error::DeadlineExceeded.into(); + assert_eq!( + err.get_server_is_busy().reason, + "deadline is exceeded".to_string() + ); + assert_eq!(err.get_message(), "Deadline is exceeded"); + } +} diff --git a/components/raftstore/src/router.rs b/components/raftstore/src/router.rs index 3a76a5ad26f..f8094de764e 100644 --- a/components/raftstore/src/router.rs +++ b/components/raftstore/src/router.rs @@ -1,6 +1,9 @@ // Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. -use std::borrow::Cow; +use std::{ + borrow::Cow, + sync::{Arc, Mutex}, +}; // #[PerformanceCriticalPath] use crossbeam::channel::TrySendError; @@ -89,7 +92,10 @@ where /// Report a `StoreResolved` event to all Raft groups. fn report_resolved(&self, store_id: u64, group_id: u64) { self.broadcast_normal(|| { - PeerMsg::SignificantMsg(SignificantMsg::StoreResolved { store_id, group_id }) + PeerMsg::SignificantMsg(Box::new(SignificantMsg::StoreResolved { + store_id, + group_id, + })) }) } } @@ -286,11 +292,11 @@ impl RaftStoreRouter for RaftRouter { // duplicated codes. 
impl crate::coprocessor::StoreHandle for RaftRouter { - fn update_approximate_size(&self, region_id: u64, size: u64) { + fn update_approximate_size(&self, region_id: u64, size: Option, splitable: Option) { if let Err(e) = CasualRouter::send( self, region_id, - CasualMessage::RegionApproximateSize { size }, + CasualMessage::RegionApproximateSize { size, splitable }, ) { warn!( "failed to send approximate region size"; @@ -301,11 +307,11 @@ impl crate::coprocessor::StoreHandle for RaftRoute } } - fn update_approximate_keys(&self, region_id: u64, keys: u64) { + fn update_approximate_keys(&self, region_id: u64, keys: Option, splitable: Option) { if let Err(e) = CasualRouter::send( self, region_id, - CasualMessage::RegionApproximateKeys { keys }, + CasualMessage::RegionApproximateKeys { keys, splitable }, ) { warn!( "failed to send approximate region keys"; @@ -331,6 +337,7 @@ impl crate::coprocessor::StoreHandle for RaftRoute split_keys, callback: Callback::None, source, + share_source_region_size: true, }, ) { warn!( @@ -405,6 +412,33 @@ where ) -> RaftStoreResult<()>; } +impl> CdcHandle for Arc> { + fn capture_change( + &self, + region_id: u64, + region_epoch: metapb::RegionEpoch, + change_observer: ChangeObserver, + callback: Callback<::Snapshot>, + ) -> RaftStoreResult<()> { + Mutex::lock(self).unwrap().capture_change( + region_id, + region_epoch, + change_observer, + callback, + ) + } + + fn check_leadership( + &self, + region_id: u64, + callback: Callback<::Snapshot>, + ) -> RaftStoreResult<()> { + Mutex::lock(self) + .unwrap() + .check_leadership(region_id, callback) + } +} + /// A wrapper of SignificantRouter that is specialized for implementing /// CdcHandle. #[derive(Clone)] diff --git a/components/raftstore/src/store/compaction_guard.rs b/components/raftstore/src/store/compaction_guard.rs index efee09be906..138d730fa29 100644 --- a/components/raftstore/src/store/compaction_guard.rs +++ b/components/raftstore/src/store/compaction_guard.rs @@ -23,10 +23,16 @@ pub struct CompactionGuardGeneratorFactory { cf_name: CfNames, provider: P, min_output_file_size: u64, + max_compaction_size: u64, } impl CompactionGuardGeneratorFactory
<P: RegionInfoProvider>
{ - pub fn new(cf: CfName, provider: P, min_output_file_size: u64) -> Result { + pub fn new( + cf: CfName, + provider: P, + min_output_file_size: u64, + max_compaction_size: u64, + ) -> Result { let cf_name = match cf { CF_DEFAULT => CfNames::default, CF_LOCK => CfNames::lock, @@ -43,6 +49,7 @@ impl CompactionGuardGeneratorFactory
<P: RegionInfoProvider>
{ cf_name, provider, min_output_file_size, + max_compaction_size, }) } } @@ -72,6 +79,15 @@ impl SstPartitionerFactory for CompactionGuardGeneratorFactory use_guard: false, boundaries: vec![], pos: 0, + next_level_pos: 0, + next_level_boundaries: context + .next_level_boundaries + .iter() + .map(|v| v.to_vec()) + .collect(), + next_level_size: context.next_level_sizes.clone(), + current_next_level_size: 0, + max_compaction_size: self.max_compaction_size, }) } } @@ -86,7 +102,20 @@ pub struct CompactionGuardGenerator<P: RegionInfoProvider> { use_guard: bool, // The boundary keys are exclusive. boundaries: Vec<Vec<u8>>, + /// The SST boundaries overlapped with the compaction input at the next + /// level of the output level (call it L+2). When the output level is the + /// bottom-most level (usually L6), this will be empty. The boundaries + /// are the first key of the first SST followed by the end keys of all SSTs. + next_level_boundaries: Vec<Vec<u8>>, + /// The size of each "segment" of L+2. If `next_level_boundaries` (call it + /// NLB) isn't empty, `next_level_size` has length `NLB.len() - 1`, and + /// position `N` stores the size of the range `[NLB[N], NLB[N+1]]` in L+2. + next_level_size: Vec<usize>, pos: usize, + next_level_pos: usize, + current_next_level_size: u64, + max_compaction_size: u64, } impl CompactionGuardGenerator
<P: RegionInfoProvider>
{ @@ -153,27 +182,52 @@ impl SstPartitioner for CompactionGuardGenerator
<P: RegionInfoProvider>
{ if !self.use_guard { return SstPartitionerResult::NotRequired; } - let mut pos = self.pos; - let mut skip_count = 0; - while pos < self.boundaries.len() && self.boundaries[pos].as_slice() <= req.prev_user_key { - pos += 1; - skip_count += 1; - if skip_count >= COMPACTION_GUARD_MAX_POS_SKIP { - let prev_user_key = req.prev_user_key.to_vec(); - pos = match self.boundaries.binary_search(&prev_user_key) { - Ok(search_pos) => search_pos + 1, - Err(search_pos) => search_pos, - }; - break; - } + self.pos = seek_to(&self.boundaries, req.prev_user_key, self.pos); + // Generally this is a no-op, because the cursor has already been moved + // past the previous user key by earlier calls. + let left_next_level_pos = seek_to( + &self.next_level_boundaries, + req.prev_user_key, + self.next_level_pos, + ); + let right_next_level_pos = seek_to( + &self.next_level_boundaries, + req.current_user_key, + left_next_level_pos, + ); + // The cursor has been moved. + if right_next_level_pos > left_next_level_pos { + self.current_next_level_size += self.next_level_size + [left_next_level_pos..right_next_level_pos - 1] + .iter() + .map(|x| *x as u64) + .sum::<u64>(); } - self.pos = pos; - if pos < self.boundaries.len() && self.boundaries[pos].as_slice() <= req.current_user_key { - if req.current_output_file_size >= self.min_output_file_size { + self.next_level_pos = right_next_level_pos; + + if self.pos < self.boundaries.len() + && self.boundaries[self.pos].as_slice() <= req.current_user_key + { + if req.current_output_file_size >= self.min_output_file_size + // Or the accumulated overlap with the next level would make this a huge compaction, even greater than the max compaction size. + || self.current_next_level_size >= self.max_compaction_size + { COMPACTION_GUARD_ACTION_COUNTER .get(self.cf_name) .partition .inc(); + // The current pointer status should look like this (* is the current pos, + // ^ is where the previous user key is): + // boundaries: A B C D + // size: 1 3 2 + // ^ * + // Note that the previous user key is between B and C, which indicates + // that there must still be something between the previous user key and C. + // We still set `current_next_level_size` to zero here, so that segment is + // forgotten. That should be acceptable: generally a segment won't be + // greater than the `max-sst-size`, which is tiny compared to the + // `max-compaction-size`. + self.current_next_level_size = 0; SstPartitionerResult::Required } else { COMPACTION_GUARD_ACTION_COUNTER @@ -193,10 +247,28 @@ impl SstPartitioner for CompactionGuardGenerator
<P: RegionInfoProvider>
{ } } +fn seek_to(all_data: &Vec>, target_key: &[u8], from_pos: usize) -> usize { + let mut pos = from_pos; + let mut skip_count = 0; + while pos < all_data.len() && all_data[pos].as_slice() <= target_key { + pos += 1; + skip_count += 1; + if skip_count >= COMPACTION_GUARD_MAX_POS_SKIP { + pos = match all_data.binary_search_by(|probe| probe.as_slice().cmp(target_key)) { + Ok(search_pos) => search_pos + 1, + Err(search_pos) => search_pos, + }; + break; + } + } + pos +} + #[cfg(test)] mod tests { - use std::str; + use std::{path::Path, str}; + use collections::HashMap; use engine_rocks::{ raw::{BlockBasedOptions, DBCompressionType}, util::new_engine_opt, @@ -212,6 +284,13 @@ mod tests { use super::*; use crate::coprocessor::region_info_accessor::MockRegionInfoProvider; + impl CompactionGuardGenerator { + fn reset_next_level_size_state(&mut self) { + self.current_next_level_size = 0; + self.next_level_pos = 0; + } + } + #[test] fn test_compaction_guard_non_data() { let mut guard = CompactionGuardGenerator { @@ -224,6 +303,11 @@ mod tests { use_guard: false, boundaries: vec![], pos: 0, + current_next_level_size: 0, + next_level_pos: 0, + next_level_boundaries: vec![], + next_level_size: vec![], + max_compaction_size: 1 << 30, }; guard.smallest_key = keys::LOCAL_MIN_KEY.to_vec(); @@ -267,8 +351,16 @@ mod tests { provider: MockRegionInfoProvider::new(vec![]), initialized: true, use_guard: true, - boundaries: vec![b"bbb".to_vec(), b"ccc".to_vec()], + boundaries: vec![b"bbb".to_vec(), b"ccc".to_vec(), b"ddd".to_vec()], pos: 0, + current_next_level_size: 0, + next_level_pos: 0, + next_level_boundaries: (0..10) + .map(|x| format!("bbb{:02}", x).into_bytes()) + .chain((0..100).map(|x| format!("cccz{:03}", x).into_bytes())) + .collect(), + next_level_size: [&[1 << 18; 99][..], &[1 << 28; 10][..]].concat(), + max_compaction_size: 1 << 30, // 1GB }; // Crossing region boundary. let mut req = SstPartitionerRequest { @@ -277,7 +369,11 @@ mod tests { current_output_file_size: 32 << 20, }; assert_eq!(guard.should_partition(&req), SstPartitionerResult::Required); + assert_eq!(guard.next_level_pos, 10); assert_eq!(guard.pos, 0); + assert_eq!(guard.current_next_level_size, 0); + guard.reset_next_level_size_state(); + // Output file size too small. req = SstPartitionerRequest { prev_user_key: b"bba", @@ -289,6 +385,10 @@ mod tests { SstPartitionerResult::NotRequired ); assert_eq!(guard.pos, 0); + assert_eq!(guard.next_level_pos, 10); + assert_eq!(guard.current_next_level_size, 9 << 18); + guard.reset_next_level_size_state(); + // Not crossing boundary. 
req = SstPartitionerRequest { prev_user_key: b"aaa", @@ -300,6 +400,9 @@ mod tests { SstPartitionerResult::NotRequired ); assert_eq!(guard.pos, 0); + assert_eq!(guard.next_level_pos, 0); + guard.reset_next_level_size_state(); + // Move position req = SstPartitionerRequest { prev_user_key: b"cca", @@ -308,6 +411,30 @@ mod tests { }; assert_eq!(guard.should_partition(&req), SstPartitionerResult::Required); assert_eq!(guard.pos, 1); + assert_eq!(guard.next_level_pos, 110); + guard.reset_next_level_size_state(); + + // Move next level posistion + req = SstPartitionerRequest { + prev_user_key: b"cccz000", + current_user_key: b"cccz042", + current_output_file_size: 1 << 20, + }; + assert_eq!( + guard.should_partition(&req), + SstPartitionerResult::NotRequired + ); + assert_eq!(guard.pos, 2); + assert_eq!(guard.next_level_pos, 53); + + req = SstPartitionerRequest { + prev_user_key: b"cccz090", + current_user_key: b"dde", + current_output_file_size: 1 << 20, + }; + assert_eq!(guard.should_partition(&req), SstPartitionerResult::Required); + assert_eq!(guard.pos, 2); + assert_eq!(guard.next_level_pos, 110); } #[test] @@ -339,6 +466,11 @@ mod tests { b"aaa15".to_vec(), ], pos: 0, + current_next_level_size: 0, + next_level_pos: 0, + next_level_boundaries: vec![], + next_level_size: vec![], + max_compaction_size: 1 << 30, }; // Binary search meet exact match. guard.pos = 0; @@ -365,15 +497,23 @@ mod tests { const MIN_OUTPUT_FILE_SIZE: u64 = 1024; const MAX_OUTPUT_FILE_SIZE: u64 = 4096; + const MAX_COMPACTION_SIZE: u64 = 10240; fn new_test_db(provider: MockRegionInfoProvider) -> (RocksEngine, TempDir) { let temp_dir = TempDir::new().unwrap(); let mut cf_opts = RocksCfOptions::default(); + cf_opts.set_max_bytes_for_level_base(MAX_OUTPUT_FILE_SIZE); + cf_opts.set_max_bytes_for_level_multiplier(5); cf_opts.set_target_file_size_base(MAX_OUTPUT_FILE_SIZE); cf_opts.set_sst_partitioner_factory(RocksSstPartitionerFactory( - CompactionGuardGeneratorFactory::new(CF_DEFAULT, provider, MIN_OUTPUT_FILE_SIZE) - .unwrap(), + CompactionGuardGeneratorFactory::new( + CF_DEFAULT, + provider, + MIN_OUTPUT_FILE_SIZE, + MAX_COMPACTION_SIZE, + ) + .unwrap(), )); cf_opts.set_disable_auto_compactions(true); cf_opts.compression_per_level(&[ @@ -412,6 +552,16 @@ mod tests { ret } + fn get_sst_files(dir: &Path) -> Vec { + let files = dir.read_dir().unwrap(); + let mut sst_files = files + .map(|entry| entry.unwrap().path().to_str().unwrap().to_owned()) + .filter(|entry| entry.ends_with(".sst")) + .collect::>(); + sst_files.sort(); + sst_files + } + #[test] fn test_compaction_guard_with_rocks() { let provider = MockRegionInfoProvider::new(vec![ @@ -463,11 +613,7 @@ mod tests { ) .unwrap(); - let files = dir.path().read_dir().unwrap(); - let mut sst_files = files - .map(|entry| entry.unwrap().path().to_str().unwrap().to_owned()) - .filter(|entry| entry.ends_with(".sst")) - .collect::>(); + let mut sst_files = get_sst_files(dir.path()); sst_files.sort(); assert_eq!(3, sst_files.len()); assert_eq!(collect_keys(&sst_files[0]), [b"za1", b"zb1", b"zb2"]); @@ -477,4 +623,120 @@ mod tests { ); assert_eq!(collect_keys(&sst_files[2]), [b"zc6"]); } + + fn simple_regions() -> MockRegionInfoProvider { + MockRegionInfoProvider::new(vec![ + Region { + id: 1, + start_key: b"a".to_vec(), + end_key: b"b".to_vec(), + ..Default::default() + }, + Region { + id: 2, + start_key: b"b".to_vec(), + end_key: b"c".to_vec(), + ..Default::default() + }, + Region { + id: 3, + start_key: b"c".to_vec(), + end_key: b"d".to_vec(), + ..Default::default() + }, + ]) + } 
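To make the cursor logic above concrete: `seek_to` walks forward linearly and falls back to a binary search after too many skips, and `should_partition` sums the fully-passed L+2 segments between two cursor positions to estimate how much next-level data the current output file overlaps. A self-contained sketch under assumptions (`MAX_POS_SKIP` is a stand-in for `COMPACTION_GUARD_MAX_POS_SKIP`; the boundary/size layout follows the `next_level_boundaries`/`next_level_size` doc comments above):

```rust
fn seek_to(boundaries: &[Vec<u8>], target_key: &[u8], from_pos: usize) -> usize {
    const MAX_POS_SKIP: usize = 16; // stand-in for COMPACTION_GUARD_MAX_POS_SKIP
    let mut pos = from_pos;
    let mut skipped = 0;
    // A boundary <= target counts as passed, matching the `<=` in the diff.
    while pos < boundaries.len() && boundaries[pos].as_slice() <= target_key {
        pos += 1;
        skipped += 1;
        if skipped >= MAX_POS_SKIP {
            // Too far to walk linearly: jump with a binary search instead.
            return match boundaries.binary_search_by(|b| b.as_slice().cmp(target_key)) {
                Ok(p) => p + 1,
                Err(p) => p,
            };
        }
    }
    pos
}

fn main() {
    // Four boundaries (NLB) delimit three L+2 "segments" of known size.
    let nlb: Vec<Vec<u8>> = vec![b"a".to_vec(), b"c".to_vec(), b"e".to_vec(), b"g".to_vec()];
    let sizes: [u64; 3] = [10, 30, 20]; // sizes of [a,c), [c,e), [e,g)
    let left = seek_to(&nlb, b"b", 0); // prev_user_key = "b"     -> cursor 1
    let right = seek_to(&nlb, b"f", left); // current_user_key = "f" -> cursor 3
    // Only segments fully passed between the two cursors are charged: [c,e).
    let overlap: u64 = sizes[left..right - 1].iter().sum();
    assert_eq!((left, right, overlap), (1, 3, 30));
}
```

The partially covered last segment is intentionally not charged, which is the trade-off the in-code comment in `should_partition` acknowledges when it resets `current_next_level_size` to zero.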
+ + #[test] + fn test_next_level_compaction() { + let provider = simple_regions(); + let (db, _dir) = new_test_db(provider); + assert_eq!(b"z", DATA_PREFIX_KEY); + let tiny_value = [b'v'; 1]; + let value = vec![b'v'; 1024 * 10]; + ['a', 'b', 'c'] + .into_iter() + .flat_map(|x| (1..10).map(move |n| format!("z{x}{n}").into_bytes())) + .for_each(|key| db.put(&key, &value).unwrap()); + db.flush_cfs(&[], true).unwrap(); + db.compact_files_in_range(None, None, Some(2)).unwrap(); + db.put(b"za0", &tiny_value).unwrap(); + db.put(b"zd0", &tiny_value).unwrap(); + db.flush_cfs(&[], true).unwrap(); + db.compact_files_in_range(None, None, Some(1)).unwrap(); + + let level_1 = &level_files(&db)[&1]; + assert_eq!(level_1.len(), 2, "{:?}", level_1); + assert_eq!(level_1[0].smallestkey, b"za0", "{:?}", level_1); + assert_eq!(level_1[0].largestkey, b"za0", "{:?}", level_1); + assert_eq!(level_1[1].smallestkey, b"zd0", "{:?}", level_1); + assert_eq!(level_1[1].largestkey, b"zd0", "{:?}", level_1); + } + + #[test] + fn test_next_level_compaction_no_split() { + let provider = simple_regions(); + let (db, _dir) = new_test_db(provider); + assert_eq!(b"z", DATA_PREFIX_KEY); + let tiny_value = [b'v'; 1]; + let value = vec![b'v'; 1024 * 10]; + ['a', 'b', 'c'] + .into_iter() + .flat_map(|x| (1..10).map(move |n| format!("z{x}{n}").into_bytes())) + .for_each(|key| db.put(&key, &value).unwrap()); + db.flush_cfs(&[], true).unwrap(); + db.compact_files_in_range(None, None, Some(2)).unwrap(); + // So... the next-level size will be almost 1024 * 9, which doesn't exceed the + // compaction size limit. + db.put(b"za0", &tiny_value).unwrap(); + db.put(b"za9", &tiny_value).unwrap(); + db.flush_cfs(&[], true).unwrap(); + db.compact_files_in_range(None, None, Some(1)).unwrap(); + + let level_1 = &level_files(&db)[&1]; + assert_eq!(level_1.len(), 1, "{:?}", level_1); + assert_eq!(level_1[0].smallestkey, b"za0", "{:?}", level_1); + assert_eq!(level_1[0].largestkey, b"za9", "{:?}", level_1); + db.compact_range(None, None, false, 1).unwrap(); + + // So... the next-level size will be almost 1024 * 15, which should reach the + // limit. 
+ db.put(b"za30", &tiny_value).unwrap(); + db.put(b"zb90", &tiny_value).unwrap(); + db.flush_cfs(&[], true).unwrap(); + db.compact_files_in_range(None, None, Some(1)).unwrap(); + + let level_1 = &level_files(&db)[&1]; + assert_eq!(level_1.len(), 2, "{:?}", level_1); + assert_eq!(level_1[0].smallestkey, b"za30", "{:?}", level_1); + assert_eq!(level_1[1].largestkey, b"zb90", "{:?}", level_1); + } + + #[derive(Debug)] + #[allow(dead_code)] + struct OwnedSstFileMetadata { + name: String, + size: usize, + smallestkey: Vec, + largestkey: Vec, + } + + #[allow(unused)] + fn level_files(db: &RocksEngine) -> HashMap> { + let db = db.as_inner(); + let cf = db.cf_handle("default").unwrap(); + let md = db.get_column_family_meta_data(cf); + let mut res: HashMap> = HashMap::default(); + for (i, level) in md.get_levels().into_iter().enumerate() { + for file in level.get_files() { + res.entry(i).or_default().push(OwnedSstFileMetadata { + name: file.get_name(), + size: file.get_size(), + smallestkey: file.get_smallestkey().to_owned(), + largestkey: file.get_largestkey().to_owned(), + }); + } + } + res + } } diff --git a/components/raftstore/src/store/config.rs b/components/raftstore/src/store/config.rs index 817be7eb969..febad57331d 100644 --- a/components/raftstore/src/store/config.rs +++ b/components/raftstore/src/store/config.rs @@ -104,12 +104,11 @@ pub struct Config { pub max_manual_flush_rate: f64, // When a peer is not responding for this time, leader will not keep entry cache for it. pub raft_entry_cache_life_time: ReadableDuration, - // Deprecated! The configuration has no effect. - // They are preserved for compatibility check. // When a peer is newly added, reject transferring leader to the peer for a while. #[doc(hidden)] #[serde(skip_serializing)] - #[online_config(skip)] + #[online_config(hidden)] + #[deprecated = "The configuration has been removed. It has no effect"] pub raft_reject_transfer_leader_duration: ReadableDuration, /// Whether to disable checking quorum for the raft group. This will make @@ -140,7 +139,7 @@ pub struct Config { pub region_compact_min_redundant_rows: u64, /// Minimum percentage of redundant rows to trigger manual compaction. /// Should between 1 and 100. - pub region_compact_redundant_rows_percent: u64, + pub region_compact_redundant_rows_percent: Option, pub pd_heartbeat_tick_interval: ReadableDuration, pub pd_store_heartbeat_tick_interval: ReadableDuration, pub snap_mgr_gc_tick_interval: ReadableDuration, @@ -169,6 +168,9 @@ pub struct Config { /// and try to alert monitoring systems, if there is any. pub abnormal_leader_missing_duration: ReadableDuration, pub peer_stale_state_check_interval: ReadableDuration, + /// Interval to check GC peers. + #[doc(hidden)] + pub gc_peer_check_interval: ReadableDuration, #[online_config(hidden)] pub leader_transfer_max_log_lag: u64, @@ -317,34 +319,47 @@ pub struct Config { pub io_reschedule_concurrent_max_count: usize, pub io_reschedule_hotpot_duration: ReadableDuration, - // Deprecated! Batch is done in raft client. #[doc(hidden)] #[serde(skip_serializing)] - #[online_config(skip)] + #[online_config(hidden)] + #[deprecated = "The configuration has been removed. Batch is done in raft client."] pub raft_msg_flush_interval: ReadableDuration, - // Deprecated! These configuration has been moved to Coprocessor. - // They are preserved for compatibility check. 
#[doc(hidden)] #[serde(skip_serializing)] - #[online_config(skip)] + #[online_config(hidden)] + #[deprecated = "The configuration has been moved to coprocessor.region_max_size."] pub region_max_size: ReadableSize, #[doc(hidden)] #[serde(skip_serializing)] - #[online_config(skip)] + #[online_config(hidden)] + #[deprecated = "The configuration has been moved to coprocessor.region_split_size."] pub region_split_size: ReadableSize, - // Deprecated! The time to clean stale peer safely can be decided based on RocksDB snapshot - // sequence number. #[doc(hidden)] #[serde(skip_serializing)] - #[online_config(skip)] + #[online_config(hidden)] + #[deprecated = "The configuration has been removed. The time to clean stale peer safely can be decided based on RocksDB snapshot sequence number."] pub clean_stale_peer_delay: ReadableDuration, - // Interval to inspect the latency of raftstore for slow store detection. + #[online_config(hidden)] + // Interval to inspect the latency of flushing raft logs for slow store detection. pub inspect_interval: ReadableDuration, - + // Interval to inspect the latency of flushes on kvdb for slow store detection. + // If the kvdb uses the same mount path with raftdb, the default value will be + // optimized to `0` to avoid duplicated inspection. + #[doc(hidden)] + #[online_config(hidden)] + pub inspect_kvdb_interval: ReadableDuration, + /// Threshold of CPU utilization to inspect for slow store detection. + #[doc(hidden)] + #[online_config(hidden)] + pub inspect_cpu_util_thd: f64, + #[doc(hidden)] + #[online_config(hidden)] // The unsensitive(increase it to reduce sensitiveness) of the cause-trend detection pub slow_trend_unsensitive_cause: f64, + #[doc(hidden)] + #[online_config(hidden)] // The unsensitive(increase it to reduce sensitiveness) of the result-trend detection pub slow_trend_unsensitive_result: f64, @@ -394,9 +409,17 @@ pub struct Config { #[online_config(hidden)] #[serde(alias = "enable-partitioned-raft-kv-compatible-learner")] pub enable_v2_compatible_learner: bool, + + /// The minimal count of region pending on applying raft logs. + /// Only when the count of regions which not pending on applying logs is + /// less than the threshold, can the raftstore supply service. + #[doc(hidden)] + #[online_config(hidden)] + pub min_pending_apply_region_count: u64, } impl Default for Config { + #[allow(deprecated)] fn default() -> Config { Config { prevote: true, @@ -429,7 +452,7 @@ impl Default for Config { region_compact_min_tombstones: 10000, region_compact_tombstones_percent: 30, region_compact_min_redundant_rows: 50000, - region_compact_redundant_rows_percent: 20, + region_compact_redundant_rows_percent: Some(20), pd_heartbeat_tick_interval: ReadableDuration::minutes(1), pd_store_heartbeat_tick_interval: ReadableDuration::secs(10), notify_capacity: 40960, @@ -500,7 +523,13 @@ impl Default for Config { region_max_size: ReadableSize(0), region_split_size: ReadableSize(0), clean_stale_peer_delay: ReadableDuration::minutes(0), - inspect_interval: ReadableDuration::millis(500), + inspect_interval: ReadableDuration::millis(100), + inspect_kvdb_interval: ReadableDuration::secs(2), + // The default value of `inspect_cpu_util_thd` is 0.4, which means + // when the cpu utilization is greater than 40%, the store might be + // regarded as a slow node if there exists delayed inspected messages. + // It's good enough for most cases to reduce the false positive rate. 
+ inspect_cpu_util_thd: 0.4, // The param `slow_trend_unsensitive_cause == 2.0` can yield good results, // make it `10.0` to reduce a bit sensitiveness because SpikeFilter is disabled slow_trend_unsensitive_cause: 10.0, @@ -510,6 +539,7 @@ impl Default for Config { renew_leader_lease_advance_duration: ReadableDuration::secs(0), allow_unsafe_vote_after_start: false, report_region_buckets_tick_interval: ReadableDuration::secs(10), + gc_peer_check_interval: ReadableDuration::secs(60), max_snapshot_file_raw_size: ReadableSize::mb(100), unreachable_backoff: ReadableDuration::secs(10), // TODO: make its value reasonable @@ -518,6 +548,7 @@ impl Default for Config { check_request_snapshot_interval: ReadableDuration::minutes(1), enable_v2_compatible_learner: false, unsafe_disable_check_quorum: false, + min_pending_apply_region_count: 10, } } } @@ -581,6 +612,10 @@ impl Config { self.region_compact_check_step.unwrap() } + pub fn region_compact_redundant_rows_percent(&self) -> u64 { + self.region_compact_redundant_rows_percent.unwrap() + } + #[inline] pub fn warmup_entry_cache_enabled(&self) -> bool { self.max_entry_cache_warmup_duration.0 != Duration::from_secs(0) @@ -622,6 +657,29 @@ impl Config { } } + /// Optimize the interval of different inspectors according to the + /// configuration. + pub fn optimize_inspector(&mut self, separated_raft_mount_path: bool) { + // If the kvdb uses the same mount path with raftdb, the health status + // of kvdb will be inspected by raftstore automatically. So it's not necessary + // to inspect kvdb. + if !separated_raft_mount_path { + self.inspect_kvdb_interval = ReadableDuration::ZERO; + } else { + // If the inspect_kvdb_interval is less than inspect_interval, it should + // use `inspect_interval` * 10 as an empirical inspect interval for KvDB Disk + // I/O. 
+ let inspect_kvdb_interval = if self.inspect_kvdb_interval < self.inspect_interval + && self.inspect_kvdb_interval != ReadableDuration::ZERO + { + self.inspect_interval * 10 + } else { + self.inspect_kvdb_interval + }; + self.inspect_kvdb_interval = inspect_kvdb_interval; + } + } + pub fn validate( &mut self, region_split_size: ReadableSize, @@ -766,6 +824,15 @@ impl Config { )); } + let region_compact_redundant_rows_percent = + self.region_compact_redundant_rows_percent.unwrap(); + if !(1..=100).contains(®ion_compact_redundant_rows_percent) { + return Err(box_err!( + "region-compact-redundant-rows-percent must between 1 and 100, current value is {}", + region_compact_redundant_rows_percent + )); + } + if self.local_read_batch_size == 0 { return Err(box_err!("local-read-batch-size must be greater than 0")); } @@ -903,6 +970,12 @@ impl Config { )); } + if self.min_pending_apply_region_count == 0 { + return Err(box_err!( + "min_pending_apply_region_count must be greater than 0" + )); + } + Ok(()) } @@ -992,8 +1065,11 @@ impl Config { .with_label_values(&["region_compact_min_redundant_rows"]) .set(self.region_compact_min_redundant_rows as f64); CONFIG_RAFTSTORE_GAUGE - .with_label_values(&["region_compact_tombstones_percent"]) - .set(self.region_compact_tombstones_percent as f64); + .with_label_values(&["region_compact_redundant_rows_percent"]) + .set( + self.region_compact_redundant_rows_percent + .unwrap_or_default() as f64, + ); CONFIG_RAFTSTORE_GAUGE .with_label_values(&["pd_heartbeat_tick_interval"]) .set(self.pd_heartbeat_tick_interval.as_secs_f64()); @@ -1035,6 +1111,9 @@ impl Config { CONFIG_RAFTSTORE_GAUGE .with_label_values(&["leader_transfer_max_log_lag"]) .set(self.leader_transfer_max_log_lag as f64); + CONFIG_RAFTSTORE_GAUGE + .with_label_values(&["gc_peer_check_interval"]) + .set(self.gc_peer_check_interval.as_secs_f64()); CONFIG_RAFTSTORE_GAUGE .with_label_values(&["snap_apply_batch_size"]) @@ -1517,5 +1596,26 @@ mod tests { cfg.raft_log_gc_count_limit(), split_size * 3 / 4 / ReadableSize::kb(1) ); + + cfg = Config::new(); + cfg.optimize_inspector(false); + assert_eq!(cfg.inspect_kvdb_interval, ReadableDuration::ZERO); + + cfg = Config::new(); + cfg.inspect_kvdb_interval = ReadableDuration::secs(1); + cfg.optimize_inspector(false); + assert_eq!(cfg.inspect_kvdb_interval, ReadableDuration::ZERO); + cfg.optimize_inspector(true); + assert_eq!(cfg.inspect_kvdb_interval, ReadableDuration::ZERO); + + cfg.inspect_kvdb_interval = ReadableDuration::secs(1); + cfg.optimize_inspector(true); + assert_eq!(cfg.inspect_kvdb_interval, ReadableDuration::secs(1)); + + cfg = Config::new(); + cfg.inspect_kvdb_interval = ReadableDuration::millis(1); + cfg.inspect_interval = ReadableDuration::millis(100); + cfg.optimize_inspector(true); + assert_eq!(cfg.inspect_kvdb_interval, ReadableDuration::secs(1)); } } diff --git a/components/raftstore/src/store/fsm/apply.rs b/components/raftstore/src/store/fsm/apply.rs index e2b1cedc88d..c03efc15ad2 100644 --- a/components/raftstore/src/store/fsm/apply.rs +++ b/components/raftstore/src/store/fsm/apply.rs @@ -272,6 +272,7 @@ pub enum ExecResult { regions: Vec, derived: Region, new_split_regions: HashMap, + share_source_region_size: bool, }, PrepareMerge { region: Region, @@ -554,7 +555,8 @@ where delegate.unfinished_write_seqno.push(seqno); } self.prepare_for(delegate); - delegate.last_flush_applied_index = delegate.apply_state.get_applied_index() + delegate.last_flush_applied_index = delegate.apply_state.get_applied_index(); + delegate.has_pending_ssts = 
false; } self.kv_wb_last_bytes = self.kv_wb().data_size() as u64; self.kv_wb_last_keys = self.kv_wb().count() as u64; @@ -661,9 +663,7 @@ where results: VecDeque>, ) { if self.host.pre_persist(&delegate.region, true, None) { - if !delegate.pending_remove { - delegate.maybe_write_apply_state(self); - } + delegate.maybe_write_apply_state(self); self.commit_opt(delegate, false); } else { debug!("do not persist when finish_for"; @@ -678,7 +678,7 @@ where exec_res: results, metrics: mem::take(&mut delegate.metrics), applied_term: delegate.applied_term, - bucket_stat: delegate.buckets.clone().map(Box::new), + bucket_stat: delegate.buckets.clone(), }); if !self.kv_wb().is_empty() { // Pending writes not flushed, need to set seqno to following ApplyRes later @@ -792,7 +792,7 @@ pub fn notify_stale_req_with_msg(term: u64, msg: String, cb: impl ErrorCallback) } /// Checks if a write is needed to be issued before handling the command. -fn should_write_to_engine(cmd: &RaftCmdRequest) -> bool { +fn should_write_to_engine(has_pending_writes: bool, cmd: &RaftCmdRequest) -> bool { if cmd.has_admin_request() { match cmd.get_admin_request().get_cmd_type() { // ComputeHash require an up to date snapshot. @@ -810,7 +810,7 @@ fn should_write_to_engine(cmd: &RaftCmdRequest) -> bool { if req.has_delete_range() { return true; } - if req.has_ingest_sst() { + if req.has_ingest_sst() && has_pending_writes { return true; } } @@ -918,7 +918,8 @@ where /// All of messages that need to continue to be handled after /// the source peer has applied its logs and pending entries /// are all handled. - pending_msgs: Vec>, + #[allow(clippy::vec_box)] + pending_msgs: Vec>>, /// Cache heap size for itself. heap_size: Option, @@ -1044,6 +1045,8 @@ where buckets: Option, unfinished_write_seqno: Vec, + + has_pending_ssts: bool, } impl ApplyDelegate @@ -1078,6 +1081,7 @@ where trace: ApplyMemoryTrace::default(), buckets: None, unfinished_write_seqno: vec![], + has_pending_ssts: false, } } @@ -1228,9 +1232,15 @@ where if apply_ctx.yield_high_latency_operation && has_high_latency_operation(&cmd) { self.priority = Priority::Low; } + if self.has_pending_ssts { + // we are in low priority handler and to avoid overlapped ssts with same region + // just return Yield + return ApplyResult::Yield; + } let mut has_unflushed_data = self.last_flush_applied_index != self.apply_state.get_applied_index(); - if (has_unflushed_data && should_write_to_engine(&cmd) + if (has_unflushed_data + && should_write_to_engine(!apply_ctx.kv_wb().is_empty(), &cmd) || apply_ctx.kv_wb().should_write_to_engine()) && apply_ctx.host.pre_persist(&self.region, false, Some(&cmd)) { @@ -1998,6 +2008,7 @@ where match ctx.importer.validate(sst) { Ok(meta_info) => { ctx.pending_ssts.push(meta_info.clone()); + self.has_pending_ssts = true; ssts.push(meta_info) } Err(e) => { @@ -2006,7 +2017,6 @@ where panic!("{} ingest {:?}: {:?}", self.tag, sst, e); } }; - Ok(()) } } @@ -2114,14 +2124,14 @@ where match change_type { ConfChangeType::AddNode => { - let add_ndoe_fp = || { + let add_node_fp = || { fail_point!( "apply_on_add_node_1_2", self.id() == 2 && self.region_id() == 1, |_| {} ) }; - add_ndoe_fp(); + add_node_fp(); PEER_ADMIN_CMD_COUNTER_VEC .with_label_values(&["add_peer", "all"]) @@ -2516,6 +2526,9 @@ where admin_req .mut_splits() .set_right_derive(split.get_right_derive()); + admin_req + .mut_split() + .set_share_source_region_size(split.get_share_source_region_size()); admin_req.mut_splits().mut_requests().push(split); // This method is executed only when there are 
unapplied entries after being // restarted. So there will be no callback, it's OK to return a response @@ -2560,6 +2573,7 @@ where derived.mut_region_epoch().set_version(new_version); let right_derive = split_reqs.get_right_derive(); + let share_source_region_size = split_reqs.get_share_source_region_size(); let mut regions = Vec::with_capacity(new_region_cnt + 1); // Note that the split requests only contain ids for new regions, so we need // to handle new regions and old region separately. @@ -2724,6 +2738,7 @@ where regions, derived, new_split_regions, + share_source_region_size, }), )) } @@ -2850,11 +2865,11 @@ where fail_point!("before_handle_catch_up_logs_for_merge"); // Sends message to the source peer fsm and pause `exec_commit_merge` process let logs_up_to_date = Arc::new(AtomicU64::new(0)); - let msg = SignificantMsg::CatchUpLogs(CatchUpLogs { + let msg = Box::new(SignificantMsg::CatchUpLogs(CatchUpLogs { target_region_id: self.region_id(), merge: merge.to_owned(), logs_up_to_date: logs_up_to_date.clone(), - }); + })); ctx.notifier .notify_one(source_region_id, PeerMsg::SignificantMsg(msg)); return Ok(( @@ -3359,10 +3374,10 @@ where } fn update_memory_trace(&mut self, event: &mut TraceEvent) { - let pending_cmds = self.pending_cmds.heap_size(); + let pending_cmds = self.pending_cmds.approximate_heap_size(); let merge_yield = if let Some(ref mut state) = self.yield_state { if state.heap_size.is_none() { - state.heap_size = Some(state.heap_size()); + state.heap_size = Some(state.approximate_heap_size()); } state.heap_size.unwrap() } else { @@ -3757,10 +3772,13 @@ where }, } -impl ResourceMetered for Msg { +impl ResourceMetered for Box> { fn consume_resource(&self, resource_ctl: &Arc) -> Option { - match self { - Msg::Apply { apply, .. } => { + if !resource_ctl.is_customized() { + return None; + } + match **self { + Msg::Apply { ref apply, .. } => { let mut dominant_group = "".to_owned(); let mut max_write_bytes = 0; for cached_entries in &apply.entries { @@ -3868,7 +3886,7 @@ where pub applied_term: u64, pub exec_res: VecDeque>, pub metrics: ApplyMetrics, - pub bucket_stat: Option>, + pub bucket_stat: Option, pub write_seqno: Vec, } @@ -3893,7 +3911,7 @@ where EK: KvEngine, { delegate: ApplyDelegate, - receiver: Receiver>, + receiver: Receiver>>, mailbox: Option>>, } @@ -3903,12 +3921,14 @@ where { fn from_peer( peer: &Peer, - ) -> (LooseBoundedSender>, Box>) { + ) -> (LooseBoundedSender>>, Box>) { let reg = Registration::new(peer); ApplyFsm::from_registration(reg) } - fn from_registration(reg: Registration) -> (LooseBoundedSender>, Box>) { + fn from_registration( + reg: Registration, + ) -> (LooseBoundedSender>>, Box>) { let (tx, rx) = loose_bounded(usize::MAX); let delegate = ApplyDelegate::from_registration(reg); ( @@ -4070,6 +4090,7 @@ where /// Handles peer destroy. When a peer is destroyed, the corresponding apply /// delegate should be removed too. 
fn handle_destroy(&mut self, ctx: &mut ApplyContext, d: Destroy) { + fail_point!("on_apply_handle_destroy"); assert_eq!(d.region_id, self.delegate.region_id()); if d.merge_from_snapshot { assert_eq!(self.delegate.stopped, false); @@ -4078,13 +4099,11 @@ where self.destroy(ctx); ctx.notifier.notify_one( self.delegate.region_id(), - PeerMsg::ApplyRes { - res: TaskRes::Destroy { - region_id: self.delegate.region_id(), - peer_id: self.delegate.id(), - merge_from_snapshot: d.merge_from_snapshot, - }, - }, + PeerMsg::ApplyRes(Box::new(TaskRes::Destroy { + region_id: self.delegate.region_id(), + peer_id: self.delegate.id(), + merge_from_snapshot: d.merge_from_snapshot, + })), ); } } @@ -4148,7 +4167,7 @@ where .store(region_id, Ordering::SeqCst); // To trigger the target apply fsm if let Some(mailbox) = ctx.router.mailbox(catch_up_logs.target_region_id) { - let _ = mailbox.force_send(Msg::Noop); + let _ = mailbox.force_send(Box::new(Msg::Noop)); } else { error!( "failed to get mailbox, are we shutting down?"; @@ -4335,7 +4354,8 @@ where } } - fn handle_tasks(&mut self, apply_ctx: &mut ApplyContext, msgs: &mut Vec>) { + #[allow(clippy::vec_box)] + fn handle_tasks(&mut self, apply_ctx: &mut ApplyContext, msgs: &mut Vec>>) { let mut drainer = msgs.drain(..); let mut batch_apply = None; loop { @@ -4350,7 +4370,7 @@ where }; if batch_apply.is_some() { - match &msg { + match *msg { Msg::Apply { .. } => (), _ => { self.handle_apply(apply_ctx, batch_apply.take().unwrap()); @@ -4363,7 +4383,7 @@ where } } - match msg { + match *msg { Msg::Apply { start, mut apply } => { let apply_wait = start.saturating_elapsed(); apply_ctx.apply_wait.observe(apply_wait.as_secs_f64()); @@ -4384,7 +4404,9 @@ where } else { self.handle_apply(apply_ctx, batch_apply.take().unwrap()); if let Some(ref mut state) = self.delegate.yield_state { - state.pending_msgs.push(Msg::Apply { start, apply }); + state + .pending_msgs + .push(Box::new(Msg::Apply { start, apply })); state.pending_msgs.extend(drainer); break; } @@ -4430,7 +4452,7 @@ impl Fsm for ApplyFsm where EK: KvEngine, { - type Message = Msg; + type Message = Box>; #[inline] fn is_stopped(&self) -> bool { @@ -4537,7 +4559,8 @@ pub struct ApplyPoller where EK: KvEngine, { - msg_buf: Vec>, + #[allow(clippy::vec_box)] + msg_buf: Vec>>, apply_ctx: ApplyContext, messages_per_tick: usize, cfg_tracker: Tracker, @@ -4641,6 +4664,7 @@ where self.apply_ctx.flush(); for fsm in fsms.iter_mut().flatten() { fsm.delegate.last_flush_applied_index = fsm.delegate.apply_state.get_applied_index(); + fsm.delegate.has_pending_ssts = false; fsm.delegate.update_memory_trace(&mut self.trace_event); } MEMTRACE_APPLYS.trace(mem::take(&mut self.trace_event)); @@ -4768,9 +4792,9 @@ where EK: KvEngine, { pub fn schedule_task(&self, region_id: u64, msg: Msg) { - let reg = match self.try_send(region_id, msg) { + let reg = match self.try_send(region_id, Box::new(msg)) { Either::Left(Ok(())) => return, - Either::Left(Err(TrySendError::Disconnected(msg))) | Either::Right(msg) => match msg { + Either::Left(Err(TrySendError::Disconnected(msg))) | Either::Right(msg) => match *msg { Msg::Registration(reg) => reg, Msg::Apply { mut apply, .. } => { info!( @@ -4938,7 +4962,7 @@ mod memtrace { } impl HeapSize for PendingCmdQueue { - fn heap_size(&self) -> usize { + fn approximate_heap_size(&self) -> usize { // Some fields of `PendingCmd` are on stack, but ignore them because they are // just some small boxed closures. 
self.normals.capacity() * mem::size_of::>() @@ -4949,7 +4973,7 @@ mod memtrace { where EK: KvEngine, { - fn heap_size(&self) -> usize { + fn approximate_heap_size(&self) -> usize { let mut size = self.pending_entries.capacity() * mem::size_of::(); for e in &self.pending_entries { size += bytes_capacity(&e.data) + bytes_capacity(&e.context); @@ -4957,7 +4981,7 @@ mod memtrace { size += self.pending_msgs.capacity() * mem::size_of::>(); for msg in &self.pending_msgs { - size += msg.heap_size(); + size += msg.approximate_heap_size(); } size @@ -4969,9 +4993,9 @@ mod memtrace { EK: KvEngine, { /// Only consider large fields in `Msg`. - fn heap_size(&self) -> usize { + fn approximate_heap_size(&self) -> usize { match self { - Msg::LogsUpToDate(l) => l.heap_size(), + Msg::LogsUpToDate(l) => l.approximate_heap_size(), // For entries in `Msg::Apply`, heap size is already updated when fetching them // from `raft::Storage`. So use `0` here. Msg::Apply { .. } => 0, @@ -4989,7 +5013,7 @@ mod memtrace { } impl HeapSize for CatchUpLogs { - fn heap_size(&self) -> usize { + fn approximate_heap_size(&self) -> usize { let mut size: usize = 0; for e in &self.merge.entries { size += bytes_capacity(&e.data) + bytes_capacity(&e.context); @@ -5096,8 +5120,8 @@ mod tests { impl Notifier for TestNotifier { fn notify(&self, apply_res: Vec>) { for r in apply_res { - let res = TaskRes::Apply(r); - let _ = self.tx.send(PeerMsg::ApplyRes { res }); + let res = Box::new(TaskRes::Apply(r)); + let _ = self.tx.send(PeerMsg::ApplyRes(res)); } } fn notify_one(&self, _: u64, msg: PeerMsg) { @@ -5188,7 +5212,7 @@ mod tests { req.set_ingest_sst(IngestSstRequest::default()); let mut cmd = RaftCmdRequest::default(); cmd.mut_requests().push(req); - assert_eq!(should_write_to_engine(&cmd), true); + assert_eq!(should_write_to_engine(true, &cmd), true); assert_eq!(should_sync_log(&cmd), true); // Normal command @@ -5202,7 +5226,17 @@ mod tests { let mut req = RaftCmdRequest::default(); req.mut_admin_request() .set_cmd_type(AdminCmdType::ComputeHash); - assert_eq!(should_write_to_engine(&req), true); + assert_eq!(should_write_to_engine(true, &req), true); + assert_eq!(should_write_to_engine(false, &req), true); + + // DeleteRange command + let mut req = Request::default(); + req.set_cmd_type(CmdType::DeleteRange); + req.set_delete_range(DeleteRangeRequest::default()); + let mut cmd = RaftCmdRequest::default(); + cmd.mut_requests().push(req); + assert_eq!(should_write_to_engine(true, &cmd), true); + assert_eq!(should_write_to_engine(false, &cmd), true); // IngestSst command let mut req = Request::default(); @@ -5210,7 +5244,8 @@ mod tests { req.set_ingest_sst(IngestSstRequest::default()); let mut cmd = RaftCmdRequest::default(); cmd.mut_requests().push(req); - assert_eq!(should_write_to_engine(&cmd), true); + assert_eq!(should_write_to_engine(true, &cmd), true); + assert_eq!(should_write_to_engine(false, &cmd), false); } #[test] @@ -5293,10 +5328,7 @@ mod tests { E: KvEngine, { match receiver.recv_timeout(Duration::from_secs(3)) { - Ok(PeerMsg::ApplyRes { - res: TaskRes::Apply(res), - .. - }) => res, + Ok(PeerMsg::ApplyRes(box TaskRes::Apply(res))) => res, e => panic!("unexpected res {:?}", e), } } @@ -5444,10 +5476,7 @@ mod tests { ], ); let apply_res = match rx.recv_timeout(Duration::from_secs(3)) { - Ok(PeerMsg::ApplyRes { - res: TaskRes::Apply(res), - .. 
- }) => res, + Ok(PeerMsg::ApplyRes(box TaskRes::Apply(res))) => res, e => panic!("unexpected apply result: {:?}", e), }; let apply_state_key = keys::apply_state_key(2); @@ -5478,12 +5507,9 @@ mod tests { router.schedule_task(2, Msg::destroy(2, false)); let (region_id, peer_id) = match rx.recv_timeout(Duration::from_secs(3)) { - Ok(PeerMsg::ApplyRes { - res: TaskRes::Destroy { - region_id, peer_id, .. - }, - .. - }) => (region_id, peer_id), + Ok(PeerMsg::ApplyRes(box TaskRes::Destroy { + region_id, peer_id, .. + })) => (region_id, peer_id), e => panic!("expected destroy result, but got {:?}", e), }; assert_eq!(peer_id, 1); @@ -5526,6 +5552,21 @@ mod tests { ) } + fn cb_conf_change( + idx: u64, + term: u64, + tx: Sender, + ) -> Proposal> { + proposal( + true, + idx, + term, + Callback::write(Box::new(move |resp: WriteResponse| { + tx.send(resp.response).unwrap(); + })), + ) + } + struct EntryBuilder { entry: Entry, req: RaftCmdRequest, @@ -5653,6 +5694,14 @@ mod tests { self } + fn conf_change(mut self, changes: Vec) -> EntryBuilder { + let mut req = AdminRequest::default(); + req.set_cmd_type(AdminCmdType::ChangePeerV2); + req.mut_change_peer_v2().set_changes(changes.into()); + self.req.set_admin_request(req); + self + } + fn build(mut self) -> Entry { self.entry .set_data(self.req.write_to_bytes().unwrap().into()); @@ -5739,7 +5788,6 @@ mod tests { self.header.clone(), bin, 1000, - false, ); let (bytes, _) = req_encoder.encode(); self.entry.set_data(bytes.into()); @@ -6187,7 +6235,7 @@ mod tests { // normal put command, so the first apply_res.exec_res should be empty. let apply_res = fetch_apply_res(&rx); assert!(apply_res.exec_res.is_empty()); - // The region was rescheduled low-priority becasuee of ingest command, + // The region was rescheduled low-priority because of ingest command, // only put entry has been applied; let apply_res = fetch_apply_res(&rx); assert_eq!(apply_res.applied_term, 3); @@ -6826,9 +6874,12 @@ mod tests { assert!(!resp.get_header().has_error(), "{:?}", resp); } let mut res = fetch_apply_res(&rx); - // There may be one or two ApplyRes which depends on whether these two apply - // msgs are batched together. - if res.apply_state.get_applied_index() == 3 { + // There are five entries [put, ingest, put, ingest, put] in one region, + // so the apply results should be notified at indexes 2 and 4. + if res.apply_state.get_applied_index() == 2 { + res = fetch_apply_res(&rx); + } + if res.apply_state.get_applied_index() == 4 { res = fetch_apply_res(&rx); } assert_eq!(res.apply_state.get_applied_index(), 5); @@ -6924,7 +6975,7 @@ mod tests { router.schedule_task(1, Msg::apply(apply2)); let res = fetch_apply_res(&rx); - let bucket_version = res.bucket_stat.unwrap().as_ref().meta.version; + let bucket_version = res.bucket_stat.unwrap().meta.version; assert_eq!(bucket_version, 2); @@ -7088,6 +7139,7 @@ mod tests { regions, derived: _, new_split_regions: _, + share_source_region_size: _, } = apply_res.exec_res.front().unwrap() { let r8 = regions.get(0).unwrap(); @@ -7650,6 +7702,125 @@ mod tests { system.shutdown(); } + // When a peer is removed, it is necessary to update its apply state because + // this peer may be simultaneously taking a snapshot. An outdated apply state + // invalidates the coprocessor cache assumption (apply state must match data + // in the snapshot) and potentially leads to a violation of linearizability + // (returning a stale cache). 
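// A minimal sketch (not part of the patch; the function and parameter names
// are hypothetical) of the invariant stated in the comment above: cached
// coprocessor results are only safe to serve while the recorded apply state
// matches the apply index captured with the snapshot.
fn coprocessor_cache_is_valid(cached_apply_index: u64, snapshot_apply_index: u64) -> bool {
    // If a removed peer kept a stale apply state, a reader could be served
    // results built at a different point in the log, violating linearizability.
    cached_apply_index == snapshot_apply_index
}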
+ #[test] + fn test_conf_change_remove_node_update_apply_state() { + let (_path, engine) = create_tmp_engine("test-delegate"); + let (_import_dir, importer) = create_tmp_importer("test-delegate"); + let peer_id = 3; + let mut reg = Registration { + id: peer_id, + term: 1, + ..Default::default() + }; + reg.region.set_id(1); + reg.region.set_end_key(b"k5".to_vec()); + reg.region.mut_region_epoch().set_version(3); + let peers = vec![new_peer(2, 3), new_peer(4, 5), new_learner_peer(6, 7)]; + reg.region.set_peers(peers.into()); + let (tx, apply_res_rx) = mpsc::channel(); + let sender = Box::new(TestNotifier { tx }); + let coprocessor_host = CoprocessorHost::::default(); + let (region_scheduler, _) = dummy_scheduler(); + let cfg = Arc::new(VersionTrack::new(Config::default())); + let (router, mut system) = create_apply_batch_system(&cfg.value(), None); + let pending_create_peers = Arc::new(Mutex::new(HashMap::default())); + let builder = super::Builder:: { + tag: "test-store".to_owned(), + cfg, + sender, + importer, + region_scheduler, + coprocessor_host, + engine: engine.clone(), + router: router.clone(), + store_id: 2, + pending_create_peers, + }; + system.spawn("test-conf-change".to_owned(), builder); + + router.schedule_task(1, Msg::Registration(reg.dup())); + + let mut index_id = 1; + let epoch = reg.region.get_region_epoch().to_owned(); + + // Write some data. + let (capture_tx, capture_rx) = mpsc::channel(); + let put_entry = EntryBuilder::new(index_id, 1) + .put(b"k1", b"v1") + .epoch(epoch.get_conf_ver(), epoch.get_version()) + .build(); + router.schedule_task( + 1, + Msg::apply(apply( + peer_id, + 1, + 1, + vec![put_entry], + vec![cb(index_id, 1, capture_tx)], + )), + ); + let resp = capture_rx.recv_timeout(Duration::from_secs(3)).unwrap(); + assert!(!resp.get_header().has_error(), "{:?}", resp); + let initial_state: RaftApplyState = engine + .get_msg_cf(CF_RAFT, &keys::apply_state_key(1)) + .unwrap() + .unwrap(); + assert_ne!(initial_state.get_applied_index(), 0); + match apply_res_rx.recv_timeout(Duration::from_secs(3)) { + Ok(PeerMsg::ApplyRes(box TaskRes::Apply(apply_res))) => { + assert_eq!(apply_res.apply_state, initial_state) + } + e => panic!("unexpected result: {:?}", e), + } + index_id += 1; + + // Remove itself. 
+ let (capture_tx, capture_rx) = mpsc::channel(); + let mut remove_node = ChangePeerRequest::default(); + remove_node.set_change_type(ConfChangeType::RemoveNode); + remove_node.set_peer(new_peer(2, 3)); + let conf_change = EntryBuilder::new(index_id, 1) + .conf_change(vec![remove_node]) + .epoch(epoch.get_conf_ver(), epoch.get_version()) + .build(); + router.schedule_task( + 1, + Msg::apply(apply( + peer_id, + 1, + 1, + vec![conf_change], + vec![cb_conf_change(index_id, 1, capture_tx)], + )), + ); + let resp = capture_rx.recv_timeout(Duration::from_secs(3)).unwrap(); + assert!(!resp.get_header().has_error(), "{:?}", resp); + + let apply_state: RaftApplyState = engine + .get_msg_cf(CF_RAFT, &keys::apply_state_key(1)) + .unwrap() + .unwrap(); + match apply_res_rx.recv_timeout(Duration::from_secs(3)) { + Ok(PeerMsg::ApplyRes(box TaskRes::Apply(apply_res))) => { + assert_eq!(apply_res.apply_state, apply_state) + } + e => panic!("unexpected result: {:?}", e), + } + assert!( + apply_state.get_applied_index() > initial_state.get_applied_index(), + "\n{:?}\n{:?}", + apply_state, + initial_state + ); + + system.shutdown(); + } + #[test] fn pending_cmd_leak() { let res = panic_hook::recover_safe(|| { diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index da91e26eb09..702b7a6bd4c 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -22,6 +22,7 @@ use engine_traits::{Engines, KvEngine, RaftEngine, SstMetaInfo, WriteBatchExt, C use error_code::ErrorCodeExt; use fail::fail_point; use futures::channel::mpsc::UnboundedSender; +use itertools::Itertools; use keys::{self, enc_end_key, enc_start_key}; use kvproto::{ brpb::CheckAdminResponse, @@ -41,7 +42,7 @@ use kvproto::{ replication_modepb::{DrAutoSyncState, ReplicationMode}, }; use parking_lot::RwLockWriteGuard; -use pd_client::{new_bucket_stats, BucketMeta, BucketStat}; +use pd_client::BucketMeta; use protobuf::Message; use raft::{ self, @@ -49,13 +50,15 @@ use raft::{ GetEntriesContext, Progress, ReadState, SnapshotStatus, StateRole, INVALID_INDEX, NO_LIMIT, }; use smallvec::SmallVec; +use strum::{EnumCount, VariantNames}; use tikv_alloc::trace::TraceEvent; use tikv_util::{ - box_err, debug, defer, error, escape, info, is_zero_duration, + box_err, debug, defer, error, escape, info, info_or_debug, is_zero_duration, mpsc::{self, LooseBoundedSender, Receiver}, + slow_log, store::{find_peer, find_peer_by_id, is_learner, region_on_same_stores}, sys::disk::DiskUsage, - time::{monotonic_raw_now, Instant as TiInstant}, + time::{monotonic_raw_now, Instant as TiInstant, SlowTimer}, trace, warn, worker::{ScheduleError, Scheduler}, Either, @@ -89,22 +92,22 @@ use crate::{ TRANSFER_LEADER_COMMAND_REPLY_CTX, }, region_meta::RegionMeta, + snapshot_backup::{AbortReason, SnapshotBrState, SnapshotBrWaitApplyRequest}, transport::Transport, unsafe_recovery::{ - exit_joint_request, ForceLeaderState, SnapshotRecoveryState, - SnapshotRecoveryWaitApplySyncer, UnsafeRecoveryExecutePlanSyncer, + exit_joint_request, ForceLeaderState, UnsafeRecoveryExecutePlanSyncer, UnsafeRecoveryFillOutReportSyncer, UnsafeRecoveryForceLeaderSyncer, UnsafeRecoveryState, UnsafeRecoveryWaitApplySyncer, }, - util, - util::{is_region_initialized, KeysInfoFormatter, LeaseState}, + util::{self, compare_region_epoch, KeysInfoFormatter, LeaseState}, worker::{ Bucket, BucketRange, CleanupTask, ConsistencyCheckTask, GcSnapshotTask, RaftlogGcTask, ReadDelegate, ReadProgress, RegionTask, SplitCheckTask, 
}, CasualMessage, Config, LocksStatus, MergeResultKind, PdTask, PeerMsg, PeerTick, - ProposalContext, RaftCmdExtraOpts, RaftCommand, RaftlogFetchResult, ReadCallback, ReadTask, - SignificantMsg, SnapKey, StoreMsg, WriteCallback, + ProposalContext, RaftCmdExtraOpts, RaftCommand, RaftlogFetchResult, ReadCallback, + ReadIndexContext, ReadTask, SignificantMsg, SnapKey, StoreMsg, WriteCallback, + RAFT_INIT_LOG_INDEX, }, Error, Result, }; @@ -128,6 +131,10 @@ enum DelayReason { /// in most case. const MAX_REGIONS_IN_ERROR: usize = 10; const REGION_SPLIT_SKIP_MAX_COUNT: usize = 3; +/// Limits the request size that can be batched in a single RaftCmdRequest. +// todo: this figure may be changed to a more suitable value. +#[allow(clippy::identity_op)] +const MAX_BATCH_SIZE_LIMIT: u64 = 1 * 1024 * 1024; const UNSAFE_RECOVERY_STATE_TIMEOUT: Duration = Duration::from_secs(60); pub const MAX_PROPOSAL_SIZE_RATIO: f64 = 0.4; @@ -205,8 +212,8 @@ where while let Ok(msg) = self.receiver.try_recv() { let callback = match msg { PeerMsg::RaftCommand(cmd) => cmd.callback, - PeerMsg::CasualMessage(CasualMessage::SplitRegion { callback, .. }) => callback, - PeerMsg::RaftMessage(im) => { + PeerMsg::CasualMessage(box CasualMessage::SplitRegion { callback, .. }) => callback, + PeerMsg::RaftMessage(im, _) => { raft_messages_size += im.heap_size; continue; } @@ -286,6 +293,7 @@ where region, meta_peer, wait_data, + None, )?, tick_registry: [false; PeerTick::VARIANT_COUNT], missing_ticks: 0, @@ -316,12 +324,16 @@ where engines: Engines, region_id: u64, peer: metapb::Peer, + create_by_peer: metapb::Peer, ) -> Result> { // We will remove tombstone key when apply snapshot info!( "replicate peer"; "region_id" => region_id, "peer_id" => peer.get_id(), + "store_id" => store_id, + "create_by_peer_id" => create_by_peer.get_id(), + "create_by_peer_store_id" => create_by_peer.get_store_id(), ); let mut region = metapb::Region::default(); @@ -341,6 +353,7 @@ where &region, peer, false, + Some(create_by_peer), )?, tick_registry: [false; PeerTick::VARIANT_COUNT], missing_ticks: 0, @@ -431,8 +444,13 @@ where // No batch request whose size exceed 20% of raft_entry_max_size, // so total size of request in batch_raft_request would not exceed // (40% + 20%) of raft_entry_max_size + // Also, to prevent the write batch size from becoming too large when + // raft_entry_max_size is set too high (all requests in a RaftCmdRequest will be + // written in one RocksDB write batch), we use MAX_BATCH_SIZE_LIMIT to + // limit the number of requests batched within a single RaftCmdRequest. if req.get_requests().is_empty() || req_size as u64 > (cfg.raft_entry_max_size.0 as f64 * 0.2) as u64 + || (self.batch_req_size + req_size as u64) > MAX_BATCH_SIZE_LIMIT { return false; } @@ -612,14 +630,29 @@ where } pub fn handle_msgs(&mut self, msgs: &mut Vec>) { - let timer = TiInstant::now_coarse(); + let timer = SlowTimer::from_millis(100); let count = msgs.len(); + #[allow(const_evaluatable_unchecked)] + let mut distribution = [0; PeerMsg::<EK>::COUNT]; for m in msgs.drain(..) { + // Skip handling the remaining messages if the fsm is destroyed. This avoids handling + // arbitrary messages (e.g. CasualMessage::ForceCompactRaftLogs) that may need + // to read raft logs, which may lead to a panic. 
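// A minimal sketch of the drain-and-skip pattern these hunks introduce, with
// `SimpleFsm`/`SimpleMsg` as hypothetical stand-ins for the real
// `PeerFsm`/`PeerMsg` types; it only illustrates the loop shape, not the
// actual dispatch logic.
struct SimpleFsm { stopped: bool }
enum SimpleMsg { Raft = 0, Tick = 1, Casual = 2 }
fn handle_msgs_sketch(fsm: &SimpleFsm, msgs: &mut Vec<SimpleMsg>) -> [usize; 3] {
    let mut distribution = [0usize; 3];
    for m in msgs.drain(..) {
        // Once the fsm is destroyed mid-batch, drop the remaining messages
        // instead of handling them: they may try to read raft logs that are
        // already gone.
        if fsm.stopped { break; }
        // Per-variant counter, reported via the slow log when the batch is
        // unusually slow.
        distribution[m as usize] += 1;
        // ... dispatch on the message here ...
    }
    distribution
}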
+ if self.fsm.stopped { + break; + } + distribution[m.discriminant()] += 1; match m { - PeerMsg::RaftMessage(msg) => { + PeerMsg::RaftMessage(msg, sent_time) => { + if let Some(sent_time) = sent_time { + let wait_time = sent_time.saturating_elapsed().as_secs_f64(); + self.ctx.raft_metrics.process_wait_time.observe(wait_time); + } + if !self.ctx.coprocessor_host.on_raft_message(&msg.msg) { continue; } + if let Err(e) = self.on_raft_message(msg) { error!(%e; "handle raft message err"; @@ -656,7 +689,7 @@ where && !self.fsm.peer.disk_full_peers.majority()) || cmd.extra_opts.disk_full_opt == DiskFullOpt::NotAllowedOnFull) { - self.fsm.batch_req_builder.add(cmd, req_size); + self.fsm.batch_req_builder.add(*cmd, req_size); if self.fsm.batch_req_builder.should_finish(&self.ctx.cfg) { self.propose_pending_batch_raft_command(); } @@ -669,7 +702,7 @@ where } } PeerMsg::Tick(tick) => self.on_tick(tick), - PeerMsg::ApplyRes { res } => { + PeerMsg::ApplyRes(res) => { self.on_apply_res(res); } PeerMsg::SignificantMsg(msg) => self.on_significant_msg(msg), @@ -688,23 +721,25 @@ where PeerMsg::UpdateReplicationMode => self.on_update_replication_mode(), PeerMsg::Destroy(peer_id) => { if self.fsm.peer.peer_id() == peer_id { - match self.fsm.peer.maybe_destroy(self.ctx) { - None => self.ctx.raft_metrics.message_dropped.applying_snap.inc(), - Some(job) => { - self.handle_destroy_peer(job); - } - } + self.maybe_destroy(); } } } } self.on_loop_finished(); + slow_log!( + T timer, + "{} handle {} peer messages {:?}", + self.fsm.peer.tag, + count, + PeerMsg::<EK>::VARIANTS.iter().zip(distribution).filter(|(_, c)| *c > 0).format(", "), + ); self.ctx.raft_metrics.peer_msg_len.observe(count as f64); self.ctx .raft_metrics .event_time .peer_msg - .observe(timer.saturating_elapsed_secs()); + .observe(timer.saturating_elapsed().as_secs_f64()); } #[inline] @@ -823,6 +858,8 @@ where target_index: self.fsm.peer.raft_group.raft.raft_log.last_index(), demote_after_exit: true, }); + } else { + self.fsm.peer.unsafe_recovery_state = Some(UnsafeRecoveryState::Failed); } } else { self.unsafe_recovery_demote_failed_voters(syncer, failed_voters); @@ -862,6 +899,8 @@ where target_index: self.fsm.peer.raft_group.raft.raft_log.last_index(), demote_after_exit: false, }); + } else { + self.fsm.peer.unsafe_recovery_state = Some(UnsafeRecoveryState::Failed); } } else { warn!( @@ -912,19 +951,28 @@ where self.fsm.peer.raft_group.raft.raft_log.committed }; - self.fsm.peer.unsafe_recovery_state = Some(UnsafeRecoveryState::WaitApply { - target_index, - syncer, - }); - self.fsm - .peer - .unsafe_recovery_maybe_finish_wait_apply(/* force= */ self.fsm.stopped); + if target_index > self.fsm.peer.raft_group.raft.raft_log.applied { + info!( + "Unsafe recovery, start wait apply"; + "region_id" => self.region().get_id(), + "peer_id" => self.fsm.peer_id(), + "target_index" => target_index, + "applied" => self.fsm.peer.raft_group.raft.raft_log.applied, + ); + self.fsm.peer.unsafe_recovery_state = Some(UnsafeRecoveryState::WaitApply { + target_index, + syncer, + }); + self.fsm + .peer + .unsafe_recovery_maybe_finish_wait_apply(/* force= */ self.fsm.stopped); + } } // This func is invoked first after BR assigns the leader, to wait for the leader to apply to the // last log index; it is invoked a second time to wait for followers to apply to the last // index. However, the second call is a broadcast; it may be improved in the future. - fn on_snapshot_recovery_wait_apply(&mut self, syncer: SnapshotRecoveryWaitApplySyncer) { + fn on_snapshot_br_wait_apply(&mut self, req: SnapshotBrWaitApplyRequest) { if let 
Some(state) = &self.fsm.peer.snapshot_recovery_state { warn!( "can't wait apply, another recovery in progress"; @@ -932,20 +980,47 @@ where "peer_id" => self.fsm.peer_id(), "state" => ?state, ); - syncer.abort(); + req.syncer.abort(AbortReason::Duplicated); return; } let target_index = self.fsm.peer.raft_group.raft.raft_log.last_index(); + let applied_index = self.fsm.peer.raft_group.raft.raft_log.applied; + let term = self.fsm.peer.raft_group.raft.term; + if let Some(e) = &req.expected_epoch { + if let Err(err) = compare_region_epoch(e, self.region(), true, true, true) { + warn!("epoch not match for wait apply, aborting."; "err" => %err, + "peer" => self.fsm.peer.peer_id(), + "region" => self.fsm.peer.region().get_id()); + let mut pberr = errorpb::Error::from(err); + req.syncer + .abort(AbortReason::EpochNotMatch(pberr.take_epoch_not_match())); + return; + } + } + + // Trivial case: no need to wait for apply -- it is already the latest. + // Return directly to avoid printing tons of logs. + if target_index == applied_index { + debug!( + "skip trivial case of waiting apply."; + "region_id" => self.region_id(), + "peer_id" => self.fsm.peer_id(), + "target_index" => target_index, + "applied_index" => applied_index, + ); + SNAP_BR_WAIT_APPLY_EVENT.trivial.inc(); + return; + } // During snapshot recovery, wait-apply is broadcast, so some peers may be stale if !self.fsm.peer.is_leader() { info!( - "snapshot follower recovery started"; + "snapshot follower wait apply started"; "region_id" => self.region_id(), "peer_id" => self.fsm.peer_id(), "target_index" => target_index, - "applied_index" => self.fsm.peer.raft_group.raft.raft_log.applied, + "applied_index" => applied_index, "pending_remove" => self.fsm.peer.pending_remove, "voter" => self.fsm.peer.raft_group.raft.vote, ); @@ -955,7 +1030,8 @@ where // case#2 if the peer is supposed to be removed if self.fsm.peer.raft_group.raft.vote == 0 || self.fsm.peer.pending_remove { info!( - "this peer is never vote before or pending remove, it should be skip to wait apply" + "this peer has never voted or is pending removal, skip waiting for apply"; + "region" => %self.region_id(), ); return; } @@ -965,13 +1041,15 @@ where "region_id" => self.region_id(), "peer_id" => self.fsm.peer_id(), "target_index" => target_index, - "applied_index" => self.fsm.peer.raft_group.raft.raft_log.applied, + "applied_index" => applied_index, ); } + SNAP_BR_WAIT_APPLY_EVENT.accepted.inc(); - self.fsm.peer.snapshot_recovery_state = Some(SnapshotRecoveryState::WaitLogApplyToLast { + self.fsm.peer.snapshot_recovery_state = Some(SnapshotBrState::WaitLogApplyToLast { target_index, - syncer, + valid_for_term: req.abort_when_term_change.then_some(term), + syncer: req.syncer, }); self.fsm .peer @@ -1041,15 +1119,22 @@ where } } - fn on_casual_msg(&mut self, msg: CasualMessage) { - match msg { + fn on_casual_msg(&mut self, msg: Box>) { + match *msg { CasualMessage::SplitRegion { region_epoch, split_keys, callback, source, + share_source_region_size, } => { - self.on_prepare_split_region(region_epoch, split_keys, callback, &source); + self.on_prepare_split_region( + region_epoch, + split_keys, + callback, + &source, + share_source_region_size, + ); } CasualMessage::ComputeHashResult { index, @@ -1058,11 +1143,11 @@ where } => { self.on_hash_computed(index, context, hash); } - CasualMessage::RegionApproximateSize { size } => { - self.on_approximate_region_size(size); + CasualMessage::RegionApproximateSize { size, splitable } => { + self.on_approximate_region_size(size, splitable); } - 
CasualMessage::RegionApproximateKeys { keys } => { - self.on_approximate_region_keys(keys); + CasualMessage::RegionApproximateKeys { keys, splitable } => { + self.on_approximate_region_keys(keys, splitable); } CasualMessage::RefreshRegionBuckets { region_epoch, @@ -1174,8 +1259,21 @@ where let raft_msg = self.fsm.peer.build_raft_messages(self.ctx, vec![msg]); self.fsm.peer.send_raft_messages(self.ctx, raft_msg); } - CasualMessage::SnapshotApplied => { + CasualMessage::SnapshotApplied { peer_id, tombstone } => { self.fsm.has_ready = true; + // If failed on applying snapshot, it should record the peer as an invalid peer. + if tombstone && self.fsm.peer.peer_id() == peer_id && !self.fsm.peer.is_leader() { + info!( + "mark the region damaged on applying snapshot"; + "region_id" => self.region_id(), + "peer_id" => peer_id, + ); + let mut meta = self.ctx.store_meta.lock().unwrap(); + meta.damaged_regions.insert(self.region_id()); + } + if self.fsm.peer.should_destroy_after_apply_snapshot() { + self.maybe_destroy(); + } } CasualMessage::Campaign => { let _ = self.fsm.peer.raft_group.campaign(); @@ -1340,9 +1438,7 @@ where } fn on_clear_region_size(&mut self) { - self.fsm.peer.approximate_size = None; - self.fsm.peer.approximate_keys = None; - self.fsm.peer.may_skip_split_check = false; + self.fsm.peer.split_check_trigger.on_clear_region_size(); self.register_split_region_check_tick(); } @@ -1385,8 +1481,8 @@ where ); } - fn on_significant_msg(&mut self, msg: SignificantMsg) { - match msg { + fn on_significant_msg(&mut self, msg: Box>) { + match *msg { SignificantMsg::SnapshotStatus { to_peer_id, status, .. } => { @@ -1458,7 +1554,7 @@ where } => { self.on_enter_pre_force_leader(syncer, failed_stores); } - SignificantMsg::ExitForceLeaderState => self.on_exit_force_leader(), + SignificantMsg::ExitForceLeaderState => self.on_exit_force_leader(false), SignificantMsg::UnsafeRecoveryDemoteFailedVoters { syncer, failed_voters, @@ -1473,9 +1569,7 @@ where self.on_unsafe_recovery_fill_out_report(syncer) } // for snapshot recovery (safe recovery) - SignificantMsg::SnapshotRecoveryWaitApply(syncer) => { - self.on_snapshot_recovery_wait_apply(syncer) - } + SignificantMsg::SnapshotBrWaitApply(syncer) => self.on_snapshot_br_wait_apply(syncer), SignificantMsg::CheckPendingAdmin(ch) => self.on_check_pending_admin(ch), } } @@ -1692,10 +1786,19 @@ where self.fsm.has_ready = true; } - fn on_exit_force_leader(&mut self) { + fn on_exit_force_leader(&mut self, force: bool) { if self.fsm.peer.force_leader.is_none() { return; } + if let Some(UnsafeRecoveryState::Failed) = self.fsm.peer.unsafe_recovery_state && !force { + // Skip force leader if the plan failed, so wait for the next retry of plan with force leader state holding + info!( + "skip exiting force leader state"; + "region_id" => self.fsm.region_id(), + "peer_id" => self.fsm.peer_id(), + ); + return; + } info!( "exit force leader state"; @@ -1704,7 +1807,7 @@ where ); self.fsm.peer.force_leader = None; // make sure it's not hibernated - assert_eq!(self.fsm.hibernate_state.group_state(), GroupState::Ordered); + assert_ne!(self.fsm.hibernate_state.group_state(), GroupState::Idle); // leader lease shouldn't be renewed in force leader state. 
assert_eq!( self.fsm.peer.leader_lease().inspect(None), @@ -1723,7 +1826,7 @@ // follower state let _ = self.ctx.router.send( self.region_id(), - PeerMsg::CasualMessage(CasualMessage::Campaign), + PeerMsg::CasualMessage(Box::new(CasualMessage::Campaign)), ); } self.fsm.has_ready = true; @@ -2105,11 +2208,27 @@ where self.fsm.hibernate_state.group_state() == GroupState::Idle, |_| {} ); + fail_point!( + "on_raft_base_tick_chaos", + self.fsm.hibernate_state.group_state() == GroupState::Chaos, + |_| {} + ); if self.fsm.peer.pending_remove { self.fsm.peer.mut_store().flush_entry_cache_metrics(); return; } + + // Update the state of whether the peer is pending on applying raft + // logs if necessary. + self.on_check_peer_complete_apply_logs(); + + // If the peer is busy on apply and missing the last leader committed index, + // it should propose a read index to check how far it lags behind the leader. + // It won't generate a flood of fetching messages: this proposal is only sent + // out while the peer has not yet received a response and updated the `last_leader_committed_index`. + self.try_to_fetch_committed_index(); + // When having pending snapshot, if election timeout is met, it can't pass // the pending conf change check because first index has been updated to // a value that is larger than last index. @@ -2266,13 +2385,16 @@ where } } // Destroy does not need to be processed, the state is cleaned up together with the peer. - Some(_) | None => {} + Some(UnsafeRecoveryState::Destroy { .. }) + | Some(UnsafeRecoveryState::Failed) + | Some(UnsafeRecoveryState::WaitInitialize(..)) + | None => {} } } - fn on_apply_res(&mut self, res: ApplyTaskRes) { + fn on_apply_res(&mut self, res: Box>) { fail_point!("on_apply_res", |_| {}); - match res { + match *res { ApplyTaskRes::Apply(mut res) => { debug!( "async apply finish"; @@ -2288,10 +2410,11 @@ where return; } let applied_index = res.apply_state.applied_index; - let buckets = self.fsm.peer.region_buckets.as_mut(); - if let (Some(delta), Some(buckets)) = (res.bucket_stat, buckets) { - buckets.merge(&delta); - } + self.fsm + .peer + .region_buckets_info_mut() + .add_bucket_flow(&res.bucket_stat); + self.fsm.has_ready |= self.fsm.peer.post_apply( self.ctx, res.apply_state, @@ -2439,8 +2562,8 @@ where } } - fn on_raft_message(&mut self, msg: InspectedRaftMessage) -> Result<()> { - let InspectedRaftMessage { heap_size, mut msg } = msg; + fn on_raft_message(&mut self, m: Box) -> Result<()> { + let InspectedRaftMessage { heap_size, mut msg } = *m; let peer_disk_usage = msg.disk_usage; let stepped = Cell::new(false); let memtrace_raft_entries = &mut self.fsm.peer.memtrace_raft_entries as *mut usize; @@ -2460,6 +2583,7 @@ where } }); + let is_initialized_peer = self.fsm.peer.is_initialized(); debug!( "handle raft message"; "region_id" => self.region_id(), @@ -2467,6 +2591,35 @@ where "message_type" => %util::MsgType(&msg), "from_peer_id" => msg.get_from_peer().get_id(), "to_peer_id" => msg.get_to_peer().get_id(), + "is_initialized_peer" => is_initialized_peer, + ); + + let msg_type = msg.get_message().get_msg_type(); + let fp_enable = |target_msg_type: MessageType| -> bool { + self.fsm.region_id() == 1000 + && self.store_id() == 2 + && !is_initialized_peer + && msg_type == target_msg_type + }; + fail_point!( + "on_snap_msg_1000_2", + fp_enable(MessageType::MsgSnapshot), + |_| Ok(()) + ); + fail_point!( + "on_vote_msg_1000_2", + fp_enable(MessageType::MsgRequestVote), + |_| Ok(()) + ); + fail_point!( + "on_append_msg_1000_2", + fp_enable(MessageType::MsgAppend), + |_| Ok(()) 
); + fail_point!( + "on_heartbeat_msg_1000_2", + fp_enable(MessageType::MsgHeartbeat), + |_| Ok(()) ); if self.fsm.peer.pending_remove || self.fsm.stopped { @@ -2475,7 +2628,6 @@ where self.handle_reported_disk_usage(&msg); - let msg_type = msg.get_message().get_msg_type(); if matches!(self.ctx.self_disk_usage, DiskUsage::AlreadyFull) && MessageType::MsgTimeoutNow == msg_type { @@ -2520,6 +2672,22 @@ where return Ok(()); } + // If this peer is restarting, it may lose some logs, so it should update + // the `last_leader_committed_idx` with the committed index of the first + // `MsgAppend` message or the committed index in `MsgReadIndexResp` it received + // from the leader. + if self.fsm.peer.needs_update_last_leader_committed_idx() + && (MessageType::MsgAppend == msg_type || MessageType::MsgReadIndexResp == msg_type) + { + let committed_index = cmp::max( + msg.get_message().get_commit(), // from MsgAppend + msg.get_message().get_index(), // from MsgReadIndexResp + ); + self.fsm + .peer + .update_last_leader_committed_idx(committed_index); + } + if msg.has_extra_msg() { self.on_extra_message(msg); return Ok(()); } @@ -2561,7 +2729,7 @@ where } else { // This can be a message that was sent when it was still a follower. Nevertheless, // it's meaningless to continue to handle the request as callbacks are cleared. - if msg.get_message().get_msg_type() == MessageType::MsgReadIndex + if msg_type == MessageType::MsgReadIndex && self.fsm.peer.is_leader() && (msg.get_message().get_from() == raft::INVALID_ID || msg.get_message().get_from() == self.fsm.peer_id()) @@ -2783,18 +2951,19 @@ where fn on_extra_message(&mut self, mut msg: RaftMessage) { match msg.get_extra_msg().get_type() { ExtraMessageType::MsgRegionWakeUp | ExtraMessageType::MsgCheckStalePeer => { - if self.fsm.hibernate_state.group_state() == GroupState::Idle { - if msg.get_extra_msg().forcely_awaken { - // Forcely awaken this region by manually setting this GroupState - // into Chaos to trigger a new voting in this RaftGroup. - self.reset_raft_tick(if !self.fsm.peer.is_leader() { - GroupState::Chaos - } else { - GroupState::Ordered - }); + if msg.get_extra_msg().forcely_awaken { + // Forcibly awaken this region by manually setting the GroupState + // to `Chaos` to trigger a new round of voting in the Raft Group. + // Meanwhile, it avoids the peer entering the `PreChaos` state, + // which would wait for another long tick to enter the `Chaos` state. + self.reset_raft_tick(if !self.fsm.peer.is_leader() { + GroupState::Chaos } else { - self.reset_raft_tick(GroupState::Ordered); - } + GroupState::Ordered + }); + } + if self.fsm.hibernate_state.group_state() == GroupState::Idle { + self.reset_raft_tick(GroupState::Ordered); } if msg.get_extra_msg().get_type() == ExtraMessageType::MsgRegionWakeUp && self.fsm.peer.is_leader() @@ -2849,6 +3018,11 @@ where } fn reset_raft_tick(&mut self, state: GroupState) { + debug!( + "reset raft tick to {:?}", state; + "region_id"=> self.fsm.region_id(), + "peer_id" => self.fsm.peer_id(), + ); self.fsm.reset_hibernate_state(state); self.fsm.missing_ticks = 0; self.fsm.peer.should_wake_up = false; @@ -2956,10 +3130,10 @@ where ); if self.handle_destroy_peer(job) { // It's not frequent, so use 0 as `heap_size` is ok. 
- let store_msg = StoreMsg::RaftMessage(InspectedRaftMessage { + let store_msg = StoreMsg::RaftMessage(Box::new(InspectedRaftMessage { heap_size: 0, msg: msg.clone(), - }); + })); if let Err(e) = self.ctx.router.send_control(store_msg) { info!( "failed to send back store message, are we shutting down?"; @@ -3069,11 +3243,12 @@ where return; } - if self.fsm.peer.peer != *msg.get_to_peer() { + if self.fsm.peer.peer.get_id() != msg.get_to_peer().get_id() { info!( "receive stale gc message, ignore."; "region_id" => self.fsm.region_id(), "peer_id" => self.fsm.peer_id(), + "to_peer_id" => msg.get_to_peer().get_id(), ); self.ctx.raft_metrics.message_dropped.stale_msg.inc(); return; @@ -3116,7 +3291,7 @@ where // No need to get snapshot for witness, as witness's empty snapshot bypass // snapshot manager. let key = SnapKey::from_region_snap(region_id, snap); - self.ctx.snap_mgr.get_snapshot_for_applying(&key)?; + self.ctx.snap_mgr.meta_file_exist(&key)?; Some(key) } else { None @@ -3189,7 +3364,24 @@ where } let mut meta = self.ctx.store_meta.lock().unwrap(); - if meta.regions[&self.region_id()] != *self.region() { + // Check if the region matches the metadata. A mismatch means another + // peer has replaced the current peer, which can happen during a split: a + // peer is first created via raft message, then replaced by another peer + // (of the same region) when the split is applied. + let region_mismatch = match meta.regions.get(&self.region_id()) { + Some(region) => *region != *self.region(), + None => { + // If the region doesn't exist, treat it as a mismatch. This can + // happen in rare situations (e.g. #17469). + warn!( + "region not found in meta"; + "region_id" => self.fsm.region_id(), + "peer_id" => self.fsm.peer_id(), + ); + true + } + }; + if region_mismatch { if !self.fsm.peer.is_initialized() { info!( "stale delegate detected, skip"; @@ -3202,7 +3394,7 @@ where panic!( "{} meta corrupted: {:?} != {:?}", self.fsm.peer.tag, - meta.regions[&self.region_id()], + meta.regions.get(&self.region_id()), self.region() ); } @@ -3288,7 +3480,7 @@ where // may has been merged/splitted already. let _ = self.ctx.router.force_send( exist_region.get_id(), - PeerMsg::CasualMessage(CasualMessage::RegionOverlapped), + PeerMsg::CasualMessage(Box::new(CasualMessage::RegionOverlapped)), ); } } @@ -3366,11 +3558,11 @@ where .router .force_send( source_region_id, - PeerMsg::SignificantMsg(SignificantMsg::MergeResult { + PeerMsg::SignificantMsg(Box::new(SignificantMsg::MergeResult { target_region_id: self.fsm.region_id(), target: self.fsm.peer.peer.clone(), result, - }), + })), ) .unwrap(); } @@ -3542,6 +3734,15 @@ where } } + fn maybe_destroy(&mut self) { + match self.fsm.peer.maybe_destroy(self.ctx) { + None => self.ctx.raft_metrics.message_dropped.applying_snap.inc(), + Some(job) => { + self.handle_destroy_peer(job); + } + } + } + /// Check if destroy can be executed immediately. If it can't, the reason is /// returned. 
fn maybe_delay_destroy(&mut self) -> Option { @@ -3589,9 +3790,10 @@ where ) .flush() .when_done(move || { - if let Err(e) = - mb.force_send(PeerMsg::SignificantMsg(SignificantMsg::RaftLogGcFlushed)) - { + fail_point!("destroy_region_before_gc_flush"); + if let Err(e) = mb.force_send(PeerMsg::SignificantMsg(Box::new( + SignificantMsg::RaftLogGcFlushed, + ))) { if tikv_util::thread_group::is_shutdown(!cfg!(test)) { return; } @@ -3600,6 +3802,7 @@ where region_id, peer_id, e ); } + fail_point!("destroy_region_after_gc_flush"); }); if let Err(e) = self.ctx.raftlog_gc_scheduler.schedule(task) { if tikv_util::thread_group::is_shutdown(!cfg!(test)) { @@ -3664,14 +3867,7 @@ where } let region_id = self.region_id(); - let is_initialized = self.fsm.peer.is_initialized(); - info!( - "starts destroy"; - "region_id" => region_id, - "peer_id" => self.fsm.peer_id(), - "merged_by_target" => merged_by_target, - "is_initialized" => is_initialized, - ); + let is_peer_initialized = self.fsm.peer.is_initialized(); // We can't destroy a peer which is handling snapshot. assert!(!self.fsm.peer.is_handling_snapshot()); @@ -3688,25 +3884,46 @@ where .snapshot_recovery_maybe_finish_wait_apply(/* force= */ true); } + (|| { + fail_point!( + "before_destroy_peer_on_peer_1003", + self.fsm.peer.peer_id() == 1003, + |_| {} + ); + })(); let mut meta = self.ctx.store_meta.lock().unwrap(); - let is_region_initialized_in_meta = meta - .regions - .get(&region_id) - .map_or(false, |region| is_region_initialized(region)); - if !is_initialized && is_region_initialized_in_meta { - let region_in_meta = meta.regions.get(&region_id).unwrap(); - error!( - "peer is destroyed inconsistently"; - "region_id" => region_id, + meta.damaged_regions.remove(&self.fsm.region_id()); + meta.damaged_regions.shrink_to_fit(); + let is_latest_initialized = { + if let Some(latest_region_info) = meta.regions.get(&region_id) { + util::is_region_initialized(latest_region_info) + } else { + false + } + }; + + if !is_peer_initialized && is_latest_initialized { + info!("skip destroy uninitialized peer as it's already initialized in meta"; + "region_id" => self.fsm.region_id(), "peer_id" => self.fsm.peer_id(), - "peers" => ?self.region().get_peers(), "merged_by_target" => merged_by_target, - "is_initialized" => is_initialized, - "is_region_initialized_in_meta" => is_region_initialized_in_meta, - "start_key_in_meta" => log_wrappers::Value::key(region_in_meta.get_start_key()), - "end_key_in_meta" => log_wrappers::Value::key(region_in_meta.get_end_key()), - "peers_in_meta" => ?region_in_meta.get_peers(), ); + return false; + } + + info!( + "starts destroy"; + "region_id" => self.fsm.region_id(), + "peer_id" => self.fsm.peer_id(), + "merged_by_target" => merged_by_target, + "is_peer_initialized" => is_peer_initialized, + "is_latest_initialized" => is_latest_initialized, + ); + + // Ensure this peer is removed from the pending apply list. 
+ meta.busy_apply_peers.remove(&self.fsm.peer_id()); + if let Some(count) = meta.completed_apply_peers_count.as_mut() { + *count += 1; } if meta.atomic_snap_regions.contains_key(&self.region_id()) { @@ -3764,7 +3981,7 @@ where self.ctx.router.close(region_id); self.fsm.stop(); - if is_initialized + if is_peer_initialized && !merged_by_target && meta .region_ranges @@ -3773,6 +3990,7 @@ { panic!("{} meta corruption detected", self.fsm.peer.tag); } + if meta.regions.remove(&region_id).is_none() && !merged_by_target { panic!("{} meta corruption detected", self.fsm.peer.tag) } @@ -3785,14 +4003,19 @@ where self.fsm.peer.tag ); } else { + // Remove itself from atomic_snap_regions as it has cleaned both + // data and metadata. let target_region_id = *meta.targets_map.get(&region_id).unwrap(); - let is_ready = meta - .atomic_snap_regions + meta.atomic_snap_regions .get_mut(&target_region_id) .unwrap() - .get_mut(&region_id) - .unwrap(); - *is_ready = true; + .remove(&region_id); + meta.destroyed_region_for_snap.remove(&region_id); + info!("peer has been destroyed, clean up for incoming overlapped snapshot"; + "region_id" => region_id, + "peer_id" => self.fsm.peer_id(), + "target_region_id" => target_region_id, + ); } } @@ -4032,6 +4255,7 @@ where derived: metapb::Region, regions: Vec, new_split_regions: HashMap, + share_source_region_size: bool, ) { fail_point!("on_split", self.ctx.store_id() == 3, |_| {}); @@ -4053,8 +4277,25 @@ where // Roughly estimate the size and keys for new regions. let new_region_count = regions.len() as u64; - let estimated_size = self.fsm.peer.approximate_size.map(|v| v / new_region_count); - let estimated_keys = self.fsm.peer.approximate_keys.map(|v| v / new_region_count); + let mut share_size = None; + let mut share_keys = None; + // If share_source_region_size is true, it means the new regions contain + // data from the origin region + if share_source_region_size { + share_size = self + .fsm + .peer + .split_check_trigger + .approximate_size + .map(|v| v / new_region_count); + share_keys = self + .fsm + .peer + .split_check_trigger + .approximate_keys + .map(|v| v / new_region_count); + } + let mut meta = self.ctx.store_meta.lock().unwrap(); meta.set_region( &self.ctx.coprocessor_host, @@ -4064,13 +4305,12 @@ ); self.fsm.peer.post_split(); - // It's not correct anymore, so set it to false to schedule a split check task. - self.fsm.peer.may_skip_split_check = false; - let is_leader = self.fsm.peer.is_leader(); if is_leader { - self.fsm.peer.approximate_size = estimated_size; - self.fsm.peer.approximate_keys = estimated_keys; + if share_source_region_size { + self.fsm.peer.split_check_trigger.approximate_size = share_size; + self.fsm.peer.split_check_trigger.approximate_keys = share_keys; + } self.fsm.peer.heartbeat_pd(self.ctx); // Notify pd immediately to let it update the region meta. info!( @@ -4098,7 +4338,6 @@ where if meta.region_ranges.remove(&last_key).is_none() { panic!("{} original region should exist", self.fsm.peer.tag); } - let last_region_id = regions.last().unwrap().get_id(); for (new_region, locks) in regions.into_iter().zip(region_locks) { let new_region_id = new_region.get_id(); @@ -4139,6 +4378,7 @@ where // Insert new regions and validation let mut is_uninitialized_peer_exist = false; + let self_store_id = self.ctx.store.get_id(); if let Some(r) = meta.regions.get(&new_region_id) { // Suppose a new node is added by conf change and the snapshot comes slowly. 
// Then, the region splits and the first vote message comes to the new node @@ -4160,6 +4400,7 @@ where "region_id" => new_region_id, "region" => ?new_region, "is_uninitialized_peer_exist" => is_uninitialized_peer_exist, + "store_id" => self_store_id, ); let (sender, mut new_peer) = match PeerFsm::create( @@ -4203,8 +4444,8 @@ where new_peer.has_ready |= campaigned; if is_leader { - new_peer.peer.approximate_size = estimated_size; - new_peer.peer.approximate_keys = estimated_keys; + new_peer.peer.split_check_trigger.approximate_size = share_size; + new_peer.peer.split_check_trigger.approximate_keys = share_keys; *new_peer.peer.txn_ext.pessimistic_locks.write() = locks; // The new peer is likely to become leader, send a heartbeat immediately to // reduce client query miss. @@ -4222,11 +4463,6 @@ where .insert(new_region_id, ReadDelegate::from_peer(new_peer.get_peer())); meta.region_read_progress .insert(new_region_id, new_peer.peer.read_progress.clone()); - if last_region_id == new_region_id { - // To prevent from big region, the right region needs run split - // check again after split. - new_peer.peer.size_diff_hint = self.ctx.cfg.region_split_check_diff().0; - } let mailbox = BasicMailbox::new(sender, new_peer, self.ctx.router.state_cnt().clone()); self.ctx.router.register(new_region_id, mailbox); self.ctx @@ -4239,7 +4475,10 @@ where .pending_msgs .swap_remove_front(|m| m.get_to_peer() == &meta_peer) { - let peer_msg = PeerMsg::RaftMessage(InspectedRaftMessage { heap_size: 0, msg }); + let peer_msg = PeerMsg::RaftMessage( + Box::new(InspectedRaftMessage { heap_size: 0, msg }), + Some(TiInstant::now()), + ); if let Err(e) = self.ctx.router.force_send(new_region_id, peer_msg) { warn!("handle first request failed"; "region_id" => region_id, "error" => ?e); } @@ -4405,6 +4644,9 @@ where fn schedule_merge(&mut self) -> Result<()> { fail_point!("on_schedule_merge", |_| Ok(())); + fail_point!("on_schedule_merge_ret_err", |_| Err(Error::RegionNotFound( + 1 + ))); let (request, target_id) = { let state = self.fsm.peer.pending_merge_state.as_ref().unwrap(); let expect_region = state.get_target(); @@ -4465,14 +4707,14 @@ .router .force_send( target_id, - PeerMsg::RaftCommand(RaftCommand::new_ext( + PeerMsg::RaftCommand(Box::new(RaftCommand::new_ext( request, Callback::None, RaftCmdExtraOpts { deadline: None, disk_full_opt: DiskFullOpt::AllowedOnAlmostFull, }, - )), + ))), ) .map_err(|_| Error::RegionNotFound(target_id)) } @@ -4528,6 +4770,17 @@ where "error_code" => %e.error_code(), ); self.rollback_merge(); + } else if let Some(ForceLeaderState::ForceLeader { .. }) = + &self.fsm.peer.force_leader + { + info!( + "failed to schedule merge, rollback in force leader state"; + "region_id" => self.fsm.region_id(), + "peer_id" => self.fsm.peer_id(), + "err" => %e, + "error_code" => %e.error_code(), + ); + self.rollback_merge(); } } else if !is_learner(&self.fsm.peer.peer) { info!( @@ -4554,6 +4807,7 @@ } fn on_ready_prepare_merge(&mut self, region: metapb::Region, state: MergeState) { + fail_point!("on_apply_res_prepare_merge"); { let mut meta = self.ctx.store_meta.lock().unwrap(); meta.set_region( @@ -4703,7 +4957,7 @@ where // make approximate size and keys updated in time. // the reason why followers need to update is that there is an issue where, after a // merge and then a leader transfer, the new leader may have stale size and keys. 
- self.fsm.peer.size_diff_hint = self.ctx.cfg.region_split_check_diff().0; + self.fsm.peer.split_check_trigger.reset_skip_check(); self.fsm.peer.reset_region_buckets(); if self.fsm.peer.is_leader() { info!( @@ -4717,11 +4971,11 @@ where } if let Err(e) = self.ctx.router.force_send( source.get_id(), - PeerMsg::SignificantMsg(SignificantMsg::MergeResult { + PeerMsg::SignificantMsg(Box::new(SignificantMsg::MergeResult { target_region_id: self.fsm.region_id(), target: self.fsm.peer.peer.clone(), result: MergeResultKind::FromTargetLog, - }), + })), ) { panic!( "{} failed to send merge result(FromTargetLog) to source region {}, err {}", @@ -4896,6 +5150,7 @@ where "region_id" => self.fsm.region_id(), "peer_id" => self.fsm.peer_id(), "region" => ?region, + "destroy_regions" => ?persist_res.destroy_regions, ); let mut state = self.ctx.global_replication_state.lock().unwrap(); @@ -4994,11 +5249,11 @@ where for r in &persist_res.destroy_regions { if let Err(e) = self.ctx.router.force_send( r.get_id(), - PeerMsg::SignificantMsg(SignificantMsg::MergeResult { + PeerMsg::SignificantMsg(Box::new(SignificantMsg::MergeResult { target_region_id: self.fsm.region_id(), target: self.fsm.peer.peer.clone(), result: MergeResultKind::FromTargetSnapshotStep2, - }), + })), ) { panic!( "{} failed to send merge result(FromTargetSnapshotStep2) to source region {}, err {}", @@ -5031,7 +5286,13 @@ where derived, regions, new_split_regions, - } => self.on_ready_split_region(derived, regions, new_split_regions), + share_source_region_size, + } => self.on_ready_split_region( + derived, + regions, + new_split_regions, + share_source_region_size, + ), ExecResult::PrepareMerge { region, state } => { self.on_ready_prepare_merge(region, state) } @@ -5158,6 +5419,14 @@ where &mut self, msg: &RaftCmdRequest, ) -> Result> { + // failpoint + fail_point!( + "fail_pre_propose_split", + msg.has_admin_request() + && msg.get_admin_request().get_cmd_type() == AdminCmdType::BatchSplit, + |_| Err(Error::Other(box_err!("fail_point"))) + ); + // Check store_id, make sure that the msg is dispatched to the right place. if let Err(e) = util::check_store_id(msg.get_header(), self.store_id()) { self.ctx @@ -5193,7 +5462,8 @@ where // error-prone if !(msg.has_admin_request() && (msg.get_admin_request().get_cmd_type() == AdminCmdType::ChangePeer - || msg.get_admin_request().get_cmd_type() == AdminCmdType::ChangePeerV2)) + || msg.get_admin_request().get_cmd_type() == AdminCmdType::ChangePeerV2 + || msg.get_admin_request().get_cmd_type() == AdminCmdType::RollbackMerge)) { return Err(Error::RecoveryInProgress(self.region_id())); } @@ -5381,7 +5651,10 @@ where return; } Err(e) => { - debug!( + // log for admin requests + let is_admin_request = msg.has_admin_request(); + info_or_debug!( + is_admin_request; "failed to propose"; "region_id" => self.region_id(), "peer_id" => self.fsm.peer_id(), @@ -5488,7 +5761,7 @@ where } fail_point!("on_raft_log_gc_tick_1", self.fsm.peer_id() == 1, |_| {}); fail_point!("on_raft_gc_log_tick", |_| {}); - debug_assert!(!self.fsm.stopped); + assert!(!self.fsm.stopped); // As leader, we would not keep caches for the peers that didn't response // heartbeat in the last few seconds. That happens probably because @@ -5749,14 +6022,16 @@ where // whether the region should split. // We assume that `may_skip_split_check` is only set true after the split check // task is scheduled. 
- if self.fsm.peer.may_skip_split_check && self.fsm.peer.compaction_declined_bytes < self.ctx.cfg.region_split_check_diff().0 && self.fsm.peer.size_diff_hint < self.ctx.cfg.region_split_check_diff().0 + if self + .fsm + .peer + .split_check_trigger + .should_skip(self.ctx.cfg.region_split_check_diff().0) { return; } - fail_point!("on_split_region_check_tick"); + fail_point!("on_split_region_check_tick", |_| {}); self.register_split_region_check_tick(); // To avoid frequent scan, we only add new scan tasks if all previous tasks @@ -5766,6 +6041,11 @@ where return; } + // Avoid running the check while the region is splitting. + if self.fsm.peer.is_splitting() { + return; + } + // When Lightning or BR is importing data to TiKV, their ingest-request may fail // because the region epoch does not match. So we hope TiKV does not check region size // or split regions during importing. @@ -5804,10 +6084,7 @@ where ); return; } - self.fsm.peer.size_diff_hint = 0; - self.fsm.peer.compaction_declined_bytes = 0; - // the task is scheduled, next tick may skip it. - self.fsm.peer.may_skip_split_check = true; + self.fsm.peer.split_check_trigger.post_triggered(); } fn on_prepare_split_region( @@ -5816,6 +6093,7 @@ where split_keys: Vec>, cb: Callback, source: &str, + share_source_region_size: bool, ) { info!( "on split"; @@ -5861,6 +6139,7 @@ split_keys, peer: self.fsm.peer.peer.clone(), right_derive: self.ctx.cfg.right_derive_when_split, + share_source_region_size, callback: cb, }; if let Err(ScheduleError::Stopped(t)) = self.ctx.pd_scheduler.schedule(task) { @@ -5881,15 +6160,21 @@ } } - fn on_approximate_region_size(&mut self, size: u64) { - self.fsm.peer.approximate_size = Some(size); + fn on_approximate_region_size(&mut self, size: Option, splitable: Option) { + self.fsm + .peer + .split_check_trigger + .on_approximate_region_size(size, splitable); self.register_split_region_check_tick(); self.register_pd_heartbeat_tick(); fail_point!("on_approximate_region_size"); } - fn on_approximate_region_keys(&mut self, keys: u64) { - self.fsm.peer.approximate_keys = Some(keys); + fn on_approximate_region_keys(&mut self, keys: Option, splitable: Option) { + self.fsm + .peer + .split_check_trigger + .on_approximate_region_keys(keys, splitable); self.register_split_region_check_tick(); self.register_pd_heartbeat_tick(); } @@ -5897,7 +6182,7 @@ fn on_refresh_region_buckets( &mut self, region_epoch: RegionEpoch, - mut buckets: Vec, + buckets: Vec, bucket_ranges: Option>, _cb: Callback, ) { @@ -5912,27 +6197,6 @@ where } }; - // bucket version layout - // term logical counter - // |-----------|-----------| - // high bits low bits - // term: given 10s election timeout, the 32 bit means 1362 year running time - let gen_bucket_version = |term, current_version| { - let current_version_term = current_version >> 32; - let bucket_version: u64 = if current_version_term == term { - current_version + 1 - } else { - if term > u32::MAX.into() { - error!( - "unexpected term {} more than u32::MAX. 
Bucket version will be backward.", term ); } term << 32 }; bucket_version }; - let region = self.fsm.peer.region(); if util::is_epoch_stale(&region_epoch, region.get_region_epoch()) { info!( @@ -5946,14 +6210,14 @@ // test purpose #[cfg(any(test, feature = "testexport"))] { - let default_buckets = BucketStat::default(); test_only_callback( _cb, self.fsm .peer - .region_buckets + .region_buckets_info() + .bucket_stat() .as_ref() - .unwrap_or(&default_buckets) + .unwrap() .meta .clone(), ); @@ -5961,108 +6225,54 @@ return; } - let mut current_version = self + let current_version = self.fsm.peer.region_buckets_info().version(); + let next_bucket_version = util::gen_bucket_version(self.fsm.peer.term(), current_version); + let region = self.region().clone(); + let change_bucket_version = self + .fsm + .peer + .region_buckets_info_mut() + .on_refresh_region_buckets( + &self.ctx.coprocessor_host.cfg, + next_bucket_version, + buckets, + region_epoch, + &region, + bucket_ranges, + ); + let region_buckets = self .fsm .peer - .region_buckets + .region_buckets_info() + .bucket_stat() .as_ref() - .map(|b| b.meta.version) - .unwrap_or_default(); - if current_version == 0 { - current_version = self - .fsm - .peer - .last_region_buckets - .as_ref() - .map(|b| b.meta.version) - .unwrap_or_default(); - } - let mut region_buckets: BucketStat; - if let Some(bucket_ranges) = bucket_ranges { - assert_eq!(buckets.len(), bucket_ranges.len()); - let mut i = 0; - region_buckets = self.fsm.peer.region_buckets.clone().unwrap(); - let mut meta = (*region_buckets.meta).clone(); - if !buckets.is_empty() { - meta.version = gen_bucket_version(self.fsm.peer.term(), current_version); - } - meta.region_epoch = region_epoch; - for (bucket, bucket_range) in buckets.into_iter().zip(bucket_ranges) { - while i < meta.keys.len() && meta.keys[i] != bucket_range.0 { - i += 1; - } - assert!(i != meta.keys.len()); - // the bucket size is small and does not have split keys, - // then it should be merged with its left neighbor - let region_bucket_merge_size = - self.ctx.coprocessor_host.cfg.region_bucket_merge_size_ratio * (self.ctx.coprocessor_host.cfg.region_bucket_size.0 as f64); - if bucket.keys.is_empty() && bucket.size <= (region_bucket_merge_size as u64) { - meta.sizes[i] = bucket.size; - // i is not the last entry (which is end key) - assert!(i < meta.keys.len() - 1); - // the region has more than one bucket - // and the left neighbor + current bucket size is not very big - if meta.keys.len() > 2 - && i != 0 - && meta.sizes[i - 1] + bucket.size < self.ctx.coprocessor_host.cfg.region_bucket_size.0 * 2 - { - // bucket is too small - region_buckets.left_merge(i); - meta.left_merge(i); - continue; - } - } else { - // update size - meta.sizes[i] = bucket.size / (bucket.keys.len() + 1) as u64; - // insert new bucket keys (split the original bucket) - for bucket_key in bucket.keys { - i += 1; - region_buckets.split(i); - meta.split(i, bucket_key); - } - } - i += 1; - } - region_buckets.meta = Arc::new(meta); - } else { - debug!( - "refresh_region_buckets re-generates buckets"; + .unwrap() + .clone(); + let buckets_count = region_buckets.meta.keys.len() - 1; + if change_bucket_version { + // TODO: we may need to make it debug once the coprocessor timeout is resolved. 
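// For reference, a sketch of the logic that moved into
// `util::gen_bucket_version`, reconstructed from the closure removed above
// (the exact signature in `util` may differ). The bucket version packs the
// term into the high 32 bits and a logical counter into the low 32 bits.
fn gen_bucket_version_sketch(term: u64, current_version: u64) -> u64 {
    let current_version_term = current_version >> 32;
    if current_version_term == term {
        // Same term: bump the logical counter in the low bits.
        current_version + 1
    } else {
        // New term: restart the counter. Given a 10s election timeout, the
        // 32-bit term field covers over a thousand years of running time,
        // though a term above u32::MAX would make the version go backward.
        term << 32
    }
}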
+ info!( + "finished on_refresh_region_buckets"; "region_id" => self.fsm.region_id(), + "buckets_count" => buckets_count, + "buckets_size" => ?region_buckets.meta.sizes, ); - assert_eq!(buckets.len(), 1); - let bucket_keys = buckets.pop().unwrap().keys; - let bucket_count = bucket_keys.len() + 1; - - let mut meta = BucketMeta { - region_id: self.fsm.region_id(), - region_epoch, - version: gen_bucket_version(self.fsm.peer.term(), current_version), - keys: bucket_keys, - sizes: vec![self.ctx.coprocessor_host.cfg.region_bucket_size.0; bucket_count], - }; - meta.keys.insert(0, region.get_start_key().to_vec()); - meta.keys.push(region.get_end_key().to_vec()); - region_buckets = BucketStat::from_meta(Arc::new(meta)); + } else { + // It means the bucket key ranges have not changed, so there is no need to refresh. + #[cfg(any(test, feature = "testexport"))] + test_only_callback(_cb, region_buckets.meta); + return; } - - let buckets_count = region_buckets.meta.keys.len() - 1; self.ctx.coprocessor_host.on_region_changed( - region, + self.region(), RegionChangeEvent::UpdateBuckets(buckets_count), self.fsm.peer.get_role(), ); let keys = region_buckets.meta.keys.clone(); - let old_region_buckets: Option = self.fsm.peer.region_buckets.replace(region_buckets); - self.fsm.peer.last_region_buckets = old_region_buckets; + let version = region_buckets.meta.version; let mut store_meta = self.ctx.store_meta.lock().unwrap(); - let version = self.fsm.peer.region_buckets.as_ref().unwrap().meta.version; if let Some(reader) = store_meta.readers.get_mut(&self.fsm.region_id()) { - reader.update(ReadProgress::region_buckets( - self.fsm.peer.region_buckets.as_ref().unwrap().meta.clone(), - )); + reader.update(ReadProgress::region_buckets(region_buckets.meta.clone())); } // Notify followers to refresh their buckets version @@ -6083,19 +6293,9 @@ .send_extra_message(extra_msg, &mut self.ctx.trans, &p); } } - - debug!( - "finished on_refresh_region_buckets"; - "region_id" => self.fsm.region_id(), - "buckets_count" => buckets_count, - "buckets_size" => ?self.fsm.peer.region_buckets.as_ref().unwrap().meta.sizes, - ); // test purpose #[cfg(any(test, feature = "testexport"))] - test_only_callback( - _cb, - self.fsm.peer.region_buckets.as_ref().unwrap().meta.clone(), - ); + test_only_callback(_cb, region_buckets.meta); } pub fn on_msg_refresh_buckets(&mut self, msg: RaftMessage) { @@ -6122,8 +6322,10 @@ } fn on_compaction_declined_bytes(&mut self, declined_bytes: u64) { - self.fsm.peer.compaction_declined_bytes += declined_bytes; - if self.fsm.peer.compaction_declined_bytes >= self.ctx.cfg.region_split_check_diff().0 { + self.fsm.peer.split_check_trigger.compaction_declined_bytes += declined_bytes; + if self.fsm.peer.split_check_trigger.compaction_declined_bytes + >= self.ctx.cfg.region_split_check_diff().0 + { UPDATE_REGION_SIZE_BY_COMPACTION_COUNTER.inc(); } self.register_split_region_check_tick(); @@ -6134,50 +6336,11 @@ if !self.ctx.coprocessor_host.cfg.enable_region_bucket() { return None; } - let region_buckets = self.fsm.peer.region_buckets.as_ref()?; - let stats = &region_buckets.stats; - let keys = &region_buckets.meta.keys; - - let empty_last_keys = vec![]; - let empty_last_stats = metapb::BucketStats::default(); - let (last_keys, last_stats, stats_reset) = self - .fsm + let region_bucket_max_size = self.ctx.coprocessor_host.cfg.region_bucket_size.0 * 2; + self.fsm .peer - .last_region_buckets - .as_ref() - .map(|b| { - ( - &b.meta.keys, - &b.stats, - region_buckets.create_time != b.create_time, - ) - }) 
.unwrap_or((&empty_last_keys, &empty_last_stats, false)); - - let mut bucket_ranges = vec![]; - let mut j = 0; - assert_eq!(keys.len(), stats.write_bytes.len() + 1); - for i in 0..stats.write_bytes.len() { - let mut diff_in_bytes = stats.write_bytes[i]; - while j < last_keys.len() && keys[i] > last_keys[j] { - j += 1; - } - if j < last_keys.len() && keys[i] == last_keys[j] { - if !stats_reset { - diff_in_bytes -= last_stats.write_bytes[j]; - } - j += 1; - } - - // if the bucket's write_bytes exceed half of the configured region_bucket_size, - // add it to the bucket_ranges for checking update - let bucket_update_diff_size_threshold = - self.ctx.coprocessor_host.cfg.region_bucket_size.0 / 2; - if diff_in_bytes >= bucket_update_diff_size_threshold { - bucket_ranges.push(BucketRange(keys[i].clone(), keys[i + 1].clone())); - } - } - Some(bucket_ranges) + .region_buckets_info() + .gen_bucket_range_for_update(region_bucket_max_size) } fn on_schedule_half_split_region( @@ -6237,6 +6400,12 @@ where cb(peer_stat); } } + + // only check the suspect buckets, not split region. + if source == "bucket" { + return; + } + let task = SplitCheckTask::split_check_key_range( region.clone(), start_key, @@ -6323,13 +6492,6 @@ where return; } - if let Some(ForceLeaderState::ForceLeader { time, .. }) = self.fsm.peer.force_leader { - // Clean up the force leader state after a timeout, since the PD recovery - // process may have been aborted for some reasons. - if time.saturating_elapsed() > UNSAFE_RECOVERY_STATE_TIMEOUT { - self.on_exit_force_leader(); - } - } if let Some(state) = &mut self.fsm.peer.unsafe_recovery_state { let unsafe_recovery_state_timeout_failpoint = || -> bool { fail_point!("unsafe_recovery_state_timeout", |_| true); @@ -6342,6 +6504,15 @@ where { info!("timeout, abort unsafe recovery"; "state" => ?state); state.abort(); + self.fsm.peer.unsafe_recovery_state = None; + } + } + + if let Some(ForceLeaderState::ForceLeader { time, .. }) = self.fsm.peer.force_leader { + // Clean up the force leader state after a timeout, since the PD recovery + // process may have been aborted for some reasons. + if time.saturating_elapsed() > UNSAFE_RECOVERY_STATE_TIMEOUT { + self.on_exit_force_leader(true); } } @@ -6392,19 +6563,26 @@ where fail_point!("peer_check_stale_state", state != StaleState::Valid, |_| {}); match state { StaleState::Valid => (), - StaleState::LeaderMissing => { - warn!( - "leader missing longer than abnormal_leader_missing_duration"; - "region_id" => self.fsm.region_id(), - "peer_id" => self.fsm.peer_id(), - "expect" => %self.ctx.cfg.abnormal_leader_missing_duration, - ); - self.ctx - .raft_metrics - .leader_missing - .lock() - .unwrap() - .insert(self.region_id()); + StaleState::LeaderMissing | StaleState::MaybeLeaderMissing => { + if state == StaleState::LeaderMissing { + warn!( + "leader missing longer than abnormal_leader_missing_duration"; + "region_id" => self.fsm.region_id(), + "peer_id" => self.fsm.peer_id(), + "expect" => %self.ctx.cfg.abnormal_leader_missing_duration, + ); + self.ctx + .raft_metrics + .leader_missing + .lock() + .unwrap() + .insert(self.region_id()); + } + + // It's very likely that this is a stale peer. To prevent + // resolved ts from being blocked for too long, we check stale + // peer eagerly. 
+ self.fsm.peer.bcast_check_stale_peer_message(self.ctx); } StaleState::ToValidate => { // for peer B in case 1 above @@ -6471,7 +6649,7 @@ where fn on_report_region_buckets_tick(&mut self) { if !self.fsm.peer.is_leader() - || self.fsm.peer.region_buckets.is_none() + || self.fsm.peer.region_buckets_info().bucket_stat().is_none() || self.fsm.hibernate_state.group_state() == GroupState::Idle { return; @@ -6479,11 +6657,11 @@ where let region_id = self.region_id(); let peer_id = self.fsm.peer_id(); - let region_buckets = self.fsm.peer.region_buckets.as_mut().unwrap(); + let region_buckets = self.fsm.peer.region_buckets_info_mut().report_bucket_stat(); if let Err(e) = self .ctx .pd_scheduler - .schedule(PdTask::ReportBuckets(region_buckets.clone())) + .schedule(PdTask::ReportBuckets(region_buckets)) { error!( "failed to report region buckets"; @@ -6492,8 +6670,6 @@ where "err" => ?e, ); } - // todo: it will delete in next pr. - region_buckets.stats = new_bucket_stats(&region_buckets.meta); self.register_report_region_buckets_tick(); } @@ -6501,6 +6677,121 @@ where fn register_report_region_buckets_tick(&mut self) { self.schedule_tick(PeerTick::ReportBuckets) } + + /// Check whether the peer should send a request to fetch the committed + /// index from the leader. + fn try_to_fetch_committed_index(&mut self) { + // Already completed, skip. + if !self.fsm.peer.needs_update_last_leader_committed_idx() || self.fsm.peer.is_leader() { + return; + } + // Construct a MsgReadIndex message and send it to the leader to + // fetch the latest committed index of this raft group. + let leader_id = self.fsm.peer.leader_id(); + if leader_id == raft::INVALID_ID { + // The leader is unknown, so we can't fetch the committed index. + return; + } + let rctx = ReadIndexContext { + id: uuid::Uuid::new_v4(), + request: None, + locked: None, + }; + self.fsm.peer.raft_group.read_index(rctx.to_bytes()); + debug!( + "try to fetch committed index from leader"; + "region_id" => self.region_id(), + "peer_id" => self.fsm.peer_id() + ); + } + + /// Check whether the peer is pending on applying raft logs. + /// + /// If busy, the peer will be recorded until the pending logs are + /// applied. Once it finishes applying, it will be removed from + /// the recording list. + fn on_check_peer_complete_apply_logs(&mut self) { + // Already completed, skip. + if self.fsm.peer.busy_on_apply.is_none() { + return; + } + + let peer_id = self.fsm.peer.peer_id(); + // No need to check the applying state if the peer is leader. + if self.fsm.peer.is_leader() { + self.fsm.peer.busy_on_apply = None; + // Clear it from the recording list and update the counter, to avoid + // missing it when the peer is changed to leader. + let mut meta = self.ctx.store_meta.lock().unwrap(); + meta.busy_apply_peers.remove(&peer_id); + if let Some(count) = meta.completed_apply_peers_count.as_mut() { + *count += 1; + } + return; + } + + let applied_idx = self.fsm.peer.get_store().applied_index(); + let mut last_idx = self.fsm.peer.get_store().last_index(); + // If the peer is newly added or created, no need to check the apply status. + if last_idx <= RAFT_INIT_LOG_INDEX { + self.fsm.peer.busy_on_apply = None; + // And it should be recorded in the `completed_apply_peers_count`.
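The checks that continue below reduce to simple index arithmetic. A standalone sketch of the rule, assuming `RAFT_INIT_LOG_INDEX` is the 5 used by raftstore and with `max_log_lag` standing in for `leader_transfer_max_log_lag`:

// Sketch of the busy-on-apply rule: a follower counts as busy while the
// gap between the leader's committed index and its own applied index is
// at least `max_log_lag`. Illustrative only.
const RAFT_INIT_LOG_INDEX: u64 = 5;

fn is_busy_on_apply(
    applied_idx: u64,
    last_idx: u64,
    leader_committed_idx: Option<u64>,
    max_log_lag: u64,
) -> bool {
    if last_idx <= RAFT_INIT_LOG_INDEX {
        // Newly created or added peer: nothing meaningful to apply yet.
        return false;
    }
    // While the leader's committed index is unknown, treat it as u64::MAX
    // so the peer stays busy until the index is fetched.
    let target = leader_committed_idx.unwrap_or(u64::MAX);
    target >= applied_idx + max_log_lag
}

fn main() {
    // Restarted follower, leader commit index not fetched yet: busy.
    assert!(is_busy_on_apply(100, 200, None, 128));
    // Caught-up peer with a small lag: not busy.
    assert!(!is_busy_on_apply(200, 210, Some(205), 128));
}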
+ let mut meta = self.ctx.store_meta.lock().unwrap(); + meta.busy_apply_peers.remove(&peer_id); + if let Some(count) = meta.completed_apply_peers_count.as_mut() { + *count += 1; + } + debug!( + "no need to check initialized peer"; + "last_commit_idx" => last_idx, + "last_applied_idx" => applied_idx, + "region_id" => self.fsm.region_id(), + "peer_id" => peer_id, + ); + return; + } + assert!(self.fsm.peer.busy_on_apply.is_some()); + + // This peer was restarted and the last leader committed index is not set, + // so use `u64::MAX` as the last commit index to keep it waiting until + // `last_leader_committed_idx` has been updated. + last_idx = self.fsm.peer.last_leader_committed_idx.unwrap_or(u64::MAX); + + // If the peer has large unapplied logs, this peer should be recorded until + // the lag is less than the given threshold. + if last_idx >= applied_idx + self.ctx.cfg.leader_transfer_max_log_lag { + if !self.fsm.peer.busy_on_apply.unwrap() { + let mut meta = self.ctx.store_meta.lock().unwrap(); + meta.busy_apply_peers.insert(peer_id); + } + self.fsm.peer.busy_on_apply = Some(true); + debug!( + "peer is busy on applying logs"; + "last_commit_idx" => last_idx, + "last_applied_idx" => applied_idx, + "region_id" => self.fsm.region_id(), + "peer_id" => peer_id, + ); + } else { + // Already finished applying; remove it from the recording list. + { + let mut meta = self.ctx.store_meta.lock().unwrap(); + meta.busy_apply_peers.remove(&peer_id); + if let Some(count) = meta.completed_apply_peers_count.as_mut() { + *count += 1; + } + } + debug!( + "peer completes applying logs"; + "last_commit_idx" => last_idx, + "last_applied_idx" => applied_idx, + "region_id" => self.fsm.region_id(), + "peer_id" => peer_id, + ); + self.fsm.peer.busy_on_apply = None; + } + } } impl<'a, EK, ER, T: Transport> PeerFsmDelegate<'a, EK, ER, T> @@ -6562,17 +6853,14 @@ where size += sst.total_bytes; keys += sst.total_kvs; } - self.fsm.peer.approximate_size = - Some(self.fsm.peer.approximate_size.unwrap_or_default() + size); - self.fsm.peer.approximate_keys = - Some(self.fsm.peer.approximate_keys.unwrap_or_default() + keys); + self.fsm + .peer + .split_check_trigger + .on_ingest_sst_result(size, keys); - if let Some(buckets) = &mut self.fsm.peer.region_buckets { + if let Some(buckets) = &mut self.fsm.peer.region_buckets_info_mut().bucket_stat_mut() { buckets.ingest_sst(keys, size); } - // The ingested file may be overlapped with the data in engine, so we need to - // check it again to get the accurate value.
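The removed bookkeeping now lives in `SplitCheckTrigger::on_ingest_sst_result`, defined later in this patch. A reduced stand-in for that type, showing how an ingest bumps the approximate stats and forces the next split check:

// Sketch: a simplified SplitCheckTrigger absorbing an ingest result.
// Field names follow the new type; this is not the real implementation.
#[derive(Default)]
struct Trigger {
    approximate_size: Option<u64>,
    approximate_keys: Option<u64>,
    may_skip_split_check: bool,
}

impl Trigger {
    fn on_ingest_sst_result(&mut self, size: u64, keys: u64) {
        self.approximate_size = Some(self.approximate_size.unwrap_or_default() + size);
        self.approximate_keys = Some(self.approximate_keys.unwrap_or_default() + keys);
        // Ingested files may overlap existing data, so force a re-check.
        self.may_skip_split_check = false;
    }
}

fn main() {
    let mut t = Trigger { may_skip_split_check: true, ..Default::default() };
    t.on_ingest_sst_result(64 << 20, 100_000);
    assert_eq!(t.approximate_size, Some(64 << 20));
    assert!(!t.may_skip_split_check);
}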
- self.fsm.peer.may_skip_split_check = false; if self.fsm.peer.is_leader() { self.on_pd_heartbeat_tick(); self.register_split_region_check_tick(); @@ -7073,4 +7361,39 @@ mod tests { assert!(flag.load(Ordering::Acquire)); } } + + #[test] + fn test_batch_raft_cmd_request_builder_size_limit() { + let mut cfg = Config::default(); + cfg.raft_entry_max_size = ReadableSize::gb(1); + let mut q = Request::default(); + let mut builder = BatchRaftCmdRequestBuilder::::new(); + + let mut req = RaftCmdRequest::default(); + let mut put = PutRequest::default(); + put.set_key(b"aaaa".to_vec()); + let val = (0..200_000).map(|_| 0).collect_vec(); + put.set_value(val); + q.set_cmd_type(CmdType::Put); + q.set_put(put); + req.mut_requests().push(q.clone()); + let _ = q.take_put(); + let req_size = req.compute_size(); + assert!(builder.can_batch(&cfg, &req, req_size)); + let cb = Callback::write_ext(Box::new(move |_| {}), None, None); + let cmd = RaftCommand::new(req.clone(), cb); + builder.add(cmd, req_size); + + let mut req = RaftCmdRequest::default(); + let mut put = PutRequest::default(); + put.set_key(b"aaaa".to_vec()); + let val = (0..900_000).map(|_| 0).collect_vec(); + put.set_value(val); + q.set_cmd_type(CmdType::Put); + q.set_put(put); + req.mut_requests().push(q.clone()); + let _ = q.take_put(); + let req_size = req.compute_size(); + assert!(!builder.can_batch(&cfg, &req, req_size)); + } } diff --git a/components/raftstore/src/store/fsm/store.rs b/components/raftstore/src/store/fsm/store.rs index c21ea65a589..deecbdd0e02 100644 --- a/components/raftstore/src/store/fsm/store.rs +++ b/components/raftstore/src/store/fsm/store.rs @@ -14,7 +14,7 @@ use std::{ atomic::{AtomicU64, Ordering}, Arc, Mutex, }, - time::{Duration, Instant}, + time::{Duration, Instant, SystemTime}, u64, }; @@ -34,9 +34,9 @@ use fail::fail_point; use file_system::{IoType, WithIoType}; use futures::{compat::Future01CompatExt, FutureExt}; use grpcio_health::HealthService; +use itertools::Itertools; use keys::{self, data_end_key, data_key, enc_end_key, enc_start_key}; use kvproto::{ - import_sstpb::{SstMeta, SwitchMode}, metapb::{self, Region, RegionEpoch}, pdpb::{self, QueryStats, StoreStats}, raft_cmdpb::{AdminCmdType, AdminRequest}, @@ -50,6 +50,7 @@ use resource_control::{channel::unbounded, ResourceGroupManager}; use resource_metering::CollectorRegHandle; use service::service_manager::GrpcServiceManager; use sst_importer::SstImporter; +use strum::{EnumCount, VariantNames}; use tikv_alloc::trace::TraceEvent; use tikv_util::{ box_try, @@ -62,12 +63,12 @@ use tikv_util::{ store::{find_peer, region_on_stores}, sys as sys_util, sys::disk::{get_disk_status, DiskUsage}, - time::{duration_to_sec, monotonic_raw_now, Instant as TiInstant}, + time::{duration_to_sec, monotonic_raw_now, Instant as TiInstant, SlowTimer}, timer::SteadyTimer, warn, worker::{LazyWorker, Scheduler, Worker}, yatp_pool::FuturePool, - Either, RingQueue, + Either, InspectFactor, RingQueue, }; use time::{self, Timespec}; @@ -101,9 +102,9 @@ use crate::{ worker::{ AutoSplitController, CleanupRunner, CleanupSstRunner, CleanupSstTask, CleanupTask, CompactRunner, CompactTask, ConsistencyCheckRunner, ConsistencyCheckTask, - GcSnapshotRunner, GcSnapshotTask, PdRunner, RaftlogGcRunner, RaftlogGcTask, - ReadDelegate, RefreshConfigRunner, RefreshConfigTask, RegionRunner, RegionTask, - SplitCheckTask, + DiskCheckRunner, DiskCheckTask, GcSnapshotRunner, GcSnapshotTask, PdRunner, + RaftlogGcRunner, RaftlogGcTask, ReadDelegate, RefreshConfigRunner, RefreshConfigTask, + RegionRunner, 
RegionTask, SplitCheckTask, }, Callback, CasualMessage, CompactThreshold, GlobalReplicationState, InspectedRaftMessage, MergeResultKind, PdTask, PeerMsg, PeerTick, RaftCommand, SignificantMsg, SnapManager, @@ -118,6 +119,15 @@ pub const PENDING_MSG_CAP: usize = 100; pub const ENTRY_CACHE_EVICT_TICK_DURATION: Duration = Duration::from_secs(1); pub const MULTI_FILES_SNAPSHOT_FEATURE: Feature = Feature::require(6, 1, 0); // it only makes sense for large region +// When the store is started, it will take some time to apply pending +// snapshots and delayed raft logs. Before the store is ready, it will report +// `is_busy` to PD, so PD will not schedule operators to the store. +const STORE_CHECK_PENDING_APPLY_DURATION: Duration = Duration::from_secs(5 * 60); +// The minimal percentage of regions that have finished applying pending logs. +// Only when the count of regions which have finished applying logs exceeds +// the threshold can the raftstore provide service. +const STORE_CHECK_COMPLETE_APPLY_REGIONS_PERCENT: u64 = 99; + pub struct StoreInfo { pub kv_engine: EK, pub raft_engine: ER, @@ -170,6 +180,22 @@ pub struct StoreMeta { pub region_read_progress: RegionReadProgressRegistry, /// record sst_file_name -> (sst_smallest_key, sst_largest_key) pub damaged_ranges: HashMap, Vec)>, + /// Record regions that are damaged in some corner cases; the related peer + /// must be safely removed from the store, e.g. when applying a snapshot or + /// compacting raft logs. + pub damaged_regions: HashSet, + /// Record peers that are busy applying logs + /// (applied_index <= last_idx - leader_transfer_max_log_lag). + /// `busy_apply_peers` and `completed_apply_peers_count` are used + /// to record the accurate counts of busy apply peers and of peers that + /// have completed applying logs. + pub busy_apply_peers: HashSet, + /// Record the number of peers that are done applying logs. + /// Without `completed_apply_peers_count`, it's hard to know whether all + /// peers are ready for applying logs. + /// If None, it means the store started from empty and there is no need to + /// check or update it anymore.
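Taken together, these fields form a small bookkeeping state machine. A sketch of the intended lifecycle, using a plain struct in place of `StoreMeta`:

// Sketch of the busy-apply bookkeeping lifecycle. `None` means the check is
// permanently disabled for this store process; illustrative only.
use std::collections::HashSet;

#[derive(Default)]
struct ApplyBookkeeping {
    busy_apply_peers: HashSet<u64>,
    completed_apply_peers_count: Option<u64>,
}

impl ApplyBookkeeping {
    fn peer_finished(&mut self, peer_id: u64) {
        self.busy_apply_peers.remove(&peer_id);
        if let Some(count) = self.completed_apply_peers_count.as_mut() {
            *count += 1;
        }
    }
    fn check_passed(&mut self) {
        // Once the store has reported not-busy, stop tracking entirely.
        self.completed_apply_peers_count = None;
        self.busy_apply_peers.clear();
    }
}

fn main() {
    let mut b = ApplyBookkeeping { completed_apply_peers_count: Some(0), ..Default::default() };
    b.busy_apply_peers.insert(7);
    b.peer_finished(7);
    assert_eq!(b.completed_apply_peers_count, Some(1));
    b.check_passed();
    assert_eq!(b.completed_apply_peers_count, None);
}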
+ pub completed_apply_peers_count: Option, } impl StoreRegionMeta for StoreMeta { @@ -220,6 +246,9 @@ impl StoreMeta { destroyed_region_for_snap: HashMap::default(), region_read_progress: RegionReadProgressRegistry::new(), damaged_ranges: HashMap::default(), + damaged_regions: HashSet::default(), + busy_apply_peers: HashSet::default(), + completed_apply_peers_count: Some(0), } } @@ -345,9 +374,7 @@ where let region_id = r.region_id; if let Err(e) = self.router.force_send( region_id, - PeerMsg::ApplyRes { - res: ApplyTaskRes::Apply(r), - }, + PeerMsg::ApplyRes(Box::new(ApplyTaskRes::Apply(r))), ) { error!("failed to send apply result"; "region_id" => region_id, "err" => ?e); } @@ -383,7 +410,10 @@ where for e in msg.get_message().get_entries() { heap_size += bytes_capacity(&e.data) + bytes_capacity(&e.context); } - let peer_msg = PeerMsg::RaftMessage(InspectedRaftMessage { heap_size, msg }); + let peer_msg = PeerMsg::RaftMessage( + Box::new(InspectedRaftMessage { heap_size, msg }), + Some(TiInstant::now()), + ); let event = TraceEvent::Add(heap_size); let send_failed = Cell::new(true); @@ -398,13 +428,13 @@ where send_failed.set(false); return Ok(()); } - Either::Left(Err(TrySendError::Full(PeerMsg::RaftMessage(im)))) => { + Either::Left(Err(TrySendError::Full(PeerMsg::RaftMessage(im, _)))) => { return Err(TrySendError::Full(im.msg)); } - Either::Left(Err(TrySendError::Disconnected(PeerMsg::RaftMessage(im)))) => { + Either::Left(Err(TrySendError::Disconnected(PeerMsg::RaftMessage(im, _)))) => { return Err(TrySendError::Disconnected(im.msg)); } - Either::Right(PeerMsg::RaftMessage(im)) => StoreMsg::RaftMessage(im), + Either::Right(PeerMsg::RaftMessage(im, _)) => StoreMsg::RaftMessage(im), _ => unreachable!(), }; match self.send_control(store_msg) { @@ -426,10 +456,10 @@ where cmd: RaftCommand, ) -> std::result::Result<(), TrySendError>> { let region_id = cmd.request.get_header().get_region_id(); - match self.send(region_id, PeerMsg::RaftCommand(cmd)) { + match self.send(region_id, PeerMsg::RaftCommand(Box::new(cmd))) { Ok(()) => Ok(()), - Err(TrySendError::Full(PeerMsg::RaftCommand(cmd))) => Err(TrySendError::Full(cmd)), - Err(TrySendError::Disconnected(PeerMsg::RaftCommand(cmd))) => { + Err(TrySendError::Full(PeerMsg::RaftCommand(box cmd))) => Err(TrySendError::Full(cmd)), + Err(TrySendError::Disconnected(PeerMsg::RaftCommand(box cmd))) => { Err(TrySendError::Disconnected(cmd)) } _ => unreachable!(), @@ -438,7 +468,7 @@ where fn report_unreachable(&self, store_id: u64) { self.broadcast_normal(|| { - PeerMsg::SignificantMsg(SignificantMsg::StoreUnreachable { store_id }) + PeerMsg::SignificantMsg(Box::new(SignificantMsg::StoreUnreachable { store_id })) }); } @@ -449,7 +479,10 @@ where /// Broadcasts resolved result to all regions. 
pub fn report_resolved(&self, store_id: u64, group_id: u64) { self.broadcast_normal(|| { - PeerMsg::SignificantMsg(SignificantMsg::StoreResolved { store_id, group_id }) + PeerMsg::SignificantMsg(Box::new(SignificantMsg::StoreResolved { + store_id, + group_id, + })) }) } @@ -468,10 +501,6 @@ where self.update_trace(); } - pub fn clear_cache(&self) { - self.router.clear_cache(); - } - fn update_trace(&self) { let router_trace = self.router.trace(); MEMTRACE_RAFT_ROUTER_ALIVE.trace(TraceEvent::Reset(router_trace.alive)); @@ -525,6 +554,7 @@ where pub raftlog_gc_scheduler: Scheduler, pub raftlog_fetch_scheduler: Scheduler>, pub region_scheduler: Scheduler>, + pub disk_check_scheduler: Scheduler, pub apply_router: ApplyRouter, pub router: RaftRouter, pub importer: Arc, @@ -676,6 +706,8 @@ where "region_id" => region_id, "current_region_epoch" => ?cur_epoch, "msg_type" => ?msg_type, + "to_peer_id" => ?from_peer.get_id(), + "to_peer_store_id" => ?from_peer.get_store_id(), ); self.raft_metrics.message_dropped.stale_msg.inc(); @@ -694,6 +726,8 @@ where error!(?e; "send gc message failed"; "region_id" => region_id, + "to_peer_id" => ?from_peer.get_id(), + "to_peer_store_id" => ?from_peer.get_store_id(), ); } } @@ -781,15 +815,19 @@ impl<'a, EK: KvEngine + 'static, ER: RaftEngine + 'static, T: Transport> .observe(duration_to_sec(elapsed)); slow_log!( elapsed, - "[store {}] handle timeout {:?}", + "[store {}] handle tick {:?}", self.fsm.store.id, tick ); } fn handle_msgs(&mut self, msgs: &mut Vec>) { - let timer = TiInstant::now_coarse(); + let timer = SlowTimer::from_millis(100); + let count = msgs.len(); + #[allow(const_evaluatable_unchecked)] + let mut distribution = [0; StoreMsg::::COUNT]; for m in msgs.drain(..) { + distribution[m.discriminant()] += 1; match m { StoreMsg::Tick(tick) => self.on_tick(tick), StoreMsg::RaftMessage(msg) => { @@ -814,9 +852,6 @@ impl<'a, EK: KvEngine + 'static, ER: RaftEngine + 'static, T: Transport> } } StoreMsg::CompactedEvent(event) => self.on_compaction_finished(event), - StoreMsg::ValidateSstResult { invalid_ssts } => { - self.on_validate_sst_result(invalid_ssts) - } StoreMsg::ClearRegionSizeInRange { start_key, end_key } => { self.clear_region_size_in_range(&start_key, &end_key) } @@ -828,11 +863,30 @@ impl<'a, EK: KvEngine + 'static, ER: RaftEngine + 'static, T: Transport> #[cfg(any(test, feature = "testexport"))] StoreMsg::Validate(f) => f(&self.ctx.cfg), StoreMsg::LatencyInspect { + factor, send_time, mut inspector, } => { - inspector.record_store_wait(send_time.saturating_elapsed()); - self.ctx.pending_latency_inspect.push(inspector); + match factor { + InspectFactor::RaftDisk => { + inspector.record_store_wait(send_time.saturating_elapsed()); + self.ctx.pending_latency_inspect.push(inspector); + } + InspectFactor::KvDisk => { + // Send LatencyInspector to disk_check_scheduler to inspect latency. 
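The dispatch that follows fans one message type out to two different sinks. A reduced sketch of that routing decision; the enum mirrors `InspectFactor::RaftDisk`/`KvDisk`, while the channel and the `Inspector` type are stand-ins for the real schedulers and `LatencyInspector`:

// Sketch: route a latency inspection either inline (raft disk) or to a
// dedicated worker (kv disk), so a slow kv disk can't stall raftstore.
use std::sync::mpsc::Sender;

enum InspectFactor {
    RaftDisk,
    KvDisk,
}

struct Inspector; // stand-in for LatencyInspector

fn route(
    factor: InspectFactor,
    inspector: Inspector,
    pending: &mut Vec<Inspector>,
    disk_check: &Sender<Inspector>,
) {
    match factor {
        // Raft-disk latency is measured on the raftstore thread itself.
        InspectFactor::RaftDisk => pending.push(inspector),
        // Kv-disk latency is offloaded to the disk-check worker.
        InspectFactor::KvDisk => {
            let _ = disk_check.send(inspector);
        }
    }
}

fn main() {
    let (tx, rx) = std::sync::mpsc::channel();
    let mut pending = Vec::new();
    route(InspectFactor::RaftDisk, Inspector, &mut pending, &tx);
    route(InspectFactor::KvDisk, Inspector, &mut pending, &tx);
    assert_eq!(pending.len(), 1);
    assert!(rx.try_recv().is_ok());
}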
+ if let Err(e) = self + .ctx + .disk_check_scheduler + .schedule(DiskCheckTask::InspectLatency { inspector }) + { + warn!( + "Failed to schedule disk check task"; + "error" => ?e, + "store_id" => self.fsm.store.id + ); + } + } + } } StoreMsg::UnsafeRecoveryReport(report) => self.store_heartbeat_pd(Some(report)), StoreMsg::UnsafeRecoveryCreatePeer { syncer, create } => { @@ -845,11 +899,18 @@ impl<'a, EK: KvEngine + 'static, ER: RaftEngine + 'static, T: Transport> } } } + slow_log!( + T timer, + "[store {}] handle {} store messages {:?}", + self.fsm.store.id, + count, + StoreMsg::::VARIANTS.iter().zip(distribution).filter(|(_, c)| *c > 0).format(", "), + ); self.ctx .raft_metrics .event_time .store_msg - .observe(timer.saturating_elapsed_secs()); + .observe(timer.saturating_elapsed().as_secs_f64()); } fn start(&mut self, store: metapb::Store) { @@ -1000,12 +1061,7 @@ impl PollHandler, St fail_point!( "pause_on_peer_destroy_res", peer.peer_id() == 1 - && matches!( - msg, - PeerMsg::ApplyRes { - res: ApplyTaskRes::Destroy { .. }, - } - ), + && matches!(msg, PeerMsg::ApplyRes(box ApplyTaskRes::Destroy { .. })), |_| unreachable!() ); self.peer_msg_buf.push(msg); @@ -1185,6 +1241,7 @@ pub struct RaftPollerBuilder { cleanup_scheduler: Scheduler, raftlog_gc_scheduler: Scheduler, raftlog_fetch_scheduler: Scheduler>, + disk_check_scheduler: Scheduler, pub region_scheduler: Scheduler>, apply_router: ApplyRouter, pub router: RaftRouter, @@ -1421,6 +1478,7 @@ where pd_scheduler: self.pd_scheduler.clone(), consistency_check_scheduler: self.consistency_check_scheduler.clone(), split_check_scheduler: self.split_check_scheduler.clone(), + disk_check_scheduler: self.disk_check_scheduler.clone(), region_scheduler: self.region_scheduler.clone(), apply_router: self.apply_router.clone(), router: self.router.clone(), @@ -1497,6 +1555,7 @@ where cleanup_scheduler: self.cleanup_scheduler.clone(), raftlog_gc_scheduler: self.raftlog_gc_scheduler.clone(), raftlog_fetch_scheduler: self.raftlog_fetch_scheduler.clone(), + disk_check_scheduler: self.disk_check_scheduler.clone(), region_scheduler: self.region_scheduler.clone(), apply_router: self.apply_router.clone(), router: self.router.clone(), @@ -1586,6 +1645,7 @@ impl RaftBatchSystem { collector_reg_handle: CollectorRegHandle, health_service: Option, causal_ts_provider: Option>, // used for rawkv apiv2 + mut disk_check_runner: DiskCheckRunner, grpc_service_mgr: GrpcServiceManager, safe_point: Arc, ) -> Result<()> { @@ -1604,7 +1664,9 @@ impl RaftBatchSystem { for region_id in regions { let _ = router_clone.send( region_id, - PeerMsg::CasualMessage(CasualMessage::ForceCompactRaftLogs), + PeerMsg::CasualMessage(Box::new( + CasualMessage::ForceCompactRaftLogs, + )), ); } } @@ -1656,12 +1718,7 @@ impl RaftBatchSystem { ); let compact_runner = CompactRunner::new(engines.kv.clone()); - let cleanup_sst_runner = CleanupSstRunner::new( - meta.get_id(), - self.router.clone(), - Arc::clone(&importer), - Arc::clone(&pd_client), - ); + let cleanup_sst_runner = CleanupSstRunner::new(Arc::clone(&importer)); let gc_snapshot_runner = GcSnapshotRunner::new( meta.get_id(), self.router.clone(), // RaftRouter @@ -1677,6 +1734,12 @@ impl RaftBatchSystem { let consistency_check_scheduler = workers .background_worker .start("consistency-check", consistency_check_runner); + // The scheduler dedicated to health checking the KvEngine disk when it's using + // a separate disk from RaftEngine. 
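The per-batch message histogram in `handle_msgs` above is built from strum's derives. A self-contained sketch of the same counting pattern (assumes the `strum` crate with its `derive` feature; the `Msg` enum is hypothetical):

// Sketch: count messages per variant, then pair counts with variant names
// for logging, as handle_msgs does with StoreMsg::VARIANTS above.
use strum::{EnumCount, EnumVariantNames, VariantNames};

#[derive(EnumCount, EnumVariantNames)]
enum Msg {
    Tick,
    Raft,
    Compacted,
}

impl Msg {
    // Stable variant-to-slot mapping, like StoreMsg::discriminant().
    fn discriminant(&self) -> usize {
        match self {
            Msg::Tick => 0,
            Msg::Raft => 1,
            Msg::Compacted => 2,
        }
    }
}

fn main() {
    let msgs = vec![Msg::Tick, Msg::Raft, Msg::Raft];
    let mut distribution = [0usize; Msg::COUNT];
    for m in &msgs {
        distribution[m.discriminant()] += 1;
    }
    for (name, count) in Msg::VARIANTS.iter().zip(distribution) {
        if count > 0 {
            println!("{}: {}", name, count);
        }
    }
}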
+ disk_check_runner.bind_background_worker(workers.background_worker.clone()); + let disk_check_scheduler = workers + .background_worker + .start("disk-check-worker", disk_check_runner); self.store_writers.spawn( meta.get_id(), @@ -1695,6 +1758,7 @@ impl RaftBatchSystem { router: self.router.clone(), split_check_scheduler, region_scheduler, + disk_check_scheduler, pd_scheduler: workers.pd_worker.scheduler(), consistency_check_scheduler, cleanup_scheduler, @@ -1841,14 +1905,12 @@ impl RaftBatchSystem { causal_ts_provider, grpc_service_mgr, ); - assert!(workers.pd_worker.start_with_timer(pd_runner)); + assert!(workers.pd_worker.start(pd_runner)); if let Err(e) = sys_util::thread::set_priority(sys_util::HIGH_PRI) { warn!("set thread priority for raftstore failed"; "error" => ?e); } self.workers = Some(workers); - // This router will not be accessed again, free all caches. - self.router.clear_cache(); Ok(()) } @@ -1961,7 +2023,8 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER } info!( "region doesn't exist yet, wait for it to be split"; - "region_id" => region_id + "region_id" => region_id, + "to_peer_id" => msg.get_to_peer().get_id(), ); return Ok(CheckMsgStatus::FirstRequest); } @@ -2074,21 +2137,25 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER Ok(CheckMsgStatus::NewPeer) } - fn on_raft_message(&mut self, msg: InspectedRaftMessage) -> Result<()> { + fn on_raft_message(&mut self, msg: Box) -> Result<()> { let (heap_size, forwarded) = (msg.heap_size, Cell::new(false)); defer!(if !forwarded.get() { MEMTRACE_RAFT_MESSAGES.trace(TraceEvent::Sub(heap_size)); }); let region_id = msg.msg.get_region_id(); - let msg = match self.ctx.router.send(region_id, PeerMsg::RaftMessage(msg)) { + let msg = match self + .ctx + .router + .send(region_id, PeerMsg::RaftMessage(msg, None)) + { Ok(()) => { forwarded.set(true); return Ok(()); } Err(TrySendError::Full(_)) => return Ok(()), Err(TrySendError::Disconnected(_)) if self.ctx.router.is_shutdown() => return Ok(()), - Err(TrySendError::Disconnected(PeerMsg::RaftMessage(im))) => im.msg, + Err(TrySendError::Disconnected(PeerMsg::RaftMessage(im, None))) => im.msg, Err(_) => unreachable!(), }; @@ -2160,7 +2227,10 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER check_msg_status == CheckMsgStatus::NewPeerFirst, )? { // Peer created, send the message again. - let peer_msg = PeerMsg::RaftMessage(InspectedRaftMessage { heap_size, msg }); + let peer_msg = PeerMsg::RaftMessage( + Box::new(InspectedRaftMessage { heap_size, msg }), + None, + ); if self.ctx.router.send(region_id, peer_msg).is_ok() { forwarded.set(true); } @@ -2183,7 +2253,8 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER store_meta.pending_msgs.push(msg); } else { drop(store_meta); - let peer_msg = PeerMsg::RaftMessage(InspectedRaftMessage { heap_size, msg }); + let peer_msg = + PeerMsg::RaftMessage(Box::new(InspectedRaftMessage { heap_size, msg }), None); if let Err(e) = self.ctx.router.force_send(region_id, peer_msg) { warn!("handle first request failed"; "region_id" => region_id, "error" => ?e); } else { @@ -2349,7 +2420,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER // region may has been merged/splitted already. 
let _ = self.ctx.router.force_send( exist_region.get_id(), - PeerMsg::CasualMessage(CasualMessage::RegionOverlapped), + PeerMsg::CasualMessage(Box::new(CasualMessage::RegionOverlapped)), ); } } @@ -2364,11 +2435,11 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER .router .force_send( id, - PeerMsg::SignificantMsg(SignificantMsg::MergeResult { + PeerMsg::SignificantMsg(Box::new(SignificantMsg::MergeResult { target_region_id: region_id, target: target.clone(), result: MergeResultKind::Stale, - }), + })), ) .unwrap(); } @@ -2382,6 +2453,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER self.ctx.engines.clone(), region_id, target.clone(), + msg.get_from_peer().clone(), )?; // WARNING: The checking code must be above this line. @@ -2435,9 +2507,9 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER for (region_id, declined_bytes) in region_declined_bytes.drain(..) { let _ = self.ctx.router.send( region_id, - PeerMsg::CasualMessage(CasualMessage::CompactionDeclinedBytes { + PeerMsg::CasualMessage(Box::new(CasualMessage::CompactionDeclinedBytes { bytes: declined_bytes, - }), + })), ); } } @@ -2525,7 +2597,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER self.ctx.cfg.region_compact_min_tombstones, self.ctx.cfg.region_compact_tombstones_percent, self.ctx.cfg.region_compact_min_redundant_rows, - self.ctx.cfg.region_compact_redundant_rows_percent, + self.ctx.cfg.region_compact_redundant_rows_percent(), ), }, )) { @@ -2537,10 +2609,75 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER } } + fn check_store_is_busy_on_apply( + &self, + start_ts_sec: u32, + region_count: u64, + busy_apply_peers_count: u64, + completed_apply_peers_count: Option, + ) -> bool { + STORE_BUSY_ON_APPLY_REGIONS_GAUGE_VEC + .busy_apply_peers + .set(busy_apply_peers_count as i64); + STORE_BUSY_ON_APPLY_REGIONS_GAUGE_VEC + .completed_apply_peers + .set(completed_apply_peers_count.unwrap_or_default() as i64); + // No need to check busy status if there are no regions. + if completed_apply_peers_count.is_none() || region_count == 0 { + return false; + } + + let completed_apply_peers_count = completed_apply_peers_count.unwrap(); + let during_starting_stage = { + (time::get_time().sec as u32).saturating_sub(start_ts_sec) + <= STORE_CHECK_PENDING_APPLY_DURATION.as_secs() as u32 + }; + // If the store is busy applying logs when starting, it should not be + // treated as a normal store for balancing. Only when the store is + // almost idle (no more regions pending on applying logs) can it be + // regarded as a candidate for balancing leaders. + if during_starting_stage { + let completed_target_count = (|| { + fail_point!("on_mock_store_completed_target_count", |_| 0); + std::cmp::max( + 1, + STORE_CHECK_COMPLETE_APPLY_REGIONS_PERCENT * region_count / 100, + ) + })(); + // If the number of regions that have completed applying logs does not + // occupy the majority of regions, the store is regarded as busy.
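The threshold arithmetic here is easy to sanity-check on its own. A sketch using the 99 percent constant defined earlier; the `min_pending` parameter stands in for `min_pending_apply_region_count`:

// Sketch: the start-up busy check. With 500 regions, the store must see
// max(1, 99% of 500) = 495 peers finish applying before it reports ready.
const PERCENT: u64 = 99;

fn completed_target_count(region_count: u64) -> u64 {
    std::cmp::max(1, PERCENT * region_count / 100)
}

fn is_busy(region_count: u64, completed: u64, busy: u64, min_pending: u64) -> bool {
    let target = completed_target_count(region_count);
    if completed < target {
        return true;
    }
    // Target met: only stay busy if enough peers are still applying.
    let pending_target = std::cmp::min(min_pending, region_count.saturating_sub(target));
    pending_target > 0 && busy >= pending_target
}

fn main() {
    assert!(is_busy(500, 400, 0, 10)); // far from the 495 target
    assert!(!is_busy(500, 495, 2, 10)); // target met, few busy peers
}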
+ if completed_apply_peers_count < completed_target_count { + debug!("check store is busy on apply"; + "region_count" => region_count, + "completed_apply_peers_count" => completed_apply_peers_count, + "completed_target_count" => completed_target_count); + true + } else { + let pending_target_count = std::cmp::min( + self.ctx.cfg.min_pending_apply_region_count, + region_count.saturating_sub(completed_target_count), + ); + debug!("check store is busy on apply, has pending peers"; + "region_count" => region_count, + "completed_apply_peers_count" => completed_apply_peers_count, + "completed_target_count" => completed_target_count, + "pending_target_count" => pending_target_count, + "busy_apply_peers_count" => busy_apply_peers_count); + pending_target_count > 0 && busy_apply_peers_count >= pending_target_count + } + } else { + // Already started for a fairly long time. + false + } + } + fn store_heartbeat_pd(&mut self, report: Option) { let mut stats = StoreStats::default(); stats.set_store_id(self.ctx.store_id()); + + let completed_apply_peers_count: Option; + let busy_apply_peers_count: u64; { let meta = self.ctx.store_meta.lock().unwrap(); stats.set_region_count(meta.regions.len() as u32); @@ -2549,6 +2686,16 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER let damaged_regions_id = meta.get_all_damaged_region_ids().into_iter().collect(); stats.set_damaged_regions_id(damaged_regions_id); } + + if !meta.damaged_regions.is_empty() { + // Note: no need to filter overlapped regions, since the regions in + // `damaged_ranges` are already non-overlapping. + stats + .mut_damaged_regions_id() + .extend(meta.damaged_regions.iter()); + } + completed_apply_peers_count = meta.completed_apply_peers_count; + busy_apply_peers_count = meta.busy_apply_peers.len() as u64; } let snap_stats = self.ctx.snap_mgr.stats(); @@ -2563,7 +2710,8 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER .with_label_values(&["receiving"]) .set(snap_stats.receiving_count as i64); - stats.set_start_time(self.fsm.store.start_time.unwrap().sec as u32); + let start_time = self.fsm.store.start_time.unwrap().sec as u32; + stats.set_start_time(start_time); // report store write flow to pd stats.set_bytes_written( @@ -2581,13 +2729,32 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER .swap(0, Ordering::Relaxed), ); - stats.set_is_busy( - self.ctx - .global_stat - .stat - .is_busy - .swap(false, Ordering::Relaxed), + let busy_on_apply = self.check_store_is_busy_on_apply( + start_time, + stats.get_region_count() as u64, + busy_apply_peers_count, + completed_apply_peers_count, ); + // If the store has already passed the check, clear + // `completed_apply_peers_count` to skip the check next time.
+ if !busy_on_apply { + let mut meta = self.ctx.store_meta.lock().unwrap(); + meta.completed_apply_peers_count = None; + meta.busy_apply_peers.clear(); + } + let store_is_busy = self + .ctx + .global_stat + .stat + .is_busy + .swap(false, Ordering::Relaxed); + stats.set_is_busy(store_is_busy || busy_on_apply); + STORE_PROCESS_BUSY_GAUGE_VEC + .applystore_busy + .set(busy_on_apply as i64); + STORE_PROCESS_BUSY_GAUGE_VEC + .raftstore_busy + .set(store_is_busy as i64); let mut query_stats = QueryStats::default(); query_stats.set_put( @@ -2759,60 +2926,47 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER } } -impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER, T> { - fn on_validate_sst_result(&mut self, ssts: Vec) { - if ssts.is_empty() || self.ctx.importer.get_mode() == SwitchMode::Import { - return; - } - // A stale peer can still ingest a stale Sst before it is - // destroyed. We need to make sure that no stale peer exists. - let mut delete_ssts = Vec::new(); - { - let meta = self.ctx.store_meta.lock().unwrap(); - for sst in ssts { - if !meta.regions.contains_key(&sst.get_region_id()) { - delete_ssts.push(sst); - } - } - } - if delete_ssts.is_empty() { - return; - } - - let task = CleanupSstTask::DeleteSst { ssts: delete_ssts }; - if let Err(e) = self - .ctx - .cleanup_scheduler - .schedule(CleanupTask::CleanupSst(task)) - { - error!( - "schedule to delete ssts failed"; - "store_id" => self.fsm.store.id, - "err" => ?e, - ); - } - } +// We will remove 1-week-old version 1 SST files. +const VERSION_1_SST_CLEANUP_DURATION: Duration = Duration::from_secs(7 * 24 * 60 * 60); +impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER, T> { fn on_cleanup_import_sst(&mut self) -> Result<()> { let mut delete_ssts = Vec::new(); - let mut validate_ssts = Vec::new(); let ssts = box_try!(self.ctx.importer.list_ssts()); if ssts.is_empty() { return Ok(()); } + let now = SystemTime::now(); { let meta = self.ctx.store_meta.lock().unwrap(); for sst in ssts { - if let Some(r) = meta.regions.get(&sst.get_region_id()) { + if let Some(r) = meta.regions.get(&sst.0.get_region_id()) { let region_epoch = r.get_region_epoch(); - if util::is_epoch_stale(sst.get_region_epoch(), region_epoch) { + if util::is_epoch_stale(sst.0.get_region_epoch(), region_epoch) { // If the SST epoch is stale, it will not be ingested anymore. - delete_ssts.push(sst); + delete_ssts.push(sst.0); } + } else if sst.1 >= sst_importer::API_VERSION_2 { + // The write RPC of the import SST service has made sure the region existed + // at write time; now that the region is not found, the + // SST can be deleted because it won't be used by + // ingest in the future. + delete_ssts.push(sst.0); } else { - // If the peer doesn't exist, we need to validate the SST through PD. - validate_ssts.push(sst); + // In the old protocol, we can't easily know whether the SST will be used in a + // committed raft log, so we only delete SST + // files that have not been modified for 1 week.
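The age test that follows is plain `SystemTime` arithmetic. A standalone sketch with the same one-week constant; `last_modified` would come from the SST file's metadata in the real code:

// Sketch: decide whether a version-1 SST file is old enough to delete.
use std::time::{Duration, SystemTime};

const VERSION_1_SST_CLEANUP_DURATION: Duration = Duration::from_secs(7 * 24 * 60 * 60);

fn should_delete(now: SystemTime, last_modified: SystemTime) -> bool {
    match now.duration_since(last_modified) {
        // Only delete if the file has been untouched for over a week.
        Ok(age) => age > VERSION_1_SST_CLEANUP_DURATION,
        // Clock skew (file appears newer than `now`): keep the file.
        Err(_) => false,
    }
}

fn main() {
    let now = SystemTime::now();
    let eight_days_ago = now - Duration::from_secs(8 * 24 * 60 * 60);
    assert!(should_delete(now, eight_days_ago));
    assert!(!should_delete(now, now));
}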
+ if let Ok(duration) = now.duration_since(sst.2) { + if duration > VERSION_1_SST_CLEANUP_DURATION { + warn!( + "found 1-week old SST file of version 1, will delete it"; + "sst_meta" => ?sst.0, + "last_modified" => ?sst.2 + ); + delete_ssts.push(sst.0); + } + } } } } @@ -2832,27 +2986,6 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER } } - // When there is an import job running, the region which this sst belongs may - // has not been split from the origin region because the apply thread is so busy - // that it can not apply SplitRequest as soon as possible. So we can not - // delete this sst file. - if !validate_ssts.is_empty() && self.ctx.importer.get_mode() != SwitchMode::Import { - let task = CleanupSstTask::ValidateSst { - ssts: validate_ssts, - }; - if let Err(e) = self - .ctx - .cleanup_scheduler - .schedule(CleanupTask::CleanupSst(task)) - { - error!( - "schedule to validate ssts failed"; - "store_id" => self.fsm.store.id, - "err" => ?e, - ); - } - } - Ok(()) } @@ -2912,7 +3045,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER let _ = self.ctx.router.send( target_region_id, - PeerMsg::RaftCommand(RaftCommand::new(request, Callback::None)), + PeerMsg::RaftCommand(Box::new(RaftCommand::new(request, Callback::None))), ); } @@ -2950,7 +3083,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER for region_id in regions { let _ = self.ctx.router.send( region_id, - PeerMsg::CasualMessage(CasualMessage::ClearRegionSize), + PeerMsg::CasualMessage(Box::new(CasualMessage::ClearRegionSize)), ); } } diff --git a/components/raftstore/src/store/local_metrics.rs b/components/raftstore/src/store/local_metrics.rs index 5460a57ae0f..dc94a3afbe7 100644 --- a/components/raftstore/src/store/local_metrics.rs +++ b/components/raftstore/src/store/local_metrics.rs @@ -4,7 +4,7 @@ use std::sync::{Arc, Mutex}; use collections::HashSet; -use prometheus::local::LocalHistogram; +use prometheus::local::{LocalHistogram, LocalIntCounter}; use raft::eraftpb::MessageType; use tikv_util::time::{Duration, Instant}; use tracker::{Tracker, TrackerToken, GLOBAL_TRACKERS, INVALID_TRACKER_TOKEN}; @@ -68,35 +68,81 @@ impl RaftSendMessageMetrics { } } +/// Buffered statistics for recording local raftstore message duration. +/// +/// As it's only used for recording local raftstore message duration +/// and is manually reset periodically, it's not necessary +/// to use `LocalHistogram`.
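The struct introduced below keeps only a duration sum and a count, so the mean is one integer division in microseconds. A standalone sketch of exactly that bookkeeping:

// Sketch of the running-average bookkeeping used by LocalHealthStatistics:
// accumulate a duration sum and a count, report the mean in microseconds.
use std::time::Duration;

#[derive(Default)]
struct Avg {
    duration_sum: Duration,
    count: u64,
}

impl Avg {
    fn observe(&mut self, dur: Duration) {
        self.count += 1;
        self.duration_sum += dur;
    }
    fn avg(&self) -> Duration {
        if self.count > 0 {
            Duration::from_micros(self.duration_sum.as_micros() as u64 / self.count)
        } else {
            // No observations yet: report zero rather than dividing by zero.
            Duration::default()
        }
    }
}

fn main() {
    let mut a = Avg::default();
    a.observe(Duration::from_micros(100));
    a.observe(Duration::from_micros(300));
    assert_eq!(a.avg(), Duration::from_micros(200));
}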
#[derive(Default)] -pub struct RaftCommitLogStatistics { - pub last_commit_log_duration_sum: Duration, - pub last_commit_log_count_sum: u64, +struct LocalHealthStatistics { + duration_sum: Duration, + count: u64, } -impl RaftCommitLogStatistics { +impl LocalHealthStatistics { #[inline] - pub fn record(&mut self, dur: Duration) { - self.last_commit_log_count_sum += 1; - self.last_commit_log_duration_sum += dur; + fn observe(&mut self, dur: Duration) { + self.count += 1; + self.duration_sum += dur; } #[inline] - pub fn avg(&self) -> Duration { - if self.last_commit_log_count_sum > 0 { - Duration::from_micros( - self.last_commit_log_duration_sum.as_micros() as u64 - / self.last_commit_log_count_sum, - ) + fn avg(&self) -> Duration { + if self.count > 0 { + Duration::from_micros(self.duration_sum.as_micros() as u64 / self.count) } else { Duration::default() } } #[inline] + fn reset(&mut self) { + self.count = 0; + self.duration_sum = Duration::default(); + } +} + +#[repr(u8)] +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum IoType { + Disk = 0, + Network = 1, +} + +/// Buffered statistics for recording the health of raftstore. +#[derive(Default)] +pub struct HealthStatistics { + // represents periodic latency on the disk io. + disk_io_dur: LocalHealthStatistics, + // represents the latency of the network io. + network_io_dur: LocalHealthStatistics, +} + +impl HealthStatistics { + #[inline] + pub fn observe(&mut self, dur: Duration, io_type: IoType) { + match io_type { + IoType::Disk => self.disk_io_dur.observe(dur), + IoType::Network => self.network_io_dur.observe(dur), + } + } + + #[inline] + pub fn avg(&self, io_type: IoType) -> Duration { + match io_type { + IoType::Disk => self.disk_io_dur.avg(), + IoType::Network => self.network_io_dur.avg(), + } + } + + #[inline] + /// Reset HealthStatistics. + /// + /// Should be manually reset when the metrics are + /// accepted by slowness inspector. 
pub fn reset(&mut self) { - self.last_commit_log_count_sum = 0; - self.last_commit_log_duration_sum = Duration::default(); + self.disk_io_dur.reset(); + self.network_io_dur.reset(); } } @@ -112,7 +158,10 @@ pub struct RaftMetrics { // local histogram pub store_time: LocalHistogram, + // the wait time for processing a raft command pub propose_wait_time: LocalHistogram, + // the wait time for processing a raft message + pub process_wait_time: LocalHistogram, pub process_ready: LocalHistogram, pub event_time: RaftEventDurationVec, pub peer_msg_len: LocalHistogram, @@ -130,8 +179,9 @@ pub struct RaftMetrics { pub wf_commit_not_persist_log: LocalHistogram, // local statistics for slowness - pub stat_commit_log: RaftCommitLogStatistics, + pub health_stats: HealthStatistics, + pub check_stale_peer: LocalIntCounter, pub leader_missing: Arc>>, last_flush_time: Instant, @@ -152,6 +202,7 @@ impl RaftMetrics { raft_log_gc_skipped: RaftLogGcSkippedCounterVec::from(&RAFT_LOG_GC_SKIPPED_VEC), store_time: STORE_TIME_HISTOGRAM.local(), propose_wait_time: REQUEST_WAIT_TIME_HISTOGRAM.local(), + process_wait_time: RAFT_MESSAGE_WAIT_TIME_HISTOGRAM.local(), process_ready: PEER_RAFT_PROCESS_DURATION .with_label_values(&["ready"]) .local(), @@ -167,7 +218,8 @@ impl RaftMetrics { wf_persist_log: STORE_WF_PERSIST_LOG_DURATION_HISTOGRAM.local(), wf_commit_log: STORE_WF_COMMIT_LOG_DURATION_HISTOGRAM.local(), wf_commit_not_persist_log: STORE_WF_COMMIT_NOT_PERSIST_LOG_DURATION_HISTOGRAM.local(), - stat_commit_log: RaftCommitLogStatistics::default(), + health_stats: HealthStatistics::default(), + check_stale_peer: CHECK_STALE_PEER_COUNTER.local(), leader_missing: Arc::default(), last_flush_time: Instant::now_coarse(), } @@ -190,6 +242,7 @@ impl RaftMetrics { self.store_time.flush(); self.propose_wait_time.flush(); + self.process_wait_time.flush(); self.process_ready.flush(); self.event_time.flush(); self.peer_msg_len.flush(); @@ -206,6 +259,7 @@ impl RaftMetrics { self.wf_commit_not_persist_log.flush(); } + self.check_stale_peer.flush(); let mut missing = self.leader_missing.lock().unwrap(); LEADER_MISSING.set(missing.len() as i64); missing.clear(); diff --git a/components/raftstore/src/store/metrics.rs b/components/raftstore/src/store/metrics.rs index a5aa164e63e..399e2cb9401 100644 --- a/components/raftstore/src/store/metrics.rs +++ b/components/raftstore/src/store/metrics.rs @@ -257,6 +257,31 @@ make_static_metric! { unable_to_split_cpu_top, } + pub label_enum SnapshotBrWaitApplyEventType { + sent, + trivial, + accepted, + term_not_match, + epoch_not_match, + duplicated, + finished, + } + + pub struct SnapshotBrWaitApplyEvent : IntCounter { + "event" => SnapshotBrWaitApplyEventType + } + + pub label_enum SnapshotBrLeaseEventType { + create, + renew, + expired, + reset, + } + + pub struct SnapshotBrLeaseEvent : IntCounter { + "event" => SnapshotBrLeaseEventType + } + pub struct HibernatedPeerStateGauge: IntGauge { "state" => { awaken, @@ -296,6 +321,20 @@ make_static_metric! { pub struct LoadBaseSplitEventCounterVec: IntCounter { "type" => LoadBaseSplitEventType, } + + pub struct StoreBusyOnApplyRegionsGaugeVec: IntGauge { + "type" => { + busy_apply_peers, + completed_apply_peers, + }, + } + + pub struct StoreBusyStateGaugeVec: IntGauge { + "type" => { + raftstore_busy, + applystore_busy, + }, + } } lazy_static! { @@ -551,6 +590,13 @@ lazy_static! 
{ exponential_buckets(0.00001, 2.0, 26).unwrap() ).unwrap(); + pub static ref RAFT_MESSAGE_WAIT_TIME_HISTOGRAM: Histogram = + register_histogram!( + "tikv_raftstore_raft_msg_wait_time_duration_secs", + "Bucketed histogram of raft message wait time duration.", + exponential_buckets(0.00001, 2.0, 26).unwrap() + ).unwrap(); + pub static ref PEER_GC_RAFT_LOG_COUNTER: IntCounter = register_int_counter!( "tikv_raftstore_gc_raft_log_total", @@ -651,6 +697,11 @@ lazy_static! { "Total number of leader missed region." ).unwrap(); + pub static ref CHECK_STALE_PEER_COUNTER: IntCounter = register_int_counter!( + "tikv_raftstore_check_stale_peer", + "Total number of checking stale peers." + ).unwrap(); + pub static ref INGEST_SST_DURATION_SECONDS: Histogram = register_histogram!( "tikv_snapshot_ingest_sst_duration_seconds", @@ -778,8 +829,11 @@ lazy_static! { exponential_buckets(0.00001, 2.0, 26).unwrap() ).unwrap(); - pub static ref STORE_SLOW_SCORE_GAUGE: Gauge = - register_gauge!("tikv_raftstore_slow_score", "Slow score of the store.").unwrap(); + pub static ref STORE_SLOW_SCORE_GAUGE: IntGaugeVec = register_int_gauge_vec!( + "tikv_raftstore_slow_score", + "Slow score of the store.", + &["type"] + ).unwrap(); pub static ref STORE_SLOW_TREND_GAUGE: Gauge = register_gauge!("tikv_raftstore_slow_trend", "Slow trend changing rate.").unwrap(); @@ -881,4 +935,45 @@ lazy_static! { "tikv_raftstore_peer_in_flashback_state", "Total number of peers in the flashback state" ).unwrap(); + + pub static ref SNAP_BR_SUSPEND_COMMAND_TYPE: IntCounterVec = register_int_counter_vec!( + "tikv_raftstore_snap_br_suspend_command_type", + "The statistic of rejecting some admin commands being proposed.", + &["type"] + ).unwrap(); + + pub static ref SNAP_BR_WAIT_APPLY_EVENT: SnapshotBrWaitApplyEvent = register_static_int_counter_vec!( + SnapshotBrWaitApplyEvent, + "tikv_raftstore_snap_br_wait_apply_event", + "The events of wait apply issued by snapshot br.", + &["event"] + ).unwrap(); + + pub static ref SNAP_BR_SUSPEND_COMMAND_LEASE_UNTIL: IntGauge = register_int_gauge!( + "tikv_raftstore_snap_br_suspend_command_lease_until", + "The lease that snapshot br holds of rejecting some type of commands. 
(In unix timestamp.)" + ).unwrap(); + + pub static ref SNAP_BR_LEASE_EVENT: SnapshotBrLeaseEvent = register_static_int_counter_vec!( + SnapshotBrLeaseEvent, + "tikv_raftstore_snap_br_lease_event", + "The events of the lease to denying new admin commands being proposed by snapshot br.", + &["event"] + ).unwrap(); + + pub static ref STORE_BUSY_ON_APPLY_REGIONS_GAUGE_VEC: StoreBusyOnApplyRegionsGaugeVec = + register_static_int_gauge_vec!( + StoreBusyOnApplyRegionsGaugeVec, + "tikv_raftstore_busy_on_apply_region_total", + "Total number of regions busy on apply or complete apply.", + &["type"] + ).unwrap(); + + pub static ref STORE_PROCESS_BUSY_GAUGE_VEC: StoreBusyStateGaugeVec = + register_static_int_gauge_vec!( + StoreBusyStateGaugeVec, + "tikv_raftstore_process_busy", + "Is raft process busy or not", + &["type"] + ).unwrap(); } diff --git a/components/raftstore/src/store/mod.rs b/components/raftstore/src/store/mod.rs index efd149e7c41..3d9d698743f 100644 --- a/components/raftstore/src/store/mod.rs +++ b/components/raftstore/src/store/mod.rs @@ -11,6 +11,7 @@ pub mod msg; mod peer; mod read_queue; pub mod region_meta; +pub mod snapshot_backup; pub mod transport; #[macro_use] pub mod util; @@ -57,8 +58,8 @@ pub use self::{ }, peer::{ can_amend_read, get_sync_log_from_request, make_transfer_leader_response, - propose_read_index, should_renew_lease, Peer, PeerStat, ProposalContext, ProposalQueue, - RequestInspector, RequestPolicy, TRANSFER_LEADER_COMMAND_REPLY_CTX, + propose_read_index, should_renew_lease, DiskFullPeers, Peer, PeerStat, ProposalContext, + ProposalQueue, RequestInspector, RequestPolicy, TRANSFER_LEADER_COMMAND_REPLY_CTX, }, peer_storage::{ clear_meta, do_snapshot, write_initial_apply_state, write_initial_raft_state, @@ -74,25 +75,26 @@ pub use self::{ ApplyOptions, CfFile, Error as SnapError, SnapEntry, SnapKey, SnapManager, SnapManagerBuilder, Snapshot, SnapshotStatistics, TabletSnapKey, TabletSnapManager, }, + snapshot_backup::SnapshotBrWaitApplySyncer, transport::{CasualRouter, ProposalRouter, SignificantRouter, StoreRouter, Transport}, txn_ext::{LocksStatus, PeerPessimisticLocks, PessimisticLockPair, TxnExt}, unsafe_recovery::{ demote_failed_voters_request, exit_joint_request, ForceLeaderState, - SnapshotRecoveryWaitApplySyncer, UnsafeRecoveryExecutePlanSyncer, - UnsafeRecoveryFillOutReportSyncer, UnsafeRecoveryForceLeaderSyncer, UnsafeRecoveryHandle, - UnsafeRecoveryState, UnsafeRecoveryWaitApplySyncer, + UnsafeRecoveryExecutePlanSyncer, UnsafeRecoveryFillOutReportSyncer, + UnsafeRecoveryForceLeaderSyncer, UnsafeRecoveryHandle, UnsafeRecoveryState, + UnsafeRecoveryWaitApplySyncer, }, util::{RegionReadProgress, RegionReadProgressRegistry}, worker::{ metrics as worker_metrics, need_compact, AutoSplitController, BatchComponent, Bucket, - BucketRange, CachedReadDelegate, CheckLeaderRunner, CheckLeaderTask, CompactThreshold, - FlowStatistics, FlowStatsReporter, KeyEntry, LocalReadContext, LocalReader, - LocalReaderCore, PdStatsMonitor, PdTask, ReadDelegate, ReadExecutor, ReadExecutorProvider, - ReadProgress, ReadStats, RefreshConfigTask, RegionTask, SplitCheckRunner, SplitCheckTask, - SplitConfig, SplitConfigManager, SplitInfo, StoreMetaDelegate, StoreStatsReporter, - TrackVer, WriteStats, WriterContoller, BIG_REGION_CPU_OVERLOAD_THRESHOLD_RATIO, - DEFAULT_BIG_REGION_BYTE_THRESHOLD, DEFAULT_BIG_REGION_QPS_THRESHOLD, - DEFAULT_BYTE_THRESHOLD, DEFAULT_QPS_THRESHOLD, NUM_COLLECT_STORE_INFOS_PER_HEARTBEAT, - REGION_CPU_OVERLOAD_THRESHOLD_RATIO, + BucketRange, BucketStatsInfo, 
CachedReadDelegate, CheckLeaderRunner, CheckLeaderTask, + CompactThreshold, DiskCheckRunner, FlowStatistics, FlowStatsReporter, KeyEntry, + LocalReadContext, LocalReader, LocalReaderCore, PdStatsMonitor, PdTask, ReadDelegate, + ReadExecutor, ReadExecutorProvider, ReadProgress, ReadStats, RefreshConfigTask, RegionTask, + SplitCheckRunner, SplitCheckTask, SplitConfig, SplitConfigManager, SplitInfo, + StoreMetaDelegate, StoreStatsReporter, TrackVer, WriteStats, WriterContoller, + BIG_REGION_CPU_OVERLOAD_THRESHOLD_RATIO, DEFAULT_BIG_REGION_BYTE_THRESHOLD, + DEFAULT_BIG_REGION_QPS_THRESHOLD, DEFAULT_BYTE_THRESHOLD, DEFAULT_QPS_THRESHOLD, + NUM_COLLECT_STORE_INFOS_PER_HEARTBEAT, REGION_CPU_OVERLOAD_THRESHOLD_RATIO, }, }; diff --git a/components/raftstore/src/store/msg.rs b/components/raftstore/src/store/msg.rs index 1ed8934e0f0..1624c449a9a 100644 --- a/components/raftstore/src/store/msg.rs +++ b/components/raftstore/src/store/msg.rs @@ -10,7 +10,6 @@ use engine_traits::{CompactedEvent, KvEngine, Snapshot}; use futures::channel::mpsc::UnboundedSender; use kvproto::{ brpb::CheckAdminResponse, - import_sstpb::SstMeta, kvrpcpb::{DiskFullOpt, ExtraOp as TxnExtraOp}, metapb, metapb::RegionEpoch, @@ -24,17 +23,20 @@ use pd_client::BucketMeta; use raft::SnapshotStatus; use resource_control::ResourceMetered; use smallvec::{smallvec, SmallVec}; -use tikv_util::{deadline::Deadline, escape, memory::HeapSize, time::Instant}; +use strum::{EnumCount, EnumVariantNames}; +use tikv_util::{deadline::Deadline, escape, memory::HeapSize, time::Instant, InspectFactor}; use tracker::{get_tls_tracker_token, TrackerToken}; -use super::{local_metrics::TimeTracker, region_meta::RegionMeta, FetchedLogs, RegionSnapshot}; +use super::{ + local_metrics::TimeTracker, region_meta::RegionMeta, + snapshot_backup::SnapshotBrWaitApplyRequest, FetchedLogs, RegionSnapshot, +}; use crate::store::{ fsm::apply::{CatchUpLogs, ChangeObserver, TaskRes as ApplyTaskRes}, metrics::RaftEventDurationType, unsafe_recovery::{ - SnapshotRecoveryWaitApplySyncer, UnsafeRecoveryExecutePlanSyncer, - UnsafeRecoveryFillOutReportSyncer, UnsafeRecoveryForceLeaderSyncer, - UnsafeRecoveryWaitApplySyncer, + UnsafeRecoveryExecutePlanSyncer, UnsafeRecoveryFillOutReportSyncer, + UnsafeRecoveryForceLeaderSyncer, UnsafeRecoveryWaitApplySyncer, }, util::{KeysInfoFormatter, LatencyInspector}, worker::{Bucket, BucketRange}, @@ -529,7 +531,7 @@ where UnsafeRecoveryDestroy(UnsafeRecoveryExecutePlanSyncer), UnsafeRecoveryWaitApply(UnsafeRecoveryWaitApplySyncer), UnsafeRecoveryFillOutReport(UnsafeRecoveryFillOutReportSyncer), - SnapshotRecoveryWaitApply(SnapshotRecoveryWaitApplySyncer), + SnapshotBrWaitApply(SnapshotBrWaitApplyRequest), CheckPendingAdmin(UnboundedSender), } @@ -545,6 +547,7 @@ pub enum CasualMessage { split_keys: Vec>, callback: Callback, source: Cow<'static, str>, + share_source_region_size: bool, }, /// Hash result of ComputeHash command. @@ -557,12 +560,14 @@ pub enum CasualMessage { /// Approximate size of target region. This message can only be sent by /// split-check thread. RegionApproximateSize { - size: u64, + size: Option, + splitable: Option, }, /// Approximate key count of target region. 
RegionApproximateKeys { - keys: u64, + keys: Option, + splitable: Option, }, CompactionDeclinedBytes { bytes: u64, @@ -617,7 +622,11 @@ pub enum CasualMessage { RenewLease, // Snapshot is applied - SnapshotApplied, + SnapshotApplied { + peer_id: u64, + /// Whether the peer is destroyed after applying the snapshot + tombstone: bool, + }, // Trigger raft to campaign which is used after exiting force leader Campaign, @@ -647,11 +656,19 @@ impl fmt::Debug for CasualMessage { KeysInfoFormatter(split_keys.iter()), source, ), - CasualMessage::RegionApproximateSize { size } => { - write!(fmt, "Region's approximate size [size: {:?}]", size) + CasualMessage::RegionApproximateSize { size, splitable } => { + write!( + fmt, + "Region's approximate size [size: {:?}], [splitable: {:?}]", + size, splitable + ) } - CasualMessage::RegionApproximateKeys { keys } => { - write!(fmt, "Region's approximate keys [keys: {:?}]", keys) + CasualMessage::RegionApproximateKeys { keys, splitable } => { + write!( + fmt, + "Region's approximate keys [keys: {:?}], [splitable: {:?}]", + keys, splitable + ) } CasualMessage::CompactionDeclinedBytes { bytes } => { write!(fmt, "compaction declined bytes {}", bytes) @@ -678,7 +695,11 @@ impl fmt::Debug for CasualMessage { } CasualMessage::RefreshRegionBuckets { .. } => write!(fmt, "RefreshRegionBuckets"), CasualMessage::RenewLease => write!(fmt, "RenewLease"), - CasualMessage::SnapshotApplied => write!(fmt, "SnapshotApplied"), + CasualMessage::SnapshotApplied { peer_id, tombstone } => write!( + fmt, + "SnapshotApplied, peer_id={}, tombstone={}", + peer_id, tombstone + ), CasualMessage::Campaign => write!(fmt, "Campaign"), } } @@ -735,27 +756,25 @@ pub struct InspectedRaftMessage { } /// Message that can be sent to a peer. -#[allow(clippy::large_enum_variant)] +#[derive(EnumCount, EnumVariantNames)] pub enum PeerMsg { /// Raft message is the message sent between raft nodes in the same /// raft group. Messages need to be redirected to raftstore if target /// peer doesn't exist. - RaftMessage(InspectedRaftMessage), + RaftMessage(Box, Option), /// Raft command is the command that is expected to be proposed by the /// leader of the target raft group. If it's failed to be sent, callback /// usually needs to be called before dropping in case of resource leak. - RaftCommand(RaftCommand), + RaftCommand(Box>), /// Tick is periodical task. If target peer doesn't exist there is a /// potential that the raft node will not work anymore. Tick(PeerTick), /// Result of applying committed entries. The message can't be lost. - ApplyRes { - res: ApplyTaskRes, - }, + ApplyRes(Box>), /// Message that can't be lost but rarely created. If they are lost, real /// bad things happen like some peers will be considered dead in the /// group. - SignificantMsg(SignificantMsg), + SignificantMsg(Box>), /// Start the FSM. Start, /// A message only used to notify a peer. @@ -765,7 +784,7 @@ pub enum PeerMsg { ready_number: u64, }, /// Message that is not important and can be dropped occasionally. - CasualMessage(CasualMessage), + CasualMessage(Box>), /// Ask region to report a heartbeat to PD. HeartbeatPd, /// Asks region to change replication mode. @@ -778,7 +797,7 @@ impl ResourceMetered for PeerMsg {} impl fmt::Debug for PeerMsg { fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { match self { - PeerMsg::RaftMessage(_) => write!(fmt, "Raft Message"), + PeerMsg::RaftMessage(..) => write!(fmt, "Raft Message"), PeerMsg::RaftCommand(_) => write!(fmt, "Raft Command"), PeerMsg::Tick(tick) => write!
{ fmt, @@ -786,7 +805,7 @@ impl fmt::Debug for PeerMsg { tick }, PeerMsg::SignificantMsg(msg) => write!(fmt, "{:?}", msg), - PeerMsg::ApplyRes { res } => write!(fmt, "ApplyRes {:?}", res), + PeerMsg::ApplyRes(res) => write!(fmt, "ApplyRes {:?}", res), PeerMsg::Start => write!(fmt, "Startup"), PeerMsg::Noop => write!(fmt, "Noop"), PeerMsg::Persisted { @@ -806,26 +825,40 @@ impl fmt::Debug for PeerMsg { } impl PeerMsg { + pub fn discriminant(&self) -> usize { + match self { + PeerMsg::RaftMessage(..) => 0, + PeerMsg::RaftCommand(_) => 1, + PeerMsg::Tick(_) => 2, + PeerMsg::SignificantMsg(_) => 3, + PeerMsg::ApplyRes { .. } => 4, + PeerMsg::Start => 5, + PeerMsg::Noop => 6, + PeerMsg::Persisted { .. } => 7, + PeerMsg::CasualMessage(_) => 8, + PeerMsg::HeartbeatPd => 9, + PeerMsg::UpdateReplicationMode => 10, + PeerMsg::Destroy(_) => 11, + } + } + /// For some specific kind of messages, it's actually acceptable if failed /// to send it by `significant_send`. This function determine if the /// current message is acceptable to fail. pub fn is_send_failure_ignorable(&self) -> bool { matches!( self, - PeerMsg::SignificantMsg(SignificantMsg::CaptureChange { .. }) + PeerMsg::SignificantMsg(box SignificantMsg::CaptureChange { .. }) ) } } +#[derive(EnumCount, EnumVariantNames)] pub enum StoreMsg where EK: KvEngine, { - RaftMessage(InspectedRaftMessage), - - ValidateSstResult { - invalid_ssts: Vec, - }, + RaftMessage(Box), // Clear region size and keys for all regions in the range, so we can force them to // re-calculate their size later. @@ -849,14 +882,11 @@ where /// Inspect the latency of raftstore. LatencyInspect { + factor: InspectFactor, send_time: Instant, inspector: LatencyInspector, }, - /// Message only used for test. - #[cfg(any(test, feature = "testexport"))] - Validate(Box), - UnsafeRecoveryReport(pdpb::StoreReport), UnsafeRecoveryCreatePeer { syncer: UnsafeRecoveryExecutePlanSyncer, @@ -868,6 +898,10 @@ where AwakenRegions { abnormal_stores: Vec, }, + + /// Message only used for test. + #[cfg(any(test, feature = "testexport"))] + Validate(Box), } impl ResourceMetered for StoreMsg {} @@ -883,7 +917,6 @@ where write!(fmt, "Store {} is unreachable", store_id) } StoreMsg::CompactedEvent(ref event) => write!(fmt, "CompactedEvent cf {}", event.cf()), - StoreMsg::ValidateSstResult { .. } => write!(fmt, "Validate SST Result"), StoreMsg::ClearRegionSizeInRange { ref start_key, ref end_key, @@ -894,8 +927,6 @@ where ), StoreMsg::Tick(tick) => write!(fmt, "StoreTick {:?}", tick), StoreMsg::Start { ref store } => write!(fmt, "Start store {:?}", store), - #[cfg(any(test, feature = "testexport"))] - StoreMsg::Validate(_) => write!(fmt, "Validate config"), StoreMsg::UpdateReplicationMode(_) => write!(fmt, "UpdateReplicationMode"), StoreMsg::LatencyInspect { .. } => write!(fmt, "LatencyInspect"), StoreMsg::UnsafeRecoveryReport(..) => write!(fmt, "UnsafeRecoveryReport"), @@ -904,6 +935,44 @@ where } StoreMsg::GcSnapshotFinish => write!(fmt, "GcSnapshotFinish"), StoreMsg::AwakenRegions { .. } => write!(fmt, "AwakenRegions"), + #[cfg(any(test, feature = "testexport"))] + StoreMsg::Validate(_) => write!(fmt, "Validate config"), + } + } +} + +impl StoreMsg { + pub fn discriminant(&self) -> usize { + match self { + StoreMsg::RaftMessage(_) => 0, + StoreMsg::StoreUnreachable { .. } => 1, + StoreMsg::CompactedEvent(_) => 2, + StoreMsg::ClearRegionSizeInRange { .. } => 3, + StoreMsg::Tick(_) => 4, + StoreMsg::Start { .. } => 5, + StoreMsg::UpdateReplicationMode(_) => 6, + StoreMsg::LatencyInspect { .. 
} => 7, + StoreMsg::UnsafeRecoveryReport(_) => 8, + StoreMsg::UnsafeRecoveryCreatePeer { .. } => 9, + StoreMsg::GcSnapshotFinish => 10, + StoreMsg::AwakenRegions { .. } => 11, + #[cfg(any(test, feature = "testexport"))] + StoreMsg::Validate(_) => 12, } } } + +#[cfg(test)] +mod tests { + #[test] + fn test_msg_size() { + use std::mem; + + use engine_rocks::RocksEngine; + + use super::*; + + // make sure the msg is small enough + assert_eq!(mem::size_of::>(), 32); + } +} diff --git a/components/raftstore/src/store/peer.rs b/components/raftstore/src/store/peer.rs index 8c1a7ef61e9..087b255ec7e 100644 --- a/components/raftstore/src/store/peer.rs +++ b/components/raftstore/src/store/peer.rs @@ -43,7 +43,7 @@ use kvproto::{ }, }; use parking_lot::RwLockUpgradableReadGuard; -use pd_client::{BucketStat, INVALID_ID}; +use pd_client::INVALID_ID; use protobuf::Message; use raft::{ self, @@ -80,6 +80,7 @@ use super::{ self, check_req_region_epoch, is_initial_msg, AdminCmdEpochState, ChangePeerI, ConfChangeKind, Lease, LeaseState, NORMAL_REQ_CHECK_CONF_VER, NORMAL_REQ_CHECK_VER, }, + worker::BucketStatsInfo, DestroyPeerJob, LocalReadContext, }; use crate::{ @@ -100,8 +101,9 @@ use crate::{ memory::{needs_evict_entry_cache, MEMTRACE_RAFT_ENTRIES}, msg::{CasualMessage, ErrorCallback, RaftCommand}, peer_storage::HandleSnapshotResult, + snapshot_backup::{AbortReason, SnapshotBrState}, txn_ext::LocksStatus, - unsafe_recovery::{ForceLeaderState, SnapshotRecoveryState, UnsafeRecoveryState}, + unsafe_recovery::{ForceLeaderState, UnsafeRecoveryState}, util::{admin_cmd_epoch_lookup, RegionReadProgress}, worker::{ CleanupTask, CompactTask, HeartbeatTask, RaftlogGcTask, ReadDelegate, ReadExecutor, @@ -126,6 +128,7 @@ pub enum StaleState { Valid, ToValidate, LeaderMissing, + MaybeLeaderMissing, } #[derive(Debug)] @@ -465,6 +468,12 @@ pub struct ApplySnapshotContext { /// The message should be sent after snapshot is applied. pub msgs: Vec, pub persist_res: Option, + /// Destroy the peer after the apply task has finished or been aborted. + /// This flag is set to true when the peer destroy is skipped because of + /// a running snapshot task. + /// This is to accelerate peer destroy without waiting for an extra destroy + /// peer message. + pub destroy_peer_after_apply: bool, } #[derive(PartialEq, Debug)] @@ -569,6 +578,119 @@ pub fn can_amend_read( false } +/// The SplitCheckTrigger maintains the internal status to determine +/// if a split check task should be triggered. +#[derive(Default, Debug)] +pub struct SplitCheckTrigger { + /// An inaccurate difference in region size since last reset. + /// It is used to decide whether split check is needed. + size_diff_hint: u64, + /// An inaccurate difference in region size after compaction. + /// It is used to trigger check split to update approximate size and keys + /// after space reclamation of deleted entries. + pub compaction_declined_bytes: u64, + /// Approximate size of the region. + pub approximate_size: Option, + may_split_size: Option, + /// Approximate keys of the region. + pub approximate_keys: Option, + may_split_keys: Option, + /// Whether this region has scheduled a split check task. If we have just + /// split the region or ingested one file which may overlap + /// with the existing data, reset the flag so that the region can be + /// split again.
+ may_skip_split_check: bool, +} + +impl SplitCheckTrigger { + pub fn should_skip(&self, threshold: u64) -> bool { + self.may_skip_split_check + && self.compaction_declined_bytes < threshold + && self.size_diff_hint < threshold + } + + pub fn post_triggered(&mut self) { + self.size_diff_hint = 0; + self.compaction_declined_bytes = 0; + // The task is scheduled, the next tick may skip it only when the size and keys + // are small. + // If either size or keys are big enough to do a split, + // keep split check tick until split is done + if !matches!(self.may_split_size, Some(true)) && !matches!(self.may_split_keys, Some(true)) + { + self.may_skip_split_check = true; + } + } + + pub fn post_split(&mut self) { + self.size_diff_hint = 0; + self.may_split_keys = None; + self.may_split_size = None; + // It's not correct anymore, so set it to false to schedule a split check task. + self.may_skip_split_check = false; + } + + pub fn add_size_diff(&mut self, size_diff: i64) { + let diff = self.size_diff_hint as i64 + size_diff; + self.size_diff_hint = cmp::max(diff, 0) as u64; + } + + pub fn reset_skip_check(&mut self) { + self.may_skip_split_check = false; + } + + pub fn on_clear_region_size(&mut self) { + self.approximate_size = None; + self.approximate_keys = None; + self.may_split_size = None; + self.may_split_keys = None; + self.may_skip_split_check = false; + } + + pub fn on_approximate_region_size(&mut self, size: Option, splitable: Option) { + // If size is none, it means no estimated size + if size.is_some() { + self.approximate_size = size; + } + + if splitable.is_some() { + self.may_split_size = splitable; + } + + // If the region is truly splitable, + // may_skip_split_check should be false + if matches!(splitable, Some(true)) { + self.may_skip_split_check = false; + } + } + + pub fn on_approximate_region_keys(&mut self, keys: Option, splitable: Option) { + // if keys is none, it means no estimated keys + if keys.is_some() { + self.approximate_keys = keys; + } + + if splitable.is_some() { + self.may_split_keys = splitable; + } + + // If the region is truly splitable, + // may_skip_split_check should be false + if matches!(splitable, Some(true)) { + self.may_skip_split_check = false; + } + } + + pub fn on_ingest_sst_result(&mut self, size: u64, keys: u64) { + self.approximate_size = Some(self.approximate_size.unwrap_or_default() + size); + self.approximate_keys = Some(self.approximate_keys.unwrap_or_default() + keys); + + // The ingested file may be overlapped with the data in engine, so we need to + // check it again to get the accurate value. + self.may_skip_split_check = false; + } +} + #[derive(Getters, MutGetters)] pub struct Peer where @@ -593,6 +715,8 @@ where pub peer_heartbeats: HashMap, /// Record the waiting data status of each follower or learner peer. pub wait_data_peers: Vec, + /// This peer is created by a raft message from `create_by_peer`. + create_by_peer: Option, proposals: ProposalQueue>, leader_missing_time: Option, @@ -656,25 +780,10 @@ where pub peers_start_pending_time: Vec<(u64, Instant)>, /// A inaccurate cache about which peer is marked as down. down_peer_ids: Vec, - - /// An inaccurate difference in region size since last reset. - /// It is used to decide whether split check is needed. - pub size_diff_hint: u64, + /// the split check trigger + pub split_check_trigger: SplitCheckTrigger, /// The count of deleted keys since last reset. delete_keys_hint: u64, - /// An inaccurate difference in region size after compaction. 
- /// It is used to trigger check split to update approximate size and keys - /// after space reclamation of deleted entries. - pub compaction_declined_bytes: u64, - /// Approximate size of the region. - pub approximate_size: Option, - /// Approximate keys of the region. - pub approximate_keys: Option, - /// Whether this region has scheduled a split check task. If we just - /// splitted the region or ingested one file which may be overlapped - /// with the existed data, reset the flag so that the region can be - /// splitted again. - pub may_skip_split_check: bool, /// The state for consistency check. pub consistency_state: ConsistencyState, @@ -780,15 +889,24 @@ where persisted_number: u64, /// The context of applying snapshot. apply_snap_ctx: Option, - /// region buckets. - pub region_buckets: Option, - pub last_region_buckets: Option, + /// region buckets info in this region. + region_buckets_info: BucketStatsInfo, /// lead_transferee if this peer(leader) is in a leadership transferring. pub lead_transferee: u64, pub unsafe_recovery_state: Option, - pub snapshot_recovery_state: Option, + pub snapshot_recovery_state: Option, last_record_safe_point: u64, + /// Used for checking whether the peer is busy on apply. + /// * `None` => the peer has no pending logs for apply or already finishes + /// applying. + /// * `Some(false)` => initial state, not be recorded. + /// * `Some(true)` => busy on apply, and already recorded. + pub busy_on_apply: Option, + /// The index of last commited idx in the leader. It's used to check whether + /// this peer has raft log gaps and whether should be marked busy on + /// apply. + pub last_leader_committed_idx: Option, } impl Peer @@ -805,6 +923,7 @@ where region: &metapb::Region, peer: metapb::Peer, wait_data: bool, + create_by_peer: Option, ) -> Result> { let peer_id = peer.get_id(); if peer_id == raft::INVALID_ID { @@ -859,14 +978,11 @@ where peer_cache: RefCell::new(HashMap::default()), peer_heartbeats: HashMap::default(), wait_data_peers: Vec::default(), + create_by_peer, peers_start_pending_time: vec![], down_peer_ids: vec![], - size_diff_hint: 0, + split_check_trigger: SplitCheckTrigger::default(), delete_keys_hint: 0, - approximate_size: None, - approximate_keys: None, - may_skip_split_check: false, - compaction_declined_bytes: 0, leader_unreachable: false, pending_remove: false, wait_data, @@ -931,11 +1047,12 @@ where unpersisted_ready: None, persisted_number: 0, apply_snap_ctx: None, - region_buckets: None, - last_region_buckets: None, + region_buckets_info: BucketStatsInfo::default(), lead_transferee: raft::INVALID_ID, unsafe_recovery_state: None, snapshot_recovery_state: None, + busy_on_apply: Some(false), + last_leader_committed_idx: None, }; // If this region has only one peer and I am the one, campaign directly. 
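As a reading aid for the `SplitCheckTrigger` extraction above, here is a minimal sketch of the intended call pattern on the split-check tick path. It is illustrative only: `schedule_split_check` and the threshold constant are stand-ins, not raftstore's actual API or config plumbing.

// Hypothetical driver; assumes the `SplitCheckTrigger` defined above is in scope.
fn on_split_region_check_tick(trigger: &mut SplitCheckTrigger) {
    // Stand-in for the configured check-diff threshold.
    const CHECK_DIFF_THRESHOLD: u64 = 8 * 1024 * 1024;
    // Skip the tick when a check was already scheduled and neither the size
    // diff nor the compaction-declined bytes have crossed the threshold.
    if trigger.should_skip(CHECK_DIFF_THRESHOLD) {
        return;
    }
    schedule_split_check(); // stand-in for enqueueing a split-check task
    // Reset the hints; subsequent ticks may skip again unless the region was
    // already reported splittable by size or keys.
    trigger.post_triggered();
}

fn schedule_split_check() { /* stand-in */ }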
@@ -965,7 +1082,10 @@ where return; } self.replication_mode_version = state.status().get_dr_auto_sync().state_id; - let enable = state.status().get_dr_auto_sync().get_state() != DrAutoSyncState::Async; + let enable = !matches!( + state.status().get_dr_auto_sync().get_state(), + DrAutoSyncState::Async | DrAutoSyncState::SyncRecover + ); self.raft_group.raft.enable_group_commit(enable); self.dr_auto_sync_state = state.status().get_dr_auto_sync().get_state(); } @@ -974,29 +1094,32 @@ where pub fn switch_replication_mode(&mut self, state: &Mutex) { self.replication_sync = false; let guard = state.lock().unwrap(); - let enable_group_commit = if guard.status().get_mode() == ReplicationMode::Majority { - self.replication_mode_version = 0; - self.dr_auto_sync_state = DrAutoSyncState::Async; - false - } else { - self.dr_auto_sync_state = guard.status().get_dr_auto_sync().get_state(); - self.replication_mode_version = guard.status().get_dr_auto_sync().state_id; - match guard.status().get_dr_auto_sync().get_state() { - // SyncRecover will enable group commit after it catches up logs. - DrAutoSyncState::Async | DrAutoSyncState::SyncRecover => false, - _ => true, - } - }; + let (enable_group_commit, calculate_group_id) = + if guard.status().get_mode() == ReplicationMode::Majority { + self.replication_mode_version = 0; + self.dr_auto_sync_state = DrAutoSyncState::Async; + (false, false) + } else { + self.dr_auto_sync_state = guard.status().get_dr_auto_sync().get_state(); + self.replication_mode_version = guard.status().get_dr_auto_sync().state_id; + match guard.status().get_dr_auto_sync().get_state() { + // SyncRecover will enable group commit after it catches up logs. + DrAutoSyncState::Async => (false, false), + DrAutoSyncState::SyncRecover => (false, true), + _ => (true, true), + } + }; drop(guard); - self.switch_group_commit(enable_group_commit, state); + self.switch_group_commit(enable_group_commit, calculate_group_id, state); } fn switch_group_commit( &mut self, enable_group_commit: bool, + calculate_group_id: bool, state: &Mutex, ) { - if enable_group_commit { + if enable_group_commit || calculate_group_id { let mut guard = state.lock().unwrap(); let ids = mem::replace( guard.calculate_commit_group( @@ -1086,6 +1209,8 @@ where // of term explicitly to get correct metadata. info!( "become follower for new logs"; + "first_log_term" => first.term, + "first_log_index" => first.index, "new_log_term" => last_log.term, "new_log_index" => last_log.index, "term" => self.term(), @@ -1127,13 +1252,14 @@ where } } - if let Some(snap_ctx) = self.apply_snap_ctx.as_ref() { + if let Some(snap_ctx) = self.apply_snap_ctx.as_mut() { if !snap_ctx.scheduled { info!( "stale peer is persisting snapshot, will destroy next time"; "region_id" => self.region_id, "peer_id" => self.peer.get_id(), ); + snap_ctx.destroy_peer_after_apply = true; return None; } } @@ -1144,6 +1270,9 @@ where "region_id" => self.region_id, "peer_id" => self.peer.get_id(), ); + if let Some(snap_ctx) = self.apply_snap_ctx.as_mut() { + snap_ctx.destroy_peer_after_apply = true; + } return None; } @@ -1312,6 +1441,16 @@ where self.get_store().region() } + #[inline] + pub fn region_buckets_info_mut(&mut self) -> &mut BucketStatsInfo { + &mut self.region_buckets_info + } + + #[inline] + pub fn region_buckets_info(&self) -> &BucketStatsInfo { + &self.region_buckets_info + } + /// Check whether the peer can be hibernated. /// /// This should be used with `check_after_tick` to get a correct conclusion. 
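The replication-mode hunks above split what used to be a single boolean into two independent decisions: whether to enable group commit now, and whether to compute commit-group ids. Restated as a standalone table for readability (this helper is not part of the patch; it assumes kvproto's `DrAutoSyncState` and covers only the dr-auto-sync branch, while the Majority branch stays at (false, false)):

// Mirrors the (enable_group_commit, calculate_group_id) pairs chosen in
// switch_replication_mode above.
use kvproto::replication_modepb::DrAutoSyncState;

fn group_commit_flags(state: DrAutoSyncState) -> (bool, bool) {
    match state {
        // Fully async: no group commit, no need for group ids.
        DrAutoSyncState::Async => (false, false),
        // Catching up logs: keep group commit off, but pre-compute group
        // ids so it can be switched on cheaply once recovery completes.
        DrAutoSyncState::SyncRecover => (false, true),
        // Sync phases: group commit on, group ids maintained.
        _ => (true, true),
    }
}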
@@ -1507,6 +1646,13 @@ where self.apply_snap_ctx.is_some() || self.get_store().is_applying_snapshot() } + #[inline] + pub fn should_destroy_after_apply_snapshot(&self) -> bool { + self.apply_snap_ctx + .as_ref() + .map_or(false, |ctx| ctx.destroy_peer_after_apply) + } + /// Returns `true` if the raft group has replicated a snapshot but not /// committed it yet. #[inline] @@ -2001,7 +2147,6 @@ where self.leader_missing_time = None; return StaleState::Valid; } - let naive_peer = !self.is_initialized() || !self.raft_group.raft.promotable(); // Updates the `leader_missing_time` according to the current state. // // If we are checking this it means we suspect the leader might be missing. @@ -2021,13 +2166,18 @@ where StaleState::ToValidate } Some(instant) - if instant.saturating_elapsed() >= ctx.cfg.abnormal_leader_missing_duration.0 - && !naive_peer => + if instant.saturating_elapsed() >= ctx.cfg.abnormal_leader_missing_duration.0 => { // A peer is considered as in the leader missing state // if it's initialized but is isolated from its leader or // something bad happens that the raft group can not elect a leader. - StaleState::LeaderMissing + if self.is_initialized() && self.raft_group.raft.promotable() { + StaleState::LeaderMissing + } else { + // Uninitialized peer and learner may not have leader info, + // even if there is a valid leader. + StaleState::MaybeLeaderMissing + } } _ => StaleState::Valid, } @@ -2544,9 +2694,10 @@ where if let Some(hs) = ready.hs() { let pre_commit_index = self.get_store().commit_index(); - assert!(hs.get_commit() >= pre_commit_index); + let cur_commit_index = hs.get_commit(); + assert!(cur_commit_index >= pre_commit_index); if self.is_leader() { - self.on_leader_commit_idx_changed(pre_commit_index, hs.get_commit()); + self.on_leader_commit_idx_changed(pre_commit_index, cur_commit_index); } } @@ -2704,7 +2855,13 @@ where if for_witness { // inform next round to check apply status ctx.router - .send_casual_msg(snap_region.get_id(), CasualMessage::SnapshotApplied) + .send_casual_msg( + snap_region.get_id(), + CasualMessage::SnapshotApplied { + peer_id: self.peer.get_id(), + tombstone: false, + }, + ) .unwrap(); } // When applying snapshot, there is no log applied and not compacted yet. @@ -2720,6 +2877,7 @@ where destroy_regions, for_witness, }), + destroy_peer_after_apply: false, }); if self.last_compacted_idx == 0 && last_first_index >= RAFT_INIT_LOG_INDEX { // There may be stale logs in raft engine, so schedule a task to clean it @@ -2840,7 +2998,10 @@ where commit_term, committed_entries, cbs, - self.region_buckets.as_ref().map(|b| b.meta.clone()), + self.region_buckets_info() + .bucket_stat() + .as_ref() + .map(|b| b.meta.clone()), ); apply.on_schedule(&ctx.raft_metrics); self.mut_store() @@ -3340,8 +3501,8 @@ where self.peer_stat.written_keys += apply_metrics.written_keys; self.peer_stat.written_bytes += apply_metrics.written_bytes; self.delete_keys_hint += apply_metrics.delete_keys_hint; - let diff = self.size_diff_hint as i64 + apply_metrics.size_diff_hint; - self.size_diff_hint = cmp::max(diff, 0) as u64; + self.split_check_trigger + .add_size_diff(apply_metrics.size_diff_hint); if self.has_pending_snapshot() && self.ready_to_handle_pending_snap() { has_ready = true; @@ -3373,17 +3534,14 @@ where } pub fn post_split(&mut self) { - // Reset delete_keys_hint and size_diff_hint. 
self.delete_keys_hint = 0; - self.size_diff_hint = 0; + self.split_check_trigger.post_split(); + self.reset_region_buckets(); } pub fn reset_region_buckets(&mut self) { - if self.region_buckets.is_some() { - self.last_region_buckets = self.region_buckets.take(); - self.region_buckets = None; - } + self.region_buckets_info_mut().set_bucket_stat(None); } /// Try to renew leader lease. @@ -3508,7 +3666,7 @@ where self.check_normal_proposal_with_disk_full_opt(ctx, disk_full_opt) .and_then(|_| self.propose_normal(ctx, req)) } - Ok(RequestPolicy::ProposeConfChange) => self.propose_conf_change(ctx, &req), + Ok(RequestPolicy::ProposeConfChange) => self.propose_conf_change(ctx, req), Err(e) => Err(e), }; fail_point!("after_propose"); @@ -4237,7 +4395,9 @@ where // Should not propose normal in force leader state. // In `pre_propose_raft_command`, it rejects all the requests expect conf-change // if in force leader state. - if self.force_leader.is_some() { + if self.force_leader.is_some() + && req.get_admin_request().get_cmd_type() != AdminCmdType::RollbackMerge + { poll_ctx.raft_metrics.invalid_proposal.force_leader.inc(); panic!( "{} propose normal in force leader state {:?}", @@ -4494,9 +4654,23 @@ where req: RaftCmdRequest, cb: Callback, ) -> bool { + let transfer_leader = get_transfer_leader_cmd(&req).unwrap(); + if let Err(err) = ctx + .coprocessor_host + .pre_transfer_leader(self.region(), transfer_leader) + { + warn!("Coprocessor rejected transfer leader."; "err" => ?err, + "region_id" => self.region_id, + "peer_id" => self.peer.get_id(), + "transferee" => transfer_leader.get_peer().get_id()); + let mut resp = RaftCmdResponse::new(); + *resp.mut_header().mut_error() = Error::from(err).into(); + cb.invoke_with_response(resp); + return false; + } + ctx.raft_metrics.propose.transfer_leader.inc(); - let transfer_leader = get_transfer_leader_cmd(&req).unwrap(); let prs = self.raft_group.raft.prs(); let (_, peers) = transfer_leader @@ -4549,7 +4723,7 @@ where fn propose_conf_change( &mut self, ctx: &mut PollContext, - req: &RaftCmdRequest, + mut req: RaftCmdRequest, ) -> Result> { if self.pending_merge_state.is_some() { return Err(Error::ProposalInMergingMode(self.region_id)); @@ -4577,7 +4751,24 @@ where self.term() )); } - if let Some(index) = self.cmd_epoch_checker.propose_check_epoch(req, self.term()) { + + if let Err(err) = ctx.coprocessor_host.pre_propose(self.region(), &mut req) { + warn!("Coprocessor rejected proposing conf change."; + "err" => ?err, + "region_id" => self.region_id, + "peer_id" => self.peer.get_id(), + ); + return Err(box_err!( + "{} rejected by coprocessor(reason = {})", + self.tag, + err + )); + } + + if let Some(index) = self + .cmd_epoch_checker + .propose_check_epoch(&req, self.term()) + { return Ok(Either::Right(index)); } @@ -4621,6 +4812,7 @@ where changes.as_ref(), &cc, self.is_in_force_leader(), + &self.peer_heartbeats, )?; ctx.raft_metrics.propose.conf_change.inc(); @@ -4698,7 +4890,11 @@ where let mut resp = reader.execute(&req, &Arc::new(region), read_index, None); if let Some(snap) = resp.snapshot.as_mut() { snap.txn_ext = Some(self.txn_ext.clone()); - snap.bucket_meta = self.region_buckets.as_ref().map(|b| b.meta.clone()); + snap.bucket_meta = self + .region_buckets_info() + .bucket_stat() + .as_ref() + .map(|s| s.meta.clone()); } resp.txn_extra_op = self.txn_extra_op.load(); cmd_resp::bind_term(&mut resp.response, self.term()); @@ -5009,10 +5205,31 @@ where } pub fn snapshot_recovery_maybe_finish_wait_apply(&mut self, force: bool) { - if let 
Some(SnapshotRecoveryState::WaitLogApplyToLast { target_index, .. }) = - &self.snapshot_recovery_state + if let Some(SnapshotBrState::WaitLogApplyToLast { + target_index, + valid_for_term, + .. + }) = &self.snapshot_recovery_state { - if self.raft_group.raft.term != self.raft_group.raft.raft_log.last_term() { + if valid_for_term + .map(|vt| vt != self.raft_group.raft.term) + .unwrap_or(false) + { + info!("leadership changed, aborting syncer because required."; "region_id" => self.region().id); + match self.snapshot_recovery_state.take() { + Some(SnapshotBrState::WaitLogApplyToLast { + syncer, + valid_for_term, + .. + }) => { + syncer.abort(AbortReason::StaleCommand { + region_id: self.region().get_id(), + expected_term: valid_for_term.unwrap_or_default(), + current_term: self.raft_group.raft.term, + }); + } + _ => unreachable!(), + }; return; } @@ -5031,6 +5248,37 @@ where } } } + + pub fn update_last_leader_committed_idx(&mut self, committed_index: u64) { + if self.is_leader() { + // Ignore. + return; + } + + let local_committed_index = self.get_store().commit_index(); + if committed_index < local_committed_index { + warn!( + "stale committed index"; + "region_id" => self.region().get_id(), + "peer_id" => self.peer_id(), + "last_committed_index" => committed_index, + "local_index" => local_committed_index, + ); + } else { + self.last_leader_committed_idx = Some(committed_index); + debug!( + "update last committed index from leader"; + "region_id" => self.region().get_id(), + "peer_id" => self.peer_id(), + "last_committed_index" => committed_index, + "local_index" => local_committed_index, + ); + } + } + + pub fn needs_update_last_leader_committed_idx(&self) -> bool { + self.busy_on_apply.is_some() && self.last_leader_committed_idx.is_none() + } } #[derive(Default, Debug)] @@ -5047,6 +5295,15 @@ impl DiskFullPeers { pub fn majority(&self) -> bool { self.majority } + pub fn set_majority(&mut self, majority: bool) { + self.majority = majority; + } + pub fn peers(&self) -> &HashMap { + &self.peers + } + pub fn peers_mut(&mut self) -> &mut HashMap { + &mut self.peers + } pub fn has(&self, peer_id: u64) -> bool { !self.peers.is_empty() && self.peers.contains_key(&peer_id) } @@ -5127,7 +5384,7 @@ where // should enable group commit to promise `IntegrityOverLabel`. then safe // to switch to the `Sync` phase. 
if self.dr_auto_sync_state == DrAutoSyncState::SyncRecover { - self.switch_group_commit(true, &ctx.global_replication_state) + self.switch_group_commit(true, true, &ctx.global_replication_state) } self.replication_sync = true; } @@ -5178,8 +5435,8 @@ where pending_peers: self.collect_pending_peers(ctx), written_bytes: self.peer_stat.written_bytes, written_keys: self.peer_stat.written_keys, - approximate_size: self.approximate_size, - approximate_keys: self.approximate_keys, + approximate_size: self.split_check_trigger.approximate_size, + approximate_keys: self.split_check_trigger.approximate_keys, replication_status: self.region_replication_status(ctx), wait_data_peers: self.wait_data_peers.clone(), }); @@ -5309,9 +5566,17 @@ where &mut self, ctx: &mut PollContext, ) { - if self.check_stale_conf_ver < self.region().get_region_epoch().get_conf_ver() { + ctx.raft_metrics.check_stale_peer.inc(); + if self.check_stale_conf_ver < self.region().get_region_epoch().get_conf_ver() + || self.region().get_region_epoch().get_conf_ver() == 0 + { self.check_stale_conf_ver = self.region().get_region_epoch().get_conf_ver(); self.check_stale_peers = self.region().get_peers().to_vec(); + if let Some(create_by_peer) = self.create_by_peer.as_ref() { + // Push create_by_peer in case the peer is removed before + // initialization which has no peer in region. + self.check_stale_peers.push(create_by_peer.clone()); + } } for peer in &self.check_stale_peers { if peer.get_id() == self.peer_id() { @@ -5667,9 +5932,9 @@ mod memtrace { ER: RaftEngine, { pub fn proposal_size(&self) -> usize { - let mut heap_size = self.pending_reads.heap_size(); + let mut heap_size = self.pending_reads.approximate_heap_size(); for prop in &self.proposals.queue { - heap_size += prop.heap_size(); + heap_size += prop.approximate_heap_size(); } heap_size } diff --git a/components/raftstore/src/store/peer_storage.rs b/components/raftstore/src/store/peer_storage.rs index d89eafc3a46..dd261ebb13a 100644 --- a/components/raftstore/src/store/peer_storage.rs +++ b/components/raftstore/src/store/peer_storage.rs @@ -507,6 +507,8 @@ where *snap_state = SnapState::Relax; *tried_cnt = 0; if self.validate_snap(&s, request_index) { + info!("start sending snapshot"; "region_id" => self.region.get_id(), + "peer_id" => self.peer_id, "request_peer" => to,); return Ok(s); } } @@ -793,8 +795,9 @@ where } else if s == JOB_STATUS_CANCELLED { SnapState::ApplyAborted } else if s == JOB_STATUS_FAILED { - // TODO: cleanup region and treat it as tombstone. - panic!("{} applying snapshot failed", self.tag,); + // Cleanup region and treat it as tombstone. + warn!("{} applying snapshot failed", self.tag); + SnapState::ApplyAborted } else { return CheckApplyingSnapStatus::Applying; } @@ -1017,6 +1020,9 @@ where // The `region` is updated after persisting in order to stay consistent with the // one in `StoreMeta::regions` (will be updated soon). // See comments in `apply_snapshot` for more details. 
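+ // The immediately-invoked closure below is the usual way to scope a
+ // `fail_point!` to a single statement: tests can pause or fail peer 3
+ // right before `set_region`, and it does nothing when not enabled.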
+ (|| { + fail_point!("before_set_region_on_peer_3", self.peer_id == 3, |_| {}); + })(); self.set_region(res.region.clone()); } } @@ -2024,8 +2030,8 @@ pub mod tests { s.snap_state = RefCell::new(SnapState::Applying(Arc::new(AtomicUsize::new( JOB_STATUS_FAILED, )))); - let res = panic_hook::recover_safe(|| s.cancel_applying_snap()); - res.unwrap_err(); + assert!(s.cancel_applying_snap()); + assert_eq!(*s.snap_state.borrow(), SnapState::ApplyAborted); } #[test] @@ -2074,8 +2080,8 @@ pub mod tests { s.snap_state = RefCell::new(SnapState::Applying(Arc::new(AtomicUsize::new( JOB_STATUS_FAILED, )))); - let res = panic_hook::recover_safe(|| s.check_applying_snap()); - res.unwrap_err(); + assert!(s.cancel_applying_snap()); + assert_eq!(*s.snap_state.borrow(), SnapState::ApplyAborted); } #[test] diff --git a/components/raftstore/src/store/read_queue.rs b/components/raftstore/src/store/read_queue.rs index 376f168c26d..bde49b4ed30 100644 --- a/components/raftstore/src/store/read_queue.rs +++ b/components/raftstore/src/store/read_queue.rs @@ -46,7 +46,7 @@ impl ReadIndexRequest { pub fn push_command(&mut self, req: RaftCmdRequest, cb: C, read_index: u64) { RAFT_READ_INDEX_PENDING_COUNT.inc(); - self.cmds_heap_size += req.heap_size(); + self.cmds_heap_size += req.approximate_heap_size(); self.cmds.push((req, cb, Some(read_index))); } @@ -54,7 +54,7 @@ impl ReadIndexRequest { RAFT_READ_INDEX_PENDING_COUNT.inc(); // Ignore heap allocations for `Callback`. - let cmds_heap_size = req.heap_size(); + let cmds_heap_size = req.approximate_heap_size(); let mut cmds = MustConsumeVec::with_capacity("callback of index read", 1); cmds.push((req, cb, None)); @@ -434,10 +434,10 @@ mod memtrace { use super::*; impl HeapSize for ReadIndexRequest { - fn heap_size(&self) -> usize { + fn approximate_heap_size(&self) -> usize { let mut size = self.cmds_heap_size + Self::CMD_SIZE * self.cmds.capacity(); if let Some(ref add) = self.addition_request { - size += add.heap_size(); + size += add.approximate_heap_size(); } size } @@ -445,12 +445,12 @@ mod memtrace { impl HeapSize for ReadIndexQueue { #[inline] - fn heap_size(&self) -> usize { + fn approximate_heap_size(&self) -> usize { let mut size = self.reads.capacity() * mem::size_of::>() // For one Uuid and one usize. + 24 * self.contexts.len(); for read in &self.reads { - size += read.heap_size(); + size += read.approximate_heap_size(); } size } diff --git a/components/raftstore/src/store/simple_write.rs b/components/raftstore/src/store/simple_write.rs index a303a586935..dd461e61867 100644 --- a/components/raftstore/src/store/simple_write.rs +++ b/components/raftstore/src/store/simple_write.rs @@ -49,7 +49,6 @@ where channels: Vec, size_limit: usize, write_type: WriteType, - notify_proposed: bool, } impl SimpleWriteReqEncoder @@ -57,14 +56,10 @@ where C: ErrorCallback + WriteCallback, { /// Create a request encoder. - /// - /// If `notify_proposed` is true, channels will be called `notify_proposed` - /// when it's appended. 
pub fn new( header: Box, bin: SimpleWriteBinary, size_limit: usize, - notify_proposed: bool, ) -> SimpleWriteReqEncoder { let mut buf = Vec::with_capacity(256); buf.push(MAGIC_PREFIX); @@ -77,7 +72,6 @@ where channels: vec![], size_limit, write_type: bin.write_type, - notify_proposed, } } @@ -112,18 +106,10 @@ where } #[inline] - pub fn add_response_channel(&mut self, mut ch: C) { - if self.notify_proposed { - ch.notify_proposed(); - } + pub fn add_response_channel(&mut self, ch: C) { self.channels.push(ch); } - #[inline] - pub fn notify_proposed(&self) -> bool { - self.notify_proposed - } - #[inline] pub fn header(&self) -> &RaftRequestHeader { &self.header @@ -558,7 +544,6 @@ mod tests { header.clone(), bin, usize::MAX, - false, ); let mut encoder = SimpleWriteEncoder::with_capacity(512); @@ -570,7 +555,6 @@ mod tests { header.clone(), bin, 0, - false, ); let (bytes, _) = req_encoder.encode(); @@ -619,9 +603,8 @@ mod tests { .collect(); encoder.ingest(exp.clone()); let bin = encoder.encode(); - let req_encoder = SimpleWriteReqEncoder::>::new( - header, bin, 0, false, - ); + let req_encoder = + SimpleWriteReqEncoder::>::new(header, bin, 0); let (bytes, _) = req_encoder.encode(); let mut decoder = SimpleWriteReqDecoder::new(decoder_fallback, &logger, &bytes, 0, 0).unwrap(); @@ -683,7 +666,6 @@ mod tests { header.clone(), bin.clone(), 512, - false, ); let mut header2 = Box::::default(); @@ -700,7 +682,6 @@ mod tests { header.clone(), bin2.clone(), 512, - false, ); assert!(!req_encoder2.amend(&header, &bin)); @@ -735,7 +716,6 @@ mod tests { header.clone(), SimpleWriteEncoder::with_capacity(512).encode(), 512, - false, ); let (bin, _) = req_encoder.encode(); assert_eq!( @@ -753,7 +733,6 @@ mod tests { header.clone(), encoder.encode(), 512, - false, ); let (bin, _) = req_encoder.encode(); let req = SimpleWriteReqDecoder::new(decoder_fallback, &logger, &bin, 0, 0) @@ -771,7 +750,6 @@ mod tests { header.clone(), encoder.encode(), 512, - false, ); let (bin, _) = req_encoder.encode(); let req = SimpleWriteReqDecoder::new(decoder_fallback, &logger, &bin, 0, 0) @@ -788,7 +766,6 @@ mod tests { header.clone(), encoder.encode(), 512, - false, ); let (bin, _) = req_encoder.encode(); let req = SimpleWriteReqDecoder::new(decoder_fallback, &logger, &bin, 0, 0) @@ -816,7 +793,6 @@ mod tests { header, encoder.encode(), 512, - false, ); let (bin, _) = req_encoder.encode(); let req = SimpleWriteReqDecoder::new(decoder_fallback, &logger, &bin, 0, 0) diff --git a/components/raftstore/src/store/snap.rs b/components/raftstore/src/store/snap.rs index 6fe21fe9750..6ec7111e09a 100644 --- a/components/raftstore/src/store/snap.rs +++ b/components/raftstore/src/store/snap.rs @@ -92,6 +92,12 @@ impl From for Error { } } +impl From for Error { + fn from(e: engine_traits::Error) -> Self { + Error::Other(Box::new(e)) + } +} + pub type Result = result::Result; impl ErrorCodeExt for Error { @@ -712,21 +718,11 @@ impl Snapshot { )); } if meta.get_size() != 0 { - let file_path = self.cf_files[cf_idx].add_file_with_size_checksum( + let _ = self.cf_files[cf_idx].add_file_with_size_checksum( file_idx, meta.get_size(), meta.get_checksum(), ); - if file_exists(&file_path) { - let mgr = self.mgr.encryption_key_manager.as_ref(); - let file_path = Path::new(&file_path); - let (_, size) = calc_checksum_and_size(file_path, mgr)?; - check_file_size( - size, - *(self.cf_files[cf_idx].size.last().unwrap()), - file_path, - )?; - } } file_idx += 1; if file_idx >= cf_file_count_from_meta[cf_idx] { @@ -873,8 +869,13 @@ impl Snapshot { 
self.switch_to_cf_file(cf)?; let cf_file = &mut self.cf_files[self.cf_index]; let cf_stat = if plain_file_used(cf_file.cf) { - let key_mgr = self.mgr.encryption_key_manager.as_ref(); - snap_io::build_plain_cf_file::(cf_file, key_mgr, kv_snap, &begin_key, &end_key)? + snap_io::build_plain_cf_file::( + cf_file, + self.mgr.encryption_key_manager.as_ref(), + kv_snap, + &begin_key, + &end_key, + )? } else { snap_io::build_sst_cf_file_list::( cf_file, @@ -885,6 +886,7 @@ impl Snapshot { self.mgr .get_actual_max_per_file_size(allow_multi_files_snapshot), &self.mgr.limiter, + self.mgr.encryption_key_manager.clone(), )? }; SNAPSHOT_LIMIT_GENERATE_BYTES.inc_by(cf_stat.total_size as u64); @@ -1212,7 +1214,7 @@ impl Snapshot { if file_for_recving.written_size != cf_file.size[i] { return Err(io::Error::new( - ErrorKind::Other, + ErrorKind::InvalidData, format!( "snapshot file {} for cf {} size mismatches, \ real size {}, expected size {}", @@ -1227,7 +1229,7 @@ impl Snapshot { let checksum = file_for_recving.write_digest.finalize(); if checksum != cf_file.checksum[i] { return Err(io::Error::new( - ErrorKind::Other, + ErrorKind::InvalidData, format!( "snapshot file {} for cf {} checksum \ mismatches, real checksum {}, expected \ @@ -1698,6 +1700,20 @@ impl SnapManager { Ok(Box::new(s)) } + pub fn meta_file_exist(&self, key: &SnapKey) -> RaftStoreResult<()> { + let _lock = self.core.registry.rl(); + let base = &self.core.base; + // Use CheckPolicy::None to avoid reading meta file + let s = Snapshot::new(base, key, false, CheckPolicy::None, &self.core)?; + if !file_exists(s.meta_file.path.as_path()) { + return Err(RaftStoreError::Other(From::from(format!( + "snapshot of {:?} not exists.", + key + )))); + } + Ok(()) + } + /// Get the approximate size of snap file exists in snap directory. /// /// Return value is not guaranteed to be accurate. @@ -2760,26 +2776,6 @@ pub mod tests { assert!(s2.exists()); } - // Make all the snapshot in the specified dir corrupted to have incorrect size. - fn corrupt_snapshot_size_in>(dir: T) { - let dir_path = dir.into(); - let read_dir = file_system::read_dir(dir_path).unwrap(); - for p in read_dir { - if p.is_ok() { - let e = p.as_ref().unwrap(); - if !e - .file_name() - .into_string() - .unwrap() - .ends_with(META_FILE_SUFFIX) - { - let mut f = OpenOptions::new().append(true).open(e.path()).unwrap(); - f.write_all(b"xxxxx").unwrap(); - } - } - } - } - // Make all the snapshot in the specified dir corrupted to have incorrect // checksum. 
fn corrupt_snapshot_checksum_in>(dir: T) -> Vec { @@ -2880,7 +2876,7 @@ pub mod tests { } #[test] - fn test_snap_corruption_on_size_or_checksum() { + fn test_snap_corruption_on_checksum() { let region_id = 1; let region = gen_test_region(region_id, 1, 1); let db_dir = Builder::new() @@ -2899,22 +2895,11 @@ pub mod tests { let mut s1 = Snapshot::new_for_building(dir.path(), &key, &mgr_core).unwrap(); assert!(!s1.exists()); - let _ = s1 + let snap_data = s1 .build(&db, &snapshot, ®ion, true, false, UnixSecs::now()) .unwrap(); assert!(s1.exists()); - corrupt_snapshot_size_in(dir.path()); - - Snapshot::new_for_sending(dir.path(), &key, &mgr_core).unwrap_err(); - - let mut s2 = Snapshot::new_for_building(dir.path(), &key, &mgr_core).unwrap(); - assert!(!s2.exists()); - let snap_data = s2 - .build(&db, &snapshot, ®ion, true, false, UnixSecs::now()) - .unwrap(); - assert!(s2.exists()); - let dst_dir = Builder::new() .prefix("test-snap-corruption-dst") .tempdir() @@ -2927,12 +2912,11 @@ pub mod tests { snap_data.get_meta().clone(), ); - let mut metas = corrupt_snapshot_checksum_in(dst_dir.path()); + let metas = corrupt_snapshot_checksum_in(dst_dir.path()); assert_eq!(1, metas.len()); - let snap_meta = metas.pop().unwrap(); - let mut s5 = Snapshot::new_for_applying(dst_dir.path(), &key, &mgr_core).unwrap(); - assert!(s5.exists()); + let mut s2 = Snapshot::new_for_applying(dst_dir.path(), &key, &mgr_core).unwrap(); + assert!(s2.exists()); let dst_db_dir = Builder::new() .prefix("test-snap-corruption-dst-db") @@ -2947,11 +2931,7 @@ pub mod tests { coprocessor_host: CoprocessorHost::::default(), ingest_copy_symlink: false, }; - s5.apply(options).unwrap_err(); - - corrupt_snapshot_size_in(dst_dir.path()); - Snapshot::new_for_receiving(dst_dir.path(), &key, &mgr_core, snap_meta).unwrap_err(); - Snapshot::new_for_applying(dst_dir.path(), &key, &mgr_core).unwrap_err(); + s2.apply(options).unwrap_err(); } #[test] diff --git a/components/raftstore/src/store/snap/io.rs b/components/raftstore/src/store/snap/io.rs index 3cdee1e40f1..837ae7176fb 100644 --- a/components/raftstore/src/store/snap/io.rs +++ b/components/raftstore/src/store/snap/io.rs @@ -13,13 +13,14 @@ use encryption::{ }; use engine_traits::{ CfName, EncryptionKeyManager, Error as EngineError, Iterable, KvEngine, Mutable, - SstCompressionType, SstWriter, SstWriterBuilder, WriteBatch, + SstCompressionType, SstReader, SstWriter, SstWriterBuilder, WriteBatch, }; +use fail::fail_point; use kvproto::encryptionpb::EncryptionMethod; use tikv_util::{ box_try, codec::bytes::{BytesEncoder, CompactBytesFromFileDecoder}, - debug, info, + debug, error, info, time::{Instant, Limiter}, }; @@ -116,6 +117,7 @@ pub fn build_sst_cf_file_list( end_key: &[u8], raw_size_per_file: u64, io_limiter: &Limiter, + key_mgr: Option>, ) -> Result where E: KvEngine, @@ -133,6 +135,53 @@ where let sst_writer = RefCell::new(create_sst_file_writer::(engine, cf, &path)?); let mut file_length: usize = 0; + let finish_sst_writer = |sst_writer: E::SstWriter, + path: String, + key_mgr: Option>| + -> Result<(), Error> { + sst_writer.finish()?; + (|| { + fail_point!("inject_sst_file_corruption", |_| { + static CALLED: std::sync::atomic::AtomicBool = + std::sync::atomic::AtomicBool::new(false); + if CALLED + .compare_exchange( + false, + true, + std::sync::atomic::Ordering::SeqCst, + std::sync::atomic::Ordering::SeqCst, + ) + .is_err() + { + return; + } + // overwrite the file to break checksum + let mut f = OpenOptions::new().write(true).open(&path).unwrap(); + 
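+ // The file is opened for in-place write (not append), so stamping one
+ // byte below breaks the block checksum while leaving the file length,
+ // and hence any size bookkeeping, untouched.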
f.write_all(b"x").unwrap(); + }); + })(); + + let sst_reader = if let Some(mgr) = key_mgr { + E::SstReader::open_encrypted(&path, mgr)? + } else { + E::SstReader::open(&path)? + }; + + if let Err(e) = sst_reader.verify_checksum() { + // use sst reader to verify block checksum, it would detect corrupted SST due to + // memory bit-flip + fs::remove_file(&path)?; + error!( + "failed to pass block checksum verification"; + "file" => path, + "err" => ?e, + ); + return Err(io::Error::new(io::ErrorKind::InvalidData, e).into()); + } + File::open(&path).and_then(|f| f.sync_all())?; + Ok(()) + }; + let instant = Instant::now(); box_try!(snap.scan(cf, start_key, end_key, false, |key, value| { let entry_len = key.len() + value.len(); @@ -151,8 +200,7 @@ where match result { Ok(new_sst_writer) => { let old_writer = sst_writer.replace(new_sst_writer); - box_try!(old_writer.finish()); - box_try!(File::open(prev_path).and_then(|f| f.sync_all())); + box_try!(finish_sst_writer(old_writer, prev_path, key_mgr.clone())); } Err(e) => { let io_error = io::Error::new(io::ErrorKind::Other, e); @@ -178,9 +226,8 @@ where Ok(true) })); if stats.key_count > 0 { + box_try!(finish_sst_writer(sst_writer.into_inner(), path, key_mgr)); cf_file.add_file(file_id); - box_try!(sst_writer.into_inner().finish()); - box_try!(File::open(path).and_then(|f| f.sync_all())); info!( "build_sst_cf_file_list builds {} files in cf {}. Total keys {}, total size {}. raw_size_per_file {}, total takes {:?}", file_id + 1, @@ -427,6 +474,7 @@ mod tests { &keys::data_key(b"z"), *max_file_size, &limiter, + db_opt.as_ref().and_then(|opt| opt.get_key_manager()), ) .unwrap(); if stats.key_count == 0 { diff --git a/components/raftstore/src/store/snapshot_backup.rs b/components/raftstore/src/store/snapshot_backup.rs new file mode 100644 index 00000000000..710ac281e8a --- /dev/null +++ b/components/raftstore/src/store/snapshot_backup.rs @@ -0,0 +1,391 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. + +use std::{ + sync::{ + atomic::{AtomicBool, AtomicU64, Ordering}, + Arc, Mutex, + }, + time::Duration, +}; + +use engine_traits::{KvEngine, RaftEngine}; +use futures::channel::mpsc::UnboundedSender; +use kvproto::{brpb::CheckAdminResponse, metapb::RegionEpoch, raft_cmdpb::AdminCmdType}; +use tikv_util::{info, warn}; +use tokio::sync::oneshot; + +use super::{metrics, PeerMsg, RaftRouter, SignificantMsg, SignificantRouter}; +use crate::coprocessor::{ + AdminObserver, BoxAdminObserver, BoxQueryObserver, Coprocessor, CoprocessorHost, + Error as CopError, QueryObserver, +}; + +fn epoch_second_coarse() -> u64 { + let spec = tikv_util::time::monotonic_coarse_now(); + spec.sec as u64 +} + +#[derive(Debug, Clone)] +pub struct SnapshotBrWaitApplyRequest { + pub syncer: SnapshotBrWaitApplySyncer, + pub expected_epoch: Option, + pub abort_when_term_change: bool, +} + +impl SnapshotBrWaitApplyRequest { + /// Create a "relax" request for waiting apply. + /// This only waits to the last index, without checking the region epoch or + /// leadership migrating. + pub fn relaxed(syncer: SnapshotBrWaitApplySyncer) -> Self { + Self { + syncer, + expected_epoch: None, + abort_when_term_change: false, + } + } + + /// Create a "strict" request for waiting apply. + /// This will wait to last applied index, and aborts if the region epoch not + /// match or the last index may not be committed. 
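+ /// Compared with [`Self::relaxed`], this is for flows that must notice
+ /// leadership changes: `abort_when_term_change` turns a term switch into
+ /// an abort instead of a success report.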
+ pub fn strict(syncer: SnapshotBrWaitApplySyncer, epoch: RegionEpoch) -> Self { + Self { + syncer, + expected_epoch: Some(epoch), + abort_when_term_change: true, + } + } +} + +pub trait SnapshotBrHandle: Sync + Send + Clone { + fn send_wait_apply(&self, region: u64, req: SnapshotBrWaitApplyRequest) -> crate::Result<()>; + fn broadcast_wait_apply(&self, req: SnapshotBrWaitApplyRequest) -> crate::Result<()>; + fn broadcast_check_pending_admin( + &self, + tx: UnboundedSender, + ) -> crate::Result<()>; +} + +impl SnapshotBrHandle for Arc>> { + fn send_wait_apply(&self, region: u64, req: SnapshotBrWaitApplyRequest) -> crate::Result<()> { + let msg = SignificantMsg::SnapshotBrWaitApply(req); + metrics::SNAP_BR_WAIT_APPLY_EVENT.sent.inc(); + self.lock().unwrap().significant_send(region, msg) + } + + fn broadcast_wait_apply(&self, req: SnapshotBrWaitApplyRequest) -> crate::Result<()> { + let msg_gen = || { + metrics::SNAP_BR_WAIT_APPLY_EVENT.sent.inc(); + PeerMsg::SignificantMsg(Box::new(SignificantMsg::SnapshotBrWaitApply(req.clone()))) + }; + self.lock().unwrap().broadcast_normal(msg_gen); + Ok(()) + } + + fn broadcast_check_pending_admin( + &self, + tx: UnboundedSender, + ) -> crate::Result<()> { + self.lock().unwrap().broadcast_normal(|| { + PeerMsg::SignificantMsg(Box::new(SignificantMsg::CheckPendingAdmin(tx.clone()))) + }); + Ok(()) + } +} + +#[derive(Default)] +pub struct PrepareDiskSnapObserver { + before: AtomicU64, + initialized: AtomicBool, +} + +impl PrepareDiskSnapObserver { + pub fn register_to(self: &Arc, coprocessor_host: &mut CoprocessorHost) { + let reg = &mut coprocessor_host.registry; + reg.register_query_observer(0, BoxQueryObserver::new(Arc::clone(self))); + reg.register_admin_observer(0, BoxAdminObserver::new(Arc::clone(self))); + info!("registered reject ingest and admin coprocessor to TiKV."); + } + + pub fn remained_secs(&self) -> u64 { + self.before + .load(Ordering::Acquire) + .saturating_sub(epoch_second_coarse()) + } + + fn reject(&self) -> CopError { + CopError::RequireDelay { + after: Duration::from_secs(self.remained_secs()), + reason: + "[Suspended] Preparing disk snapshot backup, ingests and some of admin commands are suspended." + .to_owned(), + } + } + + pub fn allowed(&self) -> bool { + let mut v = self.before.load(Ordering::Acquire); + if v == 0 { + return true; + } + let mut expired = v < epoch_second_coarse(); + while expired { + match self + .before + .compare_exchange(v, 0, Ordering::SeqCst, Ordering::SeqCst) + { + Ok(_) => { + metrics::SNAP_BR_SUSPEND_COMMAND_LEASE_UNTIL.set(0); + metrics::SNAP_BR_LEASE_EVENT.expired.inc(); + break; + } + Err(new_val) => { + v = new_val; + expired = v < epoch_second_coarse(); + } + } + } + + expired + } + + pub fn initialized(&self) -> bool { + self.initialized.load(Ordering::Acquire) + } + + /// Extend the lease. + /// + /// # Returns + /// + /// Whether previously there is a lease. 
+ pub fn update_lease(&self, lease: Duration) -> bool { + let mut v = self.before.load(Ordering::SeqCst); + let now = epoch_second_coarse(); + let new_lease = now + lease.as_secs(); + let last_lease_valid = v > now; + while v < new_lease { + let res = self + .before + .fetch_update(Ordering::SeqCst, Ordering::SeqCst, |v| { + if v > new_lease { None } else { Some(new_lease) } + }); + match res { + Ok(_) => { + metrics::SNAP_BR_SUSPEND_COMMAND_LEASE_UNTIL.set(new_lease as _); + break; + } + Err(prev) => v = prev, + } + } + if last_lease_valid { + metrics::SNAP_BR_LEASE_EVENT.renew.inc(); + } else { + metrics::SNAP_BR_LEASE_EVENT.create.inc(); + } + last_lease_valid + } + + pub fn reset(&self) { + self.before.store(0, Ordering::SeqCst); + metrics::SNAP_BR_SUSPEND_COMMAND_LEASE_UNTIL.set(0); + metrics::SNAP_BR_LEASE_EVENT.reset.inc(); + } +} + +impl Coprocessor for Arc { + fn start(&self) { + self.initialized.store(true, Ordering::Release) + } + + fn stop(&self) { + self.initialized.store(false, Ordering::Release) + } +} + +impl QueryObserver for Arc { + fn pre_propose_query( + &self, + cx: &mut crate::coprocessor::ObserverContext<'_>, + reqs: &mut Vec, + ) -> crate::coprocessor::Result<()> { + if self.allowed() { + return Ok(()); + } + for req in reqs { + if req.has_ingest_sst() { + // Note: this will reject the batch of commands, which isn't so effective. + // But we cannot reject proposing a subset of command for now... + cx.bypass = true; + metrics::SNAP_BR_SUSPEND_COMMAND_TYPE + .with_label_values(&["Ingest"]) + .inc(); + return Err(self.reject()); + } + } + Ok(()) + } +} + +impl AdminObserver for Arc { + fn pre_propose_admin( + &self, + _: &mut crate::coprocessor::ObserverContext<'_>, + admin: &mut kvproto::raft_cmdpb::AdminRequest, + ) -> crate::coprocessor::Result<()> { + if self.allowed() { + return Ok(()); + } + // NOTE: We have disabled `CompactLog` here because if the log get truncated, + // we may take a long time to send snapshots during restoring. + // Also note it may impact the TP workload if we are preparing for a long time. + let should_reject = matches!( + admin.get_cmd_type(), + AdminCmdType::Split | + AdminCmdType::BatchSplit | + // We will allow `Commit/RollbackMerge` here because the + // `wait_pending_admin` will wait until the merge get finished. + // If we reject them, they won't be able to see the merge get finished. + // And will finally time out. + AdminCmdType::PrepareMerge | + AdminCmdType::ChangePeer | + AdminCmdType::ChangePeerV2 | + AdminCmdType::BatchSwitchWitness | + AdminCmdType::CompactLog + ); + if should_reject { + metrics::SNAP_BR_SUSPEND_COMMAND_TYPE + .with_label_values(&[&format!("{:?}", admin.get_cmd_type())]) + .inc(); + return Err(self.reject()); + } + Ok(()) + } + + fn pre_transfer_leader( + &self, + _ctx: &mut crate::coprocessor::ObserverContext<'_>, + _tr: &kvproto::raft_cmdpb::TransferLeaderRequest, + ) -> crate::coprocessor::Result<()> { + if self.allowed() { + return Ok(()); + } + metrics::SNAP_BR_SUSPEND_COMMAND_TYPE + .with_label_values(&["TransferLeader"]) + .inc(); + Err(self.reject()) + } +} + +#[derive(Debug)] +struct SyncerCore { + report_id: u64, + feedback: Option>, +} + +#[derive(Debug, PartialEq)] +pub struct SyncReport { + pub report_id: u64, + pub aborted: Option, +} + +impl SyncerCore { + fn new(report_id: u64, feedback: oneshot::Sender) -> Self { + Self { + report_id, + feedback: Some(feedback), + } + } + + fn is_aborted(&self) -> bool { + self.feedback.is_none() + } + + /// Abort this syncer. 
+ /// This will fire a message right now. + /// And disable all clones of this syncer. + /// If already aborted, this will do nothing. + fn abort(&mut self, reason: AbortReason) { + if let Some(ch) = self.feedback.take() { + let report = SyncReport { + report_id: self.report_id, + aborted: Some(reason), + }; + if let Err(report) = ch.send(report) { + warn!("reply waitapply states failure."; "report" => ?report); + } + } + } + + fn make_success_result(&self) -> SyncReport { + SyncReport { + report_id: self.report_id, + aborted: None, + } + } +} + +impl Drop for SyncerCore { + fn drop(&mut self) { + if let Some(ch) = self.feedback.take() { + let report = self.make_success_result(); + if let Err(report) = ch.send(report) { + warn!("reply waitapply states failure."; "report" => ?report); + } + metrics::SNAP_BR_WAIT_APPLY_EVENT.finished.inc() + } else { + warn!("wait apply aborted."; "report" => self.report_id); + } + } +} + +/// A syncer for wait apply. +/// The sender used for constructing this structure will: +/// Be closed, if the `abort` has been called. +/// Send the report id to the caller, if all replicas of this Syncer has been +/// dropped. +#[derive(Debug, Clone)] +pub struct SnapshotBrWaitApplySyncer(Arc>); + +impl SnapshotBrWaitApplySyncer { + pub fn new(report_id: u64, sender: oneshot::Sender) -> Self { + let core = SyncerCore::new(report_id, sender); + Self(Arc::new(Mutex::new(core))) + } + + pub fn abort(self, reason: AbortReason) { + let mut core = self.0.lock().unwrap(); + warn!("aborting wait apply."; "reason" => ?reason, "id" => %core.report_id, "already_aborted" => %core.is_aborted()); + match reason { + AbortReason::EpochNotMatch(_) => { + metrics::SNAP_BR_WAIT_APPLY_EVENT.epoch_not_match.inc() + } + AbortReason::StaleCommand { .. } => { + metrics::SNAP_BR_WAIT_APPLY_EVENT.term_not_match.inc() + } + AbortReason::Duplicated => metrics::SNAP_BR_WAIT_APPLY_EVENT.duplicated.inc(), + } + core.abort(reason); + } +} + +#[derive(Debug, PartialEq)] +pub enum AbortReason { + EpochNotMatch(kvproto::errorpb::EpochNotMatch), + StaleCommand { + expected_term: u64, + current_term: u64, + region_id: u64, + }, + Duplicated, +} + +#[derive(Debug)] +pub enum SnapshotBrState { + // This state is set by the leader peer fsm. Once set, it sync and check leader commit index + // and force forward to last index once follower appended and then it also is checked + // every time this peer applies a the last index, if the last index is met, this state is + // reset / droppeds. The syncer is dropped and send the response to the invoker. + WaitLogApplyToLast { + target_index: u64, + valid_for_term: Option, + syncer: SnapshotBrWaitApplySyncer, + }, +} diff --git a/components/raftstore/src/store/transport.rs b/components/raftstore/src/store/transport.rs index 7f10e7cd249..35761aa5d18 100644 --- a/components/raftstore/src/store/transport.rs +++ b/components/raftstore/src/store/transport.rs @@ -1,7 +1,7 @@ // Copyright 2016 TiKV Project Authors. Licensed under Apache-2.0. 
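Before the transport changes below, a minimal usage sketch for the `SnapshotBrWaitApplySyncer` defined in snapshot_backup.rs above. The async caller and router wiring are assumed for illustration; only the syncer, request, and report types come from the patch.

// Hypothetical caller; assumes the snapshot_backup items above are in scope.
use tokio::sync::oneshot;

async fn wait_apply_one_region<H: SnapshotBrHandle>(
    handle: &H,
    region_id: u64,
    report_id: u64,
) -> bool {
    let (tx, rx) = oneshot::channel();
    let syncer = SnapshotBrWaitApplySyncer::new(report_id, tx);
    // Relaxed mode: wait for the last index only, no epoch or term checks.
    if handle
        .send_wait_apply(region_id, SnapshotBrWaitApplyRequest::relaxed(syncer))
        .is_err()
    {
        return false;
    }
    // Dropping the last clone of the syncer reports success through the
    // channel; an abort arrives as `aborted: Some(reason)` instead.
    matches!(rx.await, Ok(SyncReport { aborted: None, .. }))
}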
// #[PerformanceCriticalPath] -use std::sync::mpsc; +use std::sync::{mpsc, Mutex}; use crossbeam::channel::{SendError, TrySendError}; use engine_traits::{KvEngine, RaftEngine, Snapshot}; @@ -46,6 +46,13 @@ where fn significant_send(&self, region_id: u64, msg: SignificantMsg) -> Result<()>; } +impl<'a, T: SignificantRouter, EK: KvEngine> SignificantRouter for &'a Mutex { + #[inline] + fn significant_send(&self, region_id: u64, msg: SignificantMsg) -> Result<()> { + Mutex::lock(self).unwrap().significant_send(region_id, msg) + } +} + /// Routes proposal to target region. pub trait ProposalRouter where @@ -71,7 +78,10 @@ where { #[inline] fn send(&self, region_id: u64, msg: CasualMessage) -> Result<()> { - match self.router.send(region_id, PeerMsg::CasualMessage(msg)) { + match self + .router + .send(region_id, PeerMsg::CasualMessage(Box::new(msg))) + { Ok(()) => Ok(()), Err(TrySendError::Full(_)) => Err(Error::Transport(DiscardReason::Full)), Err(TrySendError::Disconnected(_)) => Err(Error::RegionNotFound(region_id)), @@ -79,6 +89,13 @@ where } } +impl<'a, EK: KvEngine, T: CasualRouter> CasualRouter for &'a Mutex { + #[inline] + fn send(&self, region_id: u64, msg: CasualMessage) -> Result<()> { + CasualRouter::send(&*Mutex::lock(self).unwrap(), region_id, msg) + } +} + impl SignificantRouter for RaftRouter where EK: KvEngine, @@ -88,7 +105,7 @@ where fn significant_send(&self, region_id: u64, msg: SignificantMsg) -> Result<()> { if let Err(SendError(msg)) = self .router - .force_send(region_id, PeerMsg::SignificantMsg(msg)) + .force_send(region_id, PeerMsg::SignificantMsg(Box::new(msg))) { // TODO: panic here once we can detect system is shutting down reliably. diff --git a/components/raftstore/src/store/unsafe_recovery.rs b/components/raftstore/src/store/unsafe_recovery.rs index f98fcaea581..ab344a26239 100644 --- a/components/raftstore/src/store/unsafe_recovery.rs +++ b/components/raftstore/src/store/unsafe_recovery.rs @@ -2,7 +2,7 @@ use std::{ fmt, mem, - sync::{mpsc::SyncSender, Arc, Mutex}, + sync::{Arc, Mutex}, time::Duration, }; @@ -80,7 +80,9 @@ impl UnsafeRecoveryHandle for Mutex UnsafeRecoveryHandle for Mutex); impl UnsafeRecoveryForceLeaderSyncer { pub fn new(report_id: u64, router: Arc) -> Self { let inner = InvokeClosureOnDrop(Some(Box::new(move || { - info!("Unsafe recovery, force leader finished."); + info!("Unsafe recovery, force leader finished."; "report_id" => report_id); start_unsafe_recovery_report(router, report_id, false); }))); UnsafeRecoveryForceLeaderSyncer(Arc::new(inner)) @@ -260,11 +266,11 @@ impl UnsafeRecoveryExecutePlanSyncer { let abort = Arc::new(Mutex::new(false)); let abort_clone = abort.clone(); let closure = InvokeClosureOnDrop(Some(Box::new(move || { - info!("Unsafe recovery, plan execution finished"); if *abort_clone.lock().unwrap() { - warn!("Unsafe recovery, plan execution aborted"); + warn!("Unsafe recovery, plan execution aborted"; "report_id" => report_id); return; } + info!("Unsafe recovery, plan execution finished"; "report_id" => report_id); start_unsafe_recovery_report(router, report_id, true); }))); UnsafeRecoveryExecutePlanSyncer { @@ -278,40 +284,6 @@ impl UnsafeRecoveryExecutePlanSyncer { *self.abort.lock().unwrap() = true; } } -// Syncer only send to leader in 2nd BR restore -#[derive(Clone, Debug)] -pub struct SnapshotRecoveryWaitApplySyncer { - _closure: Arc, - abort: Arc>, -} - -impl SnapshotRecoveryWaitApplySyncer { - pub fn new(region_id: u64, sender: SyncSender) -> Self { - let thread_safe_router = Mutex::new(sender); - let abort = 
Arc::new(Mutex::new(false)); - let abort_clone = abort.clone(); - let closure = InvokeClosureOnDrop(Some(Box::new(move || { - info!("region {} wait apply finished", region_id); - if *abort_clone.lock().unwrap() { - warn!("wait apply aborted"); - return; - } - let router_ptr = thread_safe_router.lock().unwrap(); - - _ = router_ptr.send(region_id).map_err(|_| { - warn!("reply waitapply states failure."); - }); - }))); - SnapshotRecoveryWaitApplySyncer { - _closure: Arc::new(closure), - abort, - } - } - - pub fn abort(&self) { - *self.abort.lock().unwrap() = true; - } -} #[derive(Clone, Debug)] pub struct UnsafeRecoveryWaitApplySyncer { @@ -330,7 +302,7 @@ impl UnsafeRecoveryWaitApplySyncer { let abort_clone = abort.clone(); let closure = InvokeClosureOnDrop(Some(Box::new(move || { if *abort_clone.lock().unwrap() { - warn!("Unsafe recovery, wait apply aborted"); + warn!("Unsafe recovery, wait apply aborted"; "report_id" => report_id); return; } info!("Unsafe recovery, wait apply finished"); @@ -363,7 +335,7 @@ impl UnsafeRecoveryFillOutReportSyncer { let reports = Arc::new(Mutex::new(vec![])); let reports_clone = reports.clone(); let closure = InvokeClosureOnDrop(Some(Box::new(move || { - info!("Unsafe recovery, peer reports collected"); + info!("Unsafe recovery, peer reports collected"; "report_id" => report_id); let mut store_report = StoreReport::default(); { let mut reports_ptr = reports_clone.lock().unwrap(); @@ -386,19 +358,6 @@ impl UnsafeRecoveryFillOutReportSyncer { } } -#[derive(Debug)] -pub enum SnapshotRecoveryState { - // This state is set by the leader peer fsm. Once set, it sync and check leader commit index - // and force forward to last index once follower appended and then it also is checked - // every time this peer applies a the last index, if the last index is met, this state is - // reset / droppeds. The syncer is droped and send the response to the invoker, triggers - // the next step of recovery process. - WaitLogApplyToLast { - target_index: u64, - syncer: SnapshotRecoveryWaitApplySyncer, - }, -} - #[derive(Debug)] pub enum UnsafeRecoveryState { // Stores the state that is necessary for the wait apply stage of unsafe recovery process. @@ -420,6 +379,9 @@ pub enum UnsafeRecoveryState { }, Destroy(UnsafeRecoveryExecutePlanSyncer), WaitInitialize(UnsafeRecoveryExecutePlanSyncer), + // DemoteFailedVoter may fail due to some reasons. It's just a marker to avoid exiting force + // leader state + Failed, } impl UnsafeRecoveryState { @@ -429,6 +391,7 @@ impl UnsafeRecoveryState { UnsafeRecoveryState::DemoteFailedVoters { syncer, .. } | UnsafeRecoveryState::Destroy(syncer) | UnsafeRecoveryState::WaitInitialize(syncer) => syncer.time, + UnsafeRecoveryState::Failed => return false, }; time.saturating_elapsed() >= timeout } @@ -439,6 +402,7 @@ impl UnsafeRecoveryState { UnsafeRecoveryState::DemoteFailedVoters { syncer, .. } | UnsafeRecoveryState::Destroy(syncer) | UnsafeRecoveryState::WaitInitialize(syncer) => &syncer.abort, + UnsafeRecoveryState::Failed => return true, }; *abort.lock().unwrap() } @@ -449,6 +413,7 @@ impl UnsafeRecoveryState { UnsafeRecoveryState::DemoteFailedVoters { syncer, .. 
} | UnsafeRecoveryState::Destroy(syncer) | UnsafeRecoveryState::WaitInitialize(syncer) => syncer.abort(), + UnsafeRecoveryState::Failed => (), } } } diff --git a/components/raftstore/src/store/util.rs b/components/raftstore/src/store/util.rs index 880a394fdae..e1320f48712 100644 --- a/components/raftstore/src/store/util.rs +++ b/components/raftstore/src/store/util.rs @@ -160,6 +160,20 @@ pub fn new_empty_snapshot( snapshot } +pub fn gen_bucket_version(term: u64, current_version: u64) -> u64 { + // term logical counter + // |-----------|-----------| + // high bits low bits + // term: given 10s election timeout, the 32 bit means 1362 year running time + let current_version_term = current_version >> 32; + let bucket_version: u64 = if current_version_term == term { + current_version + 1 + } else { + term << 32 + }; + bucket_version +} + const STR_CONF_CHANGE_ADD_NODE: &str = "AddNode"; const STR_CONF_CHANGE_REMOVE_NODE: &str = "RemoveNode"; const STR_CONF_CHANGE_ADDLEARNER_NODE: &str = "AddLearner"; @@ -306,7 +320,7 @@ pub fn compare_region_epoch( // tells TiDB with a epoch not match error contains the latest target Region // info, TiDB updates its region cache and sends requests to TiKV B, // and TiKV B has not applied commit merge yet, since the region epoch in - // request is higher than TiKV B, the request must be denied due to epoch + // request is higher than TiKV B, the request must be suspended due to epoch // not match, so it does not read on a stale snapshot, thus avoid the // KeyNotInRegion error. let current_epoch = region.get_region_epoch(); @@ -998,6 +1012,7 @@ pub fn check_conf_change( change_peers: &[ChangePeerRequest], cc: &impl ConfChangeI, ignore_safety: bool, + peer_heartbeats: &collections::HashMap, ) -> Result<()> { let current_progress = node.status().progress.unwrap().clone(); let mut after_progress = current_progress.clone(); @@ -1081,6 +1096,13 @@ pub fn check_conf_change( return Err(box_err!("multiple changes that only effect learner")); } + check_availability_by_last_heartbeats( + region, + cfg, + change_peers, + leader.get_id(), + peer_heartbeats, + )?; if !ignore_safety { let promoted_commit_index = after_progress.maximal_committed_index().0; let first_index = node.raft.raft_log.first_index(); @@ -1109,6 +1131,108 @@ pub fn check_conf_change( } } +/// Check the would-be availability if the operation proceed. +/// If the slow peers count would be equal or larger than normal peers count, +/// then the operations would be rejected +fn check_availability_by_last_heartbeats( + region: &metapb::Region, + cfg: &Config, + change_peers: &[ChangePeerRequest], + leader_id: u64, + peer_heartbeats: &collections::HashMap, +) -> Result<()> { + let mut slow_voters = vec![]; + let mut normal_voters = vec![]; + + // Here we assume if the last beartbeat is within 2 election timeout, the peer + // is healthy. 
When a region is hibernated, we expect all its peers to be *slow*, + and the operation would still be allowed + let slow_voter_threshold = + 2 * cfg.raft_base_tick_interval.0 * cfg.raft_max_election_timeout_ticks as u32; + for (id, last_heartbeat) in peer_heartbeats { + // for slow and normal peer calculation, we only count the voter role + if region + .get_peers() + .iter() + .find(|p| p.get_id() == *id) + .map_or(false, |p| { + p.role == PeerRole::Voter || p.role == PeerRole::IncomingVoter + }) + { + // the leader itself is not a slow peer + if *id == leader_id || last_heartbeat.elapsed() <= slow_voter_threshold { + normal_voters.push(*id); + } else { + slow_voters.push(*id); + } + } + } + + let is_healthy = normal_voters.len() > slow_voters.len(); + // if it's already unhealthy, let it go + if !is_healthy { + return Ok(()); + } + + let mut normal_voters_to_remove = vec![]; + let mut slow_voters_to_add = vec![]; + for cp in change_peers { + let (change_type, peer) = (cp.get_change_type(), cp.get_peer()); + let is_voter = region + .get_peers() + .iter() + .find(|p| p.get_id() == peer.get_id()) + .map_or(false, |p| { + p.role == PeerRole::Voter || p.role == PeerRole::IncomingVoter + }); + if !is_voter && change_type == ConfChangeType::AddNode { + // existing peers, promoting from learner to voter + if let Some(last_heartbeat) = peer_heartbeats.get(&peer.get_id()) { + if last_heartbeat.elapsed() <= slow_voter_threshold { + normal_voters.push(peer.get_id()); + } else { + slow_voters.push(peer.get_id()); + slow_voters_to_add.push(peer.get_id()); + } + } else { + // it's a new peer, assume it's a normal voter + normal_voters.push(peer.get_id()); + } + } + + if is_voter + && (change_type == ConfChangeType::RemoveNode + || change_type == ConfChangeType::AddLearnerNode) + { + // If the change_type is AddLearnerNode and the last heartbeat is found, it + // means it's a demotion from voter, as AddLearnerNode on an existing learner node is + // not allowed. + if let Some(last_heartbeat) = peer_heartbeats.get(&peer.get_id()) { + if last_heartbeat.elapsed() <= slow_voter_threshold { + normal_voters.retain(|id| *id != peer.get_id()); + normal_voters_to_remove.push(peer.clone()); + } + } + } + } + + // Only block the conf change when it's currently healthy but would become + // unhealthy. If it's already unhealthy, let it go. + if slow_voters.len() >= normal_voters.len() { + return Err(box_err!( + "Ignore conf change command on [region_id={}] because the operations may lead to unavailability. \ + Normal voters to remove {:?}, slow voters to add {:?}. \ + Normal voters would be {:?}, slow voters would be {:?}.", + region.get_id(), + &normal_voters_to_remove, + &slow_voters_to_add, + &normal_voters, + &slow_voters + )); + } + + Ok(()) +} pub struct MsgType<'a>(pub &'a RaftMessage); impl Display for MsgType<'_> { @@ -1439,7 +1563,6 @@ impl RegionReadProgress { self.safe_ts() } - // Dump the `LeaderInfo` and the peer list pub fn get_core(&self) -> MutexGuard<'_, RegionReadProgressCore> { self.core.lock().unwrap() } @@ -1720,13 +1843,38 @@ pub struct RaftstoreDuration { } impl RaftstoreDuration { + #[inline] pub fn sum(&self) -> std::time::Duration { - self.store_wait_duration.unwrap_or_default() - + self.store_process_duration.unwrap_or_default() + self.delays_on_disk_io(true) + self.delays_on_net_io() + } + + #[inline] + /// Returns the delayed duration on Disk I/O.
+ pub fn delays_on_disk_io(&self, include_wait_duration: bool) -> std::time::Duration { + let duration = self.store_process_duration.unwrap_or_default() + self.store_write_duration.unwrap_or_default() - + self.store_commit_duration.unwrap_or_default() - + self.apply_wait_duration.unwrap_or_default() - + self.apply_process_duration.unwrap_or_default() + + self.apply_process_duration.unwrap_or_default(); + if include_wait_duration { + duration + + self.store_wait_duration.unwrap_or_default() + + self.apply_wait_duration.unwrap_or_default() + } else { + duration + } + } + + #[inline] + /// Returns the delayed duration on Network I/O. + /// + /// Normally, it can be reflected by the duration of + `store_commit_duration`. + pub fn delays_on_net_io(&self) -> std::time::Duration { + // The `store_commit_duration` serves as an indicator of the latency of + // transferring Raft logs to peers and appending + // logs. In most scenarios, latency fluctuations in the + // network are reflected by this duration. Hence, it is selected as a + // representative of network latency. + self.store_commit_duration.unwrap_or_default() + } } @@ -2466,4 +2614,323 @@ mod tests { mismatch_err.set_store_peer_id(2); assert_eq!(region_err.get_mismatch_peer_id(), &mismatch_err) } + + #[test] + fn test_check_conf_change_upon_slow_peers() { + // Create a sample configuration + let mut cfg = Config::default(); + cfg.raft_max_election_timeout_ticks = 10; + + // peers 1, 2, 3 are voters, 4, 5 are learners. + let mut region = Region::default(); + for i in 1..3 { + region.mut_peers().push(metapb::Peer { + id: i, + role: PeerRole::Voter, + ..Default::default() + }); + } + region.mut_peers().push(metapb::Peer { + id: 3, + role: PeerRole::IncomingVoter, + ..Default::default() + }); + for i in 4..6 { + region.mut_peers().push(metapb::Peer { + id: i, + role: PeerRole::Learner, + ..Default::default() + }); + } + + // heartbeats: peers 3, 5 are slow + let mut peer_heartbeat = collections::HashMap::default(); + peer_heartbeat.insert( + 1, + std::time::Instant::now() - std::time::Duration::from_secs(1), + ); + peer_heartbeat.insert( + 2, + std::time::Instant::now() - std::time::Duration::from_secs(1), + ); + peer_heartbeat.insert( + 3, + std::time::Instant::now() - std::time::Duration::from_secs(100), + ); + peer_heartbeat.insert( + 4, + std::time::Instant::now() - std::time::Duration::from_secs(1), + ); + peer_heartbeat.insert( + 5, + std::time::Instant::now() - std::time::Duration::from_secs(100), + ); + + // Initialize change_peers + let change_peers_and_expect = vec![ + // promote peer 4 from learner to voter, it should work + ( + vec![ChangePeerRequest { + change_type: eraftpb::ConfChangeType::AddNode, + peer: Some(metapb::Peer { + id: 4, + ..Default::default() + }) + .into(), + ..Default::default() + }], + true, + ), + // promote peer 5 from learner to voter, it should be rejected (two slow voters vs two + normal voters) + ( + vec![ChangePeerRequest { + change_type: eraftpb::ConfChangeType::AddNode, + peer: Some(metapb::Peer { + id: 5, + ..Default::default() + }) + .into(), + ..Default::default() + }], + false, + ), + // remove peer 3, it should work as peer 3 is slow + ( + vec![ChangePeerRequest { + change_type: eraftpb::ConfChangeType::RemoveNode, + peer: Some(metapb::Peer { + id: 3, + ..Default::default() + }) + .into(), + ..Default::default() + }], + true, + ), + // remove peer 2, it should be rejected as peer 3 is slow + ( + vec![ChangePeerRequest { + change_type:
eraftpb::ConfChangeType::RemoveNode, + peer: Some(metapb::Peer { + id: 2, + ..Default::default() + }) + .into(), + ..Default::default() + }], + false, + ), + // demote peer 2, it should be rejected + ( + vec![ChangePeerRequest { + change_type: eraftpb::ConfChangeType::AddLearnerNode, + peer: Some(metapb::Peer { + id: 2, + ..Default::default() + }) + .into(), + ..Default::default() + }], + false, + ), + // demote peer 2, but promote peer 4 as voter, it should work + ( + vec![ + ChangePeerRequest { + change_type: eraftpb::ConfChangeType::AddNode, + peer: Some(metapb::Peer { + id: 4, + ..Default::default() + }) + .into(), + ..Default::default() + }, + ChangePeerRequest { + change_type: eraftpb::ConfChangeType::AddLearnerNode, + peer: Some(metapb::Peer { + id: 2, + ..Default::default() + }) + .into(), + ..Default::default() + }, + ], + true, + ), + // demote peer 2, but promote peer 5 as voter, it should be rejected because peer 5 is + slow + ( + vec![ + ChangePeerRequest { + change_type: eraftpb::ConfChangeType::AddNode, + peer: Some(metapb::Peer { + id: 5, + ..Default::default() + }) + .into(), + ..Default::default() + }, + ChangePeerRequest { + change_type: eraftpb::ConfChangeType::AddLearnerNode, + peer: Some(metapb::Peer { + id: 2, + ..Default::default() + }) + .into(), + ..Default::default() + }, + ], + false, + ), + // promote peers 4 and 5 as voters, it should be ok + ( + vec![ + ChangePeerRequest { + change_type: eraftpb::ConfChangeType::AddNode, + peer: Some(metapb::Peer { + id: 4, + ..Default::default() + }) + .into(), + ..Default::default() + }, + ChangePeerRequest { + change_type: eraftpb::ConfChangeType::AddNode, + peer: Some(metapb::Peer { + id: 5, + ..Default::default() + }) + .into(), + ..Default::default() + }, + ], + true, + ), + ]; + + for (cp, expect_result) in change_peers_and_expect { + // Call the function under test and assert that it returns the expected result. + let result = + check_availability_by_last_heartbeats(&region, &cfg, &cp, 1, &peer_heartbeat); + if expect_result { + assert!(result.is_ok()); + } else { + assert!(result.is_err(), "{:?}", cp); + } + } + } + + #[test] + fn test_check_conf_change_on_unhealthy_status() { + // Create a sample configuration + let mut cfg = Config::default(); + cfg.raft_max_election_timeout_ticks = 10; + + // peers 1, 2, 3 are voters, 4 is a learner + let mut region = Region::default(); + region.mut_peers().push(metapb::Peer { + id: 1, + role: PeerRole::Voter, + ..Default::default() + }); + for i in 2..4 { + region.mut_peers().push(metapb::Peer { + id: i, + role: PeerRole::IncomingVoter, + ..Default::default() + }); + } + region.mut_peers().push(metapb::Peer { + id: 4, + role: PeerRole::Learner, + ..Default::default() + }); + + // heartbeats: peers 2, 3, 4 are slow; it's already unhealthy now + let mut peer_heartbeat = collections::HashMap::default(); + peer_heartbeat.insert( + 1, + std::time::Instant::now() - std::time::Duration::from_secs(1), + ); + peer_heartbeat.insert( + 2, + std::time::Instant::now() - std::time::Duration::from_secs(100), + ); + peer_heartbeat.insert( + 3, + std::time::Instant::now() - std::time::Duration::from_secs(100), + ); + peer_heartbeat.insert( + 4, + std::time::Instant::now() - std::time::Duration::from_secs(100), + ); + + // Initialize change_peers + let change_peers_and_expect = vec![ + // promote peer 4 from learner to voter, it should work + ( + vec![ChangePeerRequest { + change_type: eraftpb::ConfChangeType::AddNode, + peer:
Some(metapb::Peer { + id: 4, + ..Default::default() + }) + .into(), + ..Default::default() + }], + true, + ), + // remove peer 3, it should work as peer 3 is slow + ( + vec![ChangePeerRequest { + change_type: eraftpb::ConfChangeType::RemoveNode, + peer: Some(metapb::Peer { + id: 3, + ..Default::default() + }) + .into(), + ..Default::default() + }], + true, + ), + // remove peer 2 and demote peer 3, it should work + ( + vec![ + ChangePeerRequest { + change_type: eraftpb::ConfChangeType::RemoveNode, + peer: Some(metapb::Peer { + id: 2, + ..Default::default() + }) + .into(), + ..Default::default() + }, + ChangePeerRequest { + change_type: eraftpb::ConfChangeType::AddLearnerNode, + peer: Some(metapb::Peer { + id: 3, + ..Default::default() + }) + .into(), + ..Default::default() + }, + ], + true, + ), + ]; + + for (cp, expect_result) in change_peers_and_expect { + // Call the function under test and assert that it returns the expected result. + let result = + check_availability_by_last_heartbeats(&region, &cfg, &cp, 1, &peer_heartbeat); + if expect_result { + assert!(result.is_ok()); + } else { + assert!(result.is_err(), "{:?}", cp); + } + } + } } diff --git a/components/raftstore/src/store/worker/OWNERS b/components/raftstore/src/store/worker/OWNERS new file mode 100644 index 00000000000..eef90ab7017 --- /dev/null +++ b/components/raftstore/src/store/worker/OWNERS @@ -0,0 +1,7 @@ +# See the OWNERS docs at https://go.k8s.io/owners +options: + no_parent_owners: true +filters: + "(OWNERS|split_config\\.rs)$": + approvers: + - sig-critical-approvers-config-components diff --git a/components/raftstore/src/store/worker/cleanup.rs b/components/raftstore/src/store/worker/cleanup.rs index 632e85f40cc..726b7abe5ce 100644 --- a/components/raftstore/src/store/worker/cleanup.rs +++ b/components/raftstore/src/store/worker/cleanup.rs @@ -3,7 +3,6 @@ use std::fmt::{self, Display, Formatter}; use engine_traits::{KvEngine, RaftEngine}; -use pd_client::PdClient; use tikv_util::worker::Runnable; use super::{ @@ -11,7 +10,6 @@ use super::{ cleanup_sst::{Runner as CleanupSstRunner, Task as CleanupSstTask}, compact::{Runner as CompactRunner, Task as CompactTask}, }; -use crate::store::StoreRouter; pub enum Task { Compact(CompactTask), @@ -29,29 +27,26 @@ impl Display for Task { } } -pub struct Runner +pub struct Runner where E: KvEngine, R: RaftEngine, - S: StoreRouter, { compact: CompactRunner, - cleanup_sst: CleanupSstRunner, + cleanup_sst: CleanupSstRunner, gc_snapshot: GcSnapshotRunner, } -impl Runner +impl Runner where E: KvEngine, R: RaftEngine, - C: PdClient, - S: StoreRouter, { pub fn new( compact: CompactRunner, - cleanup_sst: CleanupSstRunner, + cleanup_sst: CleanupSstRunner, gc_snapshot: GcSnapshotRunner, - ) -> Runner { + ) -> Runner { Runner { compact, cleanup_sst, @@ -60,12 +55,10 @@ where } } -impl Runnable for Runner +impl Runnable for Runner where E: KvEngine, R: RaftEngine, - C: PdClient, - S: StoreRouter, { type Task = Task; diff --git a/components/raftstore/src/store/worker/cleanup_snapshot.rs b/components/raftstore/src/store/worker/cleanup_snapshot.rs index c84d6ddb4d3..78167a726d2 100644 --- a/components/raftstore/src/store/worker/cleanup_snapshot.rs +++ b/components/raftstore/src/store/worker/cleanup_snapshot.rs @@ -70,12 +70,12 @@ where "region_id" => region_id, ); - let gc_snap = PeerMsg::CasualMessage(CasualMessage::GcSnap { snaps }); + let gc_snap = PeerMsg::CasualMessage(Box::new(CasualMessage::GcSnap { snaps })); match
(*self.router).send(region_id, gc_snap) { Ok(()) => Ok(()), Err(TrySendError::Disconnected(_)) if self.router.is_shutdown() => Ok(()), Err(TrySendError::Disconnected(PeerMsg::CasualMessage( - CasualMessage::GcSnap { snaps }, + box CasualMessage::GcSnap { snaps }, ))) => { // The snapshot exists because MsgAppend has been rejected. So the // peer must have existed. But now it's disconnected, so the peer diff --git a/components/raftstore/src/store/worker/cleanup_sst.rs b/components/raftstore/src/store/worker/cleanup_sst.rs index 8174b872f4b..44f188e6f8f 100644 --- a/components/raftstore/src/store/worker/cleanup_sst.rs +++ b/components/raftstore/src/store/worker/cleanup_sst.rs @@ -1,62 +1,30 @@ // Copyright 2018 TiKV Project Authors. Licensed under Apache-2.0. -use std::{error::Error, fmt, marker::PhantomData, sync::Arc}; +use std::{fmt, sync::Arc}; -use engine_traits::KvEngine; -use kvproto::{import_sstpb::SstMeta, metapb::Region}; -use pd_client::PdClient; +use kvproto::import_sstpb::SstMeta; use sst_importer::SstImporter; -use tikv_util::{error, worker::Runnable}; - -use crate::store::{util::is_epoch_stale, StoreMsg, StoreRouter}; - -type Result = std::result::Result>; +use tikv_util::worker::Runnable; pub enum Task { DeleteSst { ssts: Vec<SstMeta> }, - ValidateSst { ssts: Vec<SstMeta> }, } impl fmt::Display for Task { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match *self { Task::DeleteSst { ref ssts } => write!(f, "Delete {} ssts", ssts.len()), - Task::ValidateSst { ref ssts } => write!(f, "Validate {} ssts", ssts.len()), } } } -pub struct Runner -where - EK: KvEngine, - S: StoreRouter, -{ - store_id: u64, - store_router: S, +pub struct Runner { importer: Arc<SstImporter>, - pd_client: Arc, - _engine: PhantomData, } -impl Runner -where - EK: KvEngine, - C: PdClient, - S: StoreRouter, -{ - pub fn new( - store_id: u64, - store_router: S, - importer: Arc, - pd_client: Arc, - ) -> Runner { - Runner { - store_id, - store_router, - importer, - pd_client, - _engine: PhantomData, - } +impl Runner { + pub fn new(importer: Arc<SstImporter>) -> Runner { + Runner { importer } } /// Deletes SST files from the importer. @@ -65,78 +33,9 @@ where let _ = self.importer.delete(sst); } } - - fn get_region_by_meta(&self, sst: &SstMeta) -> Result { - // The SST meta has been delivered with a range, use it directly. - // For now, no case will reach this. But this still could be a guard for - // reducing the superise in the future... - if !sst.get_range().get_start().is_empty() || !sst.get_range().get_end().is_empty() { - return self - .pd_client - .get_region(sst.get_range().get_start()) - .map_err(Into::into); - } - // Once there isn't range provided. - let query_by_start_key_of_full_meta = || { - let start_key = self - .importer - .load_start_key_by_meta::(sst)? - .ok_or_else(|| -> Box { - "failed to load start key from sst, the sst might be empty".into() - })?; - let region = self.pd_client.get_region(&start_key)?; - Result::Ok(region) - }; - query_by_start_key_of_full_meta() - .map_err(|err| - format!("failed to load full sst meta from disk for {:?} and there isn't extra information provided: {err}", sst.get_uuid()).into() - ) - } - - /// Validates whether the SST is stale or not. - fn handle_validate_sst(&self, ssts: Vec<SstMeta>) { - let store_id = self.store_id; - let mut invalid_ssts = Vec::new(); - for sst in ssts { - match self.get_region_by_meta(&sst) { - Ok(r) => { - // The region id may or may not be the same as the - // SST file, but it doesn't matter, because the - // epoch of a range will not decrease anyway.
- if is_epoch_stale(r.get_region_epoch(), sst.get_region_epoch()) { - // Region has not been updated. - continue; - } - if r.get_id() == sst.get_region_id() - && r.get_peers().iter().any(|p| p.get_store_id() == store_id) - { - // The SST still belongs to this store. - continue; - } - invalid_ssts.push(sst); - } - Err(e) => { - error!("get region failed"; "err" => %e); - } - } - } - - // We need to send back the result to check for the stale - // peer, which may ingest the stale SST before it is - // destroyed. - let msg = StoreMsg::ValidateSstResult { invalid_ssts }; - if let Err(e) = self.store_router.send(msg) { - error!(%e; "send validate sst result failed"); - } - } } -impl Runnable for Runner -where - EK: KvEngine, - C: PdClient, - S: StoreRouter, -{ +impl Runnable for Runner { type Task = Task; fn run(&mut self, task: Task) { @@ -144,9 +43,6 @@ where Task::DeleteSst { ssts } => { self.handle_delete_sst(ssts); } - Task::ValidateSst { ssts } => { - self.handle_validate_sst(ssts); - } } } } diff --git a/components/raftstore/src/store/worker/compact.rs b/components/raftstore/src/store/worker/compact.rs index 3b2a2ec0404..8e29b6ed5a5 100644 --- a/components/raftstore/src/store/worker/compact.rs +++ b/components/raftstore/src/store/worker/compact.rs @@ -209,7 +209,7 @@ pub fn need_compact(range_stats: &RangeStats, compact_threshold: &CompactThresho // We trigger region compaction when there are too many tombstones as well as // redundant keys, both of which can severely impact scan operations: let estimate_num_del = range_stats.num_entries - range_stats.num_versions; - let redundant_keys = range_stats.num_entries - range_stats.num_rows; + let redundant_keys = range_stats.redundant_keys(); (redundant_keys >= compact_threshold.redundant_rows_threshold && redundant_keys * 100 >= compact_threshold.redundant_rows_percent_threshold * range_stats.num_entries) @@ -277,8 +277,8 @@ mod tests { kv::{new_engine, new_engine_opt, KvTestEngine}, }; use engine_traits::{ - MiscExt, Mutable, SyncMutable, WriteBatch, WriteBatchExt, CF_DEFAULT, CF_LOCK, CF_RAFT, - CF_WRITE, + CompactExt, MiscExt, Mutable, SyncMutable, WriteBatch, WriteBatchExt, CF_DEFAULT, CF_LOCK, + CF_RAFT, CF_WRITE, }; use keys::data_key; use tempfile::Builder; @@ -286,6 +286,61 @@ mod tests { use super::*; + #[test] + fn test_disable_manual_compaction() { + let path = Builder::new() + .prefix("test_disable_manual_compaction") + .tempdir() + .unwrap(); + let db = new_engine(path.path().to_str().unwrap(), &[CF_DEFAULT]).unwrap(); + + // Generate the first SST file. + let mut wb = db.write_batch(); + for i in 0..1000 { + let k = format!("key_{}", i); + wb.put_cf(CF_DEFAULT, k.as_bytes(), b"whatever content") + .unwrap(); + } + wb.write().unwrap(); + db.flush_cf(CF_DEFAULT, true).unwrap(); + + // Generate another SST file that has the same content as the first SST file. + let mut wb = db.write_batch(); + for i in 0..1000 { + let k = format!("key_{}", i); + wb.put_cf(CF_DEFAULT, k.as_bytes(), b"whatever content") + .unwrap(); + } + wb.write().unwrap(); + db.flush_cf(CF_DEFAULT, true).unwrap(); + + // Get the total SST files size. + let old_sst_files_size = db.get_total_sst_files_size_cf(CF_DEFAULT).unwrap().unwrap(); + + // Stop the assistant. + { + let _ = db.disable_manual_compaction(); + + // Manually compact range. + let _ = db.compact_range_cf(CF_DEFAULT, None, None, false, 1); + + // Get the total SST files size after compact range.
+ let new_sst_files_size = db.get_total_sst_files_size_cf(CF_DEFAULT).unwrap().unwrap(); + assert_eq!(old_sst_files_size, new_sst_files_size); + } + // Restart the assistant. + { + let _ = db.enable_manual_compaction(); + + // Manually compact range. + let _ = db.compact_range_cf(CF_DEFAULT, None, None, false, 1); + + // Get the total SST files size after compact range. + let new_sst_files_size = db.get_total_sst_files_size_cf(CF_DEFAULT).unwrap().unwrap(); + assert!(old_sst_files_size > new_sst_files_size); + } + } + #[test] fn test_compact_range() { let path = Builder::new() diff --git a/components/raftstore/src/store/worker/disk_check.rs b/components/raftstore/src/store/worker/disk_check.rs new file mode 100644 index 00000000000..145a911adb9 --- /dev/null +++ b/components/raftstore/src/store/worker/disk_check.rs @@ -0,0 +1,179 @@ +// Copyright 2024 TiKV Project Authors. Licensed under Apache-2.0. + +use std::{ + fmt::{self, Display, Formatter}, + io::Write, + path::PathBuf, + time::Duration, +}; + +use crossbeam::channel::{bounded, Receiver, Sender}; +use tikv_util::{ + time::Instant, + warn, + worker::{Runnable, Worker}, +}; + +use crate::store::util::LatencyInspector; + +#[derive(Debug)] +pub enum Task { + InspectLatency { inspector: LatencyInspector }, +} + +impl Display for Task { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + match *self { + Task::InspectLatency { .. } => write!(f, "InspectLatency"), + } + } +} + +#[derive(Clone)] +/// A simple inspector to measure the latency of disk IO. +/// +/// This is used to measure the latency of disk IO, which is used to determine +/// the health status of the TiKV server. +/// The inspector writes a file to the disk and measures the time it takes to +/// complete the write operation. +pub struct Runner { + target: PathBuf, + notifier: Sender, + receiver: Receiver, + bg_worker: Option, +} + +impl Runner { + /// The filename to write to the disk to measure the latency. + const DISK_IO_LATENCY_INSPECT_FILENAME: &'static str = ".disk_latency_inspector.tmp"; + /// The content to write to the file to measure the latency. + const DISK_IO_LATENCY_INSPECT_FLUSH_STR: &'static [u8] = b"inspect disk io latency"; + + #[inline] + fn build(target: PathBuf) -> Self { + // The disk check mechanism only cares about the latency of the most + // recent request; older requests become stale and irrelevant. To avoid + // unnecessary accumulation of multiple requests, we set a small + // `capacity` for the disk check worker. + let (notifier, receiver) = bounded(3); + Self { + target, + notifier, + receiver, + bg_worker: None, + } + } + + #[inline] + pub fn new(inspect_dir: PathBuf) -> Self { + Self::build(inspect_dir.join(Self::DISK_IO_LATENCY_INSPECT_FILENAME)) + } + + #[inline] + /// Only for test. + /// Generate a dummy Runner. 
+ pub fn dummy() -> Self { + Self::build(PathBuf::from("./").join(Self::DISK_IO_LATENCY_INSPECT_FILENAME)) + } + + #[inline] + pub fn bind_background_worker(&mut self, bg_worker: Worker) { + self.bg_worker = Some(bg_worker); + } + + fn inspect(&self) -> Option { + let mut file = std::fs::OpenOptions::new() + .create(true) + .write(true) + .truncate(true) + .open(&self.target) + .ok()?; + + let start = Instant::now(); + // Ignore the error + file.write_all(Self::DISK_IO_LATENCY_INSPECT_FLUSH_STR) + .ok()?; + file.sync_all().ok()?; + Some(start.saturating_elapsed()) + } + + fn execute(&self) { + if let Ok(task) = self.receiver.try_recv() { + match task { + Task::InspectLatency { mut inspector } => { + if let Some(latency) = self.inspect() { + inspector.record_apply_process(latency); + inspector.finish(); + } else { + warn!("failed to inspect disk io latency"); + } + } + } + } + } +} + +impl Runnable for Runner { + type Task = Task; + + fn run(&mut self, task: Task) { + // Send the task to the limited capacity channel. + if let Err(e) = self.notifier.try_send(task) { + warn!("failed to send task to disk check bg_worker: {:?}", e); + } else { + let runner = self.clone(); + if let Some(bg_worker) = self.bg_worker.as_ref() { + bg_worker.spawn_async_task(async move { + runner.execute(); + }); + } + } + } +} + +#[cfg(test)] +mod tests { + use tikv_util::worker::Builder; + + use super::*; + + #[test] + fn test_disk_check_runner() { + let background_worker = Builder::new("disk-check-worker") + .pending_capacity(256) + .create(); + let (tx, rx) = std::sync::mpsc::sync_channel(1); + let mut runner = Runner::dummy(); + runner.bind_background_worker(background_worker); + // Validate the disk check runner. + { + let tx_1 = tx.clone(); + let inspector = LatencyInspector::new( + 1, + Box::new(move |_, duration| { + let dur = duration.sum(); + tx_1.send(dur).unwrap(); + }), + ); + runner.run(Task::InspectLatency { inspector }); + let latency = rx.recv().unwrap(); + assert!(latency > Duration::from_secs(0)); + } + // Invalid bg_worker and out of capacity + { + runner.bg_worker = None; + for i in 2..=10 { + let tx_2 = tx.clone(); + let inspector = LatencyInspector::new( + i as u64, + Box::new(move |_, duration| { + let dur = duration.sum(); + tx_2.send(dur).unwrap(); + }), + ); + runner.run(Task::InspectLatency { inspector }); + rx.recv_timeout(Duration::from_secs(1)).unwrap_err(); + } + } + } +} diff --git a/components/raftstore/src/store/worker/metrics.rs b/components/raftstore/src/store/worker/metrics.rs index fd3f54d239d..8dca3bcfd44 100644 --- a/components/raftstore/src/store/worker/metrics.rs +++ b/components/raftstore/src/store/worker/metrics.rs @@ -70,6 +70,8 @@ make_static_metric! { pub struct LocalReadMetrics { pub local_executed_requests: LocalIntCounter, pub local_executed_stale_read_requests: LocalIntCounter, + pub local_executed_stale_read_fallback_success_requests: LocalIntCounter, + pub local_executed_stale_read_fallback_failure_requests: LocalIntCounter, pub local_executed_replica_read_requests: LocalIntCounter, pub local_executed_snapshot_cache_hit: LocalIntCounter, pub reject_reason: LocalReadRejectCounter, @@ -82,6 +84,8 @@ thread_local! 
{ LocalReadMetrics { local_executed_requests: LOCAL_READ_EXECUTED_REQUESTS.local(), local_executed_stale_read_requests: LOCAL_READ_EXECUTED_STALE_READ_REQUESTS.local(), + local_executed_stale_read_fallback_success_requests: LOCAL_READ_EXECUTED_STALE_READ_FALLBACK_SUCCESS_REQUESTS.local(), + local_executed_stale_read_fallback_failure_requests: LOCAL_READ_EXECUTED_STALE_READ_FALLBACK_FAILURE_REQUESTS.local(), local_executed_replica_read_requests: LOCAL_READ_EXECUTED_REPLICA_READ_REQUESTS.local(), local_executed_snapshot_cache_hit: LOCAL_READ_EXECUTED_CACHE_REQUESTS.local(), reject_reason: LocalReadRejectCounter::from(&LOCAL_READ_REJECT_VEC), @@ -100,6 +104,10 @@ pub fn maybe_tls_local_read_metrics_flush() { if m.last_flush_time.saturating_elapsed() >= Duration::from_millis(METRICS_FLUSH_INTERVAL) { m.local_executed_requests.flush(); m.local_executed_stale_read_requests.flush(); + m.local_executed_stale_read_fallback_success_requests + .flush(); + m.local_executed_stale_read_fallback_failure_requests + .flush(); m.local_executed_replica_read_requests.flush(); m.local_executed_snapshot_cache_hit.flush(); m.reject_reason.flush(); @@ -189,6 +197,18 @@ lazy_static! { "Total number of stale read requests directly executed by local reader." ) .unwrap(); + pub static ref LOCAL_READ_EXECUTED_STALE_READ_FALLBACK_SUCCESS_REQUESTS: IntCounter = + register_int_counter!( + "tikv_raftstore_local_read_executed_stale_read_fallback_success_requests", + "Total number of stale read requests executed by local leader peer as snapshot read." + ) + .unwrap(); + pub static ref LOCAL_READ_EXECUTED_STALE_READ_FALLBACK_FAILURE_REQUESTS: IntCounter = + register_int_counter!( + "tikv_raftstore_local_read_executed_stale_read_fallback_failure_requests", + "Total number of stale read requests failed to be executed by local leader peer as snapshot read." + ) + .unwrap(); pub static ref LOCAL_READ_EXECUTED_REPLICA_READ_REQUESTS: IntCounter = register_int_counter!( "tikv_raftstore_local_read_executed_replica_read_requests", "Total number of stale read requests directly executed by local reader." 
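A minimal, self-contained sketch of the thread-local counter pattern that the metrics hunk above extends: each thread buffers increments locally and publishes to a global counter at most once per flush interval (std-only; names are illustrative, not the tikv/prometheus API):

use std::{
    cell::RefCell,
    sync::atomic::{AtomicU64, Ordering},
    time::{Duration, Instant},
};

// Global counter that every thread eventually publishes into.
static STALE_READ_FALLBACK_SUCCESS: AtomicU64 = AtomicU64::new(0);

struct LocalMetrics {
    pending: u64,        // increments buffered on this thread only
    last_flush: Instant, // when this thread last published
}

thread_local! {
    static TLS_METRICS: RefCell<LocalMetrics> = RefCell::new(LocalMetrics {
        pending: 0,
        last_flush: Instant::now(),
    });
}

fn record_stale_read_fallback_success() {
    // No atomics or locks on the hot path; just bump the thread-local buffer.
    TLS_METRICS.with(|m| m.borrow_mut().pending += 1);
}

fn maybe_flush(interval: Duration) {
    TLS_METRICS.with(|m| {
        let mut m = m.borrow_mut();
        if m.last_flush.elapsed() >= interval && m.pending > 0 {
            STALE_READ_FALLBACK_SUCCESS.fetch_add(m.pending, Ordering::Relaxed);
            m.pending = 0;
            m.last_flush = Instant::now();
        }
    });
}

fn main() {
    record_stale_read_fallback_success();
    maybe_flush(Duration::from_millis(0)); // interval 0 forces a flush
    assert_eq!(STALE_READ_FALLBACK_SUCCESS.load(Ordering::Relaxed), 1);
}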
diff --git a/components/raftstore/src/store/worker/mod.rs b/components/raftstore/src/store/worker/mod.rs index e79f37a4bc4..865326e1c4a 100644 --- a/components/raftstore/src/store/worker/mod.rs +++ b/components/raftstore/src/store/worker/mod.rs @@ -6,6 +6,7 @@ mod cleanup_snapshot; mod cleanup_sst; mod compact; mod consistency_check; +mod disk_check; pub mod metrics; mod pd; mod raftlog_gc; @@ -25,6 +26,7 @@ pub use self::{ cleanup_sst::{Runner as CleanupSstRunner, Task as CleanupSstTask}, compact::{need_compact, CompactThreshold, Runner as CompactRunner, Task as CompactTask}, consistency_check::{Runner as ConsistencyCheckRunner, Task as ConsistencyCheckTask}, + disk_check::{Runner as DiskCheckRunner, Task as DiskCheckTask}, pd::{ new_change_peer_v2_request, FlowStatistics, FlowStatsReporter, HeartbeatTask, Runner as PdRunner, StatsMonitor as PdStatsMonitor, StoreStatsReporter, Task as PdTask, @@ -42,7 +44,8 @@ pub use self::{ }, region::{Runner as RegionRunner, Task as RegionTask}, split_check::{ - Bucket, BucketRange, KeyEntry, Runner as SplitCheckRunner, Task as SplitCheckTask, + Bucket, BucketRange, BucketStatsInfo, KeyEntry, Runner as SplitCheckRunner, + Task as SplitCheckTask, }, split_config::{ SplitConfig, SplitConfigManager, BIG_REGION_CPU_OVERLOAD_THRESHOLD_RATIO, diff --git a/components/raftstore/src/store/worker/pd.rs b/components/raftstore/src/store/worker/pd.rs index d812830569a..964687bb35f 100644 --- a/components/raftstore/src/store/worker/pd.rs +++ b/components/raftstore/src/store/worker/pd.rs @@ -7,7 +7,7 @@ use std::{ io, mem, sync::{ atomic::Ordering, - mpsc::{self, Receiver, Sender}, + mpsc::{self, Receiver, Sender, SyncSender}, Arc, Mutex, }, thread::{Builder, JoinHandle}, @@ -31,7 +31,6 @@ use kvproto::{ raft_serverpb::RaftMessage, replication_modepb::{RegionReplicationStatus, StoreDrAutoSyncStatus}, }; -use ordered_float::OrderedFloat; use pd_client::{metrics::*, BucketStat, Error, PdClient, RegionStat}; use prometheus::local::LocalHistogram; use raft::eraftpb::ConfChangeType; @@ -40,19 +39,22 @@ use service::service_manager::GrpcServiceManager; use tikv_util::{ box_err, debug, error, info, metrics::ThreadInfoStatistics, + slow_score::SlowScore, store::QueryStats, - sys::thread::StdThreadBuildWrapper, + sys::{disk::get_disk_space_stats, thread::StdThreadBuildWrapper, SysQuota}, thd_name, time::{Instant as TiInstant, UnixSecs}, timer::GLOBAL_TIMER_HANDLE, topn::TopN, trend::{RequestPerSecRecorder, Trend}, warn, - worker::{Runnable, RunnableWithTimer, ScheduleError, Scheduler}, + worker::{Runnable, ScheduleError, Scheduler}, + InspectFactor, }; use txn_types::TimeStamp; use yatp::Remote; +use super::split_controller::AutoSplitControllerContext; use crate::{ coprocessor::CoprocessorHost, router::RaftStoreRouter, @@ -73,6 +75,10 @@ use crate::{ }; pub const NUM_COLLECT_STORE_INFOS_PER_HEARTBEAT: u32 = 2; +/// The upper bound of buffered stats messages. +/// It prevents unexpected memory buildup when AutoSplitController +/// runs slowly. +const STATS_CHANNEL_CAPACITY_LIMIT: usize = 128; type RecordPairVec = Vec; @@ -141,6 +147,7 @@ where peer: metapb::Peer, // If true, right Region derives origin region_id. right_derive: bool, + share_source_region_size: bool, callback: Callback, }, AskBatchSplit { @@ -149,6 +156,7 @@ where peer: metapb::Peer, // If true, right Region derives origin region_id. 
right_derive: bool, + share_source_region_size: bool, callback: Callback, }, AutoSplit { @@ -192,6 +200,7 @@ where }, UpdateSlowScore { id: u64, + factor: InspectFactor, duration: RaftstoreDuration, }, RegionCpuRecords(Arc), @@ -201,6 +210,9 @@ where }, ReportBuckets(BucketStat), ControlGrpcServer(pdpb::ControlGrpcEvent), + InspectLatency { + factor: InspectFactor, + }, } pub struct StoreStat { @@ -223,6 +235,9 @@ pub struct StoreStat { pub store_cpu_usages: RecordPairVec, pub store_read_io_rates: RecordPairVec, pub store_write_io_rates: RecordPairVec, + + store_cpu_quota: f64, // quota of cpu usage + store_cpu_busy_thd: f64, } impl Default for StoreStat { @@ -247,10 +262,33 @@ impl Default for StoreStat { store_cpu_usages: RecordPairVec::default(), store_read_io_rates: RecordPairVec::default(), store_write_io_rates: RecordPairVec::default(), + + store_cpu_quota: 0.0_f64, + store_cpu_busy_thd: 0.8_f64, } } } +impl StoreStat { + fn set_cpu_quota(&mut self, cpu_cores: f64, busy_thd: f64) { + self.store_cpu_quota = cpu_cores * 100.0; + self.store_cpu_busy_thd = busy_thd; + } + + fn maybe_busy(&self) -> bool { + if self.store_cpu_quota < 1.0 || self.store_cpu_busy_thd > 1.0 { + return false; + } + + let mut cpu_usage = 0_u64; + for record in self.store_cpu_usages.iter() { + cpu_usage += record.get_value(); + } + + (cpu_usage as f64 / self.store_cpu_quota) >= self.store_cpu_busy_thd + } +} + #[derive(Default)] pub struct PeerStat { pub read_bytes: u64, @@ -412,8 +450,16 @@ where Task::QueryRegionLeader { region_id } => { write!(f, "query the leader of region {}", region_id) } - Task::UpdateSlowScore { id, ref duration } => { - write!(f, "compute slow score: id {}, duration {:?}", id, duration) + Task::UpdateSlowScore { + id, + factor, + ref duration, + } => { + write!( + f, + "compute slow score: id {}, factor: {:?}, duration {:?}", + id, factor, duration + ) } Task::RegionCpuRecords(ref cpu_records) => { write!(f, "get region cpu records: {:?}", cpu_records) @@ -434,6 +480,9 @@ where Task::ControlGrpcServer(ref event) => { write!(f, "control grpc server: {:?}", event) } + Task::InspectLatency { factor } => { + write!(f, "inspect raftstore latency: {:?}", factor) + } } } } @@ -492,7 +541,7 @@ pub trait StoreStatsReporter: Send + Clone + Sync + 'static + Collector { ); fn report_min_resolved_ts(&self, store_id: u64, min_resolved_ts: u64); fn auto_split(&self, split_infos: Vec); - fn update_latency_stats(&self, timer_tick: u64); + fn update_latency_stats(&self, timer_tick: u64, factor: InspectFactor); } impl StoreStatsReporter for WrappedScheduler @@ -542,9 +591,16 @@ where } } - fn update_latency_stats(&self, timer_tick: u64) { - debug!("update latency statistics not implemented for raftstore-v1"; + fn update_latency_stats(&self, timer_tick: u64, factor: InspectFactor) { + debug!("update latency statistics for raftstore-v1"; "tick" => timer_tick); + let task = Task::InspectLatency { factor }; + if let Err(e) = self.0.schedule(task) { + error!( + "failed to send inspect raftstore latency task to pd worker"; + "err" => ?e, + ); + } } } @@ -555,13 +611,14 @@ where reporter: T, handle: Option>, timer: Option>, - read_stats_sender: Option>, - cpu_stats_sender: Option>>, + read_stats_sender: Option>, + cpu_stats_sender: Option>>, collect_store_infos_interval: Duration, load_base_split_check_interval: Duration, collect_tick_interval: Duration, report_min_resolved_ts_interval: Duration, inspect_latency_interval: Duration, + inspect_kvdb_latency_interval: Duration, } impl StatsMonitor @@ -572,6 +629,7 @@ 
where interval: Duration, report_min_resolved_ts_interval: Duration, inspect_latency_interval: Duration, + inspect_kvdb_latency_interval: Duration, reporter: T, ) -> Self { StatsMonitor { @@ -592,6 +650,7 @@ where cmp::min(default_collect_tick_interval(), interval), ), inspect_latency_interval, + inspect_kvdb_latency_interval, } } @@ -626,17 +685,22 @@ where let report_min_resolved_ts_interval = self .report_min_resolved_ts_interval .div_duration_f64(tick_interval) as u64; - let update_latency_stats_interval = self - .inspect_latency_interval - .div_duration_f64(tick_interval) as u64; + let update_raftdisk_latency_stats_interval = + self.inspect_latency_interval + .div_duration_f64(tick_interval) as u64; + let update_kvdisk_latency_stats_interval = + self.inspect_kvdb_latency_interval + .div_duration_f64(tick_interval) as u64; let (timer_tx, timer_rx) = mpsc::channel(); self.timer = Some(timer_tx); - let (read_stats_sender, read_stats_receiver) = mpsc::channel(); + let (read_stats_sender, read_stats_receiver) = + mpsc::sync_channel(STATS_CHANNEL_CAPACITY_LIMIT); self.read_stats_sender = Some(read_stats_sender); - let (cpu_stats_sender, cpu_stats_receiver) = mpsc::channel(); + let (cpu_stats_sender, cpu_stats_receiver) = + mpsc::sync_channel(STATS_CHANNEL_CAPACITY_LIMIT); self.cpu_stats_sender = Some(cpu_stats_sender); let reporter = self.reporter.clone(); @@ -655,10 +719,12 @@ where let mut collect_store_infos_thread_stats = ThreadInfoStatistics::new(); let mut load_base_split_thread_stats = ThreadInfoStatistics::new(); let mut region_cpu_records_collector = None; + let mut auto_split_controller_ctx = + AutoSplitControllerContext::new(STATS_CHANNEL_CAPACITY_LIMIT); // Register the region CPU records collector. if auto_split_controller .cfg - .region_cpu_overload_threshold_ratio + .region_cpu_overload_threshold_ratio() > 0.0 { region_cpu_records_collector = @@ -676,6 +742,7 @@ where if is_enable_tick(timer_cnt, load_base_split_check_interval) { StatsMonitor::load_base_split( &mut auto_split_controller, + &mut auto_split_controller_ctx, &read_stats_receiver, &cpu_stats_receiver, &mut load_base_split_thread_stats, @@ -690,8 +757,11 @@ where region_read_progress.get_min_resolved_ts(), ); } - if is_enable_tick(timer_cnt, update_latency_stats_interval) { - reporter.update_latency_stats(timer_cnt); + if is_enable_tick(timer_cnt, update_raftdisk_latency_stats_interval) { + reporter.update_latency_stats(timer_cnt, InspectFactor::RaftDisk); + } + if is_enable_tick(timer_cnt, update_kvdisk_latency_stats_interval) { + reporter.update_latency_stats(timer_cnt, InspectFactor::KvDisk); } timer_cnt += 1; } @@ -712,6 +782,7 @@ where pub fn load_base_split( auto_split_controller: &mut AutoSplitController, + auto_split_controller_ctx: &mut AutoSplitControllerContext, read_stats_receiver: &Receiver, cpu_stats_receiver: &Receiver>, thread_stats: &mut ThreadInfoStatistics, @@ -733,18 +804,14 @@ where } SplitConfigChange::Noop => {} } - let mut read_stats_vec = vec![]; - while let Ok(read_stats) = read_stats_receiver.try_recv() { - read_stats_vec.push(read_stats); - } - let mut cpu_stats_vec = vec![]; - while let Ok(cpu_stats) = cpu_stats_receiver.try_recv() { - cpu_stats_vec.push(cpu_stats); - } - thread_stats.record(); - let (top_qps, split_infos) = - auto_split_controller.flush(read_stats_vec, cpu_stats_vec, thread_stats); + let (top_qps, split_infos) = auto_split_controller.flush( + auto_split_controller_ctx, + read_stats_receiver, + cpu_stats_receiver, + thread_stats, + ); auto_split_controller.clear(); + 
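// A sketch of the bounded-drain idea behind `AutoSplitControllerContext`
// (its internals are not shown in this diff; the budget strategy below is an
// assumption, with illustrative names): instead of draining the receivers
// into unbounded Vecs as the removed lines above did, drain at most a fixed
// number of messages per flush.
fn drain_with_budget<T>(rx: &std::sync::mpsc::Receiver<T>, budget: usize) -> Vec<T> {
    let mut out = Vec::with_capacity(budget);
    while out.len() < budget {
        match rx.try_recv() {
            Ok(msg) => out.push(msg),
            Err(_) => break, // channel empty or disconnected
        }
    }
    out
}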
auto_split_controller_ctx.maybe_gc(); reporter.auto_split(split_infos); for i in 0..TOP_N { if i < top_qps.len() { @@ -772,8 +839,8 @@ where #[inline] pub fn maybe_send_read_stats(&self, read_stats: ReadStats) { if let Some(sender) = &self.read_stats_sender { - if sender.send(read_stats).is_err() { - warn!("send read_stats failed, are we shutting down?") + if sender.try_send(read_stats).is_err() { + debug!("send read_stats failed, are we shutting down or channel is full?") } } } @@ -781,8 +848,8 @@ where #[inline] pub fn maybe_send_cpu_stats(&self, cpu_stats: &Arc) { if let Some(sender) = &self.cpu_stats_sender { - if sender.send(cpu_stats.clone()).is_err() { - warn!("send region cpu info failed, are we shutting down?") + if sender.try_send(cpu_stats.clone()).is_err() { + debug!("send region cpu info failed, are we shutting down or channel is full?") } } } @@ -815,105 +882,66 @@ fn hotspot_query_num_report_threshold() -> u64 { /// Max limitation of delayed store_heartbeat. const STORE_HEARTBEAT_DELAY_LIMIT: u64 = 5 * 60; -// Slow score is a value that represents the speed of a store and ranges in [1, -// 100]. It is maintained in the AIMD way. -// If there are some inspecting requests timeout during a round, by default the -// score will be increased at most 1x when above 10% inspecting requests -// timeout. If there is not any timeout inspecting requests, the score will go -// back to 1 in at least 5min. -struct SlowScore { - value: OrderedFloat, - last_record_time: Instant, - last_update_time: Instant, - - timeout_requests: usize, - total_requests: usize, - - inspect_interval: Duration, - // The maximal tolerated timeout ratio. - ratio_thresh: OrderedFloat, - // Minimal time that the score could be decreased from 100 to 1. - min_ttr: Duration, - - // After how many ticks the value need to be updated. - round_ticks: u64, - // Identify every ticks. - last_tick_id: u64, - // If the last tick does not finished, it would be recorded as a timeout. - last_tick_finished: bool, +/// A unified slow score that combines multiple slow scores. +/// +/// It calculates the final slow score of a store by picking the maximum +/// score among multiple factors. Each factor represents a different aspect of +/// the store's performance. Typically, we have two factors: Raft Disk I/O and +/// KvDB Disk I/O. If there are more factors in the future, we can add them +/// here. +#[derive(Default)] +pub struct UnifiedSlowScore { + factors: Vec, } -impl SlowScore { - fn new(inspect_interval: Duration) -> SlowScore { - SlowScore { - value: OrderedFloat(1.0), - - timeout_requests: 0, - total_requests: 0, - - inspect_interval, - ratio_thresh: OrderedFloat(0.1), - min_ttr: Duration::from_secs(5 * 60), - last_record_time: Instant::now(), - last_update_time: Instant::now(), - round_ticks: 30, - last_tick_id: 0, - last_tick_finished: true, - } +impl UnifiedSlowScore { + pub fn new(cfg: &Config) -> Self { + let mut unified_slow_score = UnifiedSlowScore::default(); + // The first factor is for Raft Disk I/O. + unified_slow_score + .factors + .push(SlowScore::new(cfg.inspect_interval.0)); + // The second factor is for KvDB Disk I/O. 
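// `new_with_extra_config` is not shown in this diff; its second argument
// (0.6 here) is assumed to be a factor-specific tuning parameter for the
// KvDB score (e.g. a stricter timeout-ratio threshold) defined in
// tikv_util::slow_score.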
+ unified_slow_score + .factors + .push(SlowScore::new_with_extra_config( + cfg.inspect_kvdb_interval.0, + 0.6, + )); + unified_slow_score } - fn record(&mut self, id: u64, duration: Duration) { - self.last_record_time = Instant::now(); - if id != self.last_tick_id { - return; - } - self.last_tick_finished = true; - self.total_requests += 1; - if duration >= self.inspect_interval { - self.timeout_requests += 1; - } + #[inline] + pub fn record( + &mut self, + id: u64, + factor: InspectFactor, + duration: &RaftstoreDuration, + not_busy: bool, + ) { + self.factors[factor as usize].record(id, duration.delays_on_disk_io(false), not_busy); } - fn record_timeout(&mut self) { - self.last_tick_finished = true; - self.total_requests += 1; - self.timeout_requests += 1; + #[inline] + pub fn get(&self, factor: InspectFactor) -> &SlowScore { + &self.factors[factor as usize] } - fn update(&mut self) -> f64 { - let elapsed = self.last_update_time.elapsed(); - self.update_impl(elapsed).into() + #[inline] + pub fn get_mut(&mut self, factor: InspectFactor) -> &mut SlowScore { + &mut self.factors[factor as usize] } - fn get(&self) -> f64 { - self.value.into() + // Returns the maximum score of all factors. + pub fn get_score(&self) -> f64 { + self.factors + .iter() + .map(|factor| factor.get()) + .fold(1.0, f64::max) } - // Update the score in a AIMD way. - fn update_impl(&mut self, elapsed: Duration) -> OrderedFloat { - if self.timeout_requests == 0 { - let desc = 100.0 * (elapsed.as_millis() as f64 / self.min_ttr.as_millis() as f64); - if OrderedFloat(desc) > self.value - OrderedFloat(1.0) { - self.value = 1.0.into(); - } else { - self.value -= desc; - } - } else { - let timeout_ratio = self.timeout_requests as f64 / self.total_requests as f64; - let near_thresh = - cmp::min(OrderedFloat(timeout_ratio), self.ratio_thresh) / self.ratio_thresh; - let value = self.value * (OrderedFloat(1.0) + near_thresh); - self.value = cmp::min(OrderedFloat(100.0), value); - } - - self.total_requests = 0; - self.timeout_requests = 0; - self.last_update_time = Instant::now(); - self.value - } - - fn should_force_report_slow_store(&self) -> bool { - self.value >= OrderedFloat(100.0) && (self.last_tick_id % self.round_ticks == 0) + pub fn last_tick_finished(&self) -> bool { + self.factors.iter().all(SlowScore::last_tick_finished) } } @@ -946,7 +974,7 @@ where concurrency_manager: ConcurrencyManager, snap_mgr: SnapManager, remote: Remote, - slow_score: SlowScore, + slow_score: UnifiedSlowScore, slow_trend_cause: Trend, slow_trend_result: Trend, slow_trend_result_recorder: RequestPerSecRecorder, @@ -984,12 +1012,15 @@ where causal_ts_provider: Option>, // used for rawkv apiv2 grpc_service_manager: GrpcServiceManager, ) -> Runner { + let mut store_stat = StoreStat::default(); + store_stat.set_cpu_quota(SysQuota::cpu_cores_quota(), cfg.inspect_cpu_util_thd); let store_heartbeat_interval = cfg.pd_store_heartbeat_tick_interval.0; let interval = store_heartbeat_interval / NUM_COLLECT_STORE_INFOS_PER_HEARTBEAT; let mut stats_monitor = StatsMonitor::new( interval, cfg.report_min_resolved_ts_interval.0, cfg.inspect_interval.0, + cfg.inspect_kvdb_interval.0, WrappedScheduler(scheduler.clone()), ); if let Err(e) = stats_monitor.start( @@ -1008,7 +1039,7 @@ where is_hb_receiver_scheduled: false, region_peers: HashMap::default(), region_buckets: HashMap::default(), - store_stat: StoreStat::default(), + store_stat, start_ts: UnixSecs::now(), scheduler, store_heartbeat_interval, @@ -1017,7 +1048,7 @@ where concurrency_manager, snap_mgr, remote, 
- slow_score: SlowScore::new(cfg.inspect_interval.0), + slow_score: UnifiedSlowScore::new(cfg), slow_trend_cause: Trend::new( // Disable SpikeFilter for now Duration::from_secs(0), @@ -1066,6 +1097,7 @@ where split_key: Vec<u8>, peer: metapb::Peer, right_derive: bool, + share_source_region_size: bool, callback: Callback, task: String, ) { @@ -1087,6 +1119,7 @@ where resp.get_new_region_id(), resp.take_new_peer_ids(), right_derive, + share_source_region_size, ); let region_id = region.get_id(); let epoch = region.take_region_epoch(); @@ -1121,6 +1154,7 @@ where mut split_keys: Vec<Vec<u8>>, peer: metapb::Peer, right_derive: bool, + share_source_region_size: bool, callback: Callback, task: String, remote: Remote, @@ -1146,6 +1180,7 @@ where split_keys, resp.take_ids().into(), right_derive, + share_source_region_size, ); let region_id = region.get_id(); let epoch = region.take_region_epoch(); @@ -1174,6 +1209,7 @@ where split_key: split_keys.pop().unwrap(), peer, right_derive, + share_source_region_size, callback, }; if let Err(ScheduleError::Stopped(t)) = scheduler.schedule(task) { @@ -1352,17 +1388,11 @@ where self.store_stat.region_bytes_read.flush(); self.store_stat.region_keys_read.flush(); - STORE_SIZE_GAUGE_VEC - .with_label_values(&["capacity"]) - .set(capacity as i64); - STORE_SIZE_GAUGE_VEC - .with_label_values(&["available"]) - .set(available as i64); - STORE_SIZE_GAUGE_VEC - .with_label_values(&["used"]) - .set(used_size as i64); - - let slow_score = self.slow_score.get(); + STORE_SIZE_EVENT_INT_VEC.capacity.set(capacity as i64); + STORE_SIZE_EVENT_INT_VEC.available.set(available as i64); + STORE_SIZE_EVENT_INT_VEC.used.set(used_size as i64); + + let slow_score = self.slow_score.get_score(); stats.set_slow_score(slow_score as u64); self.set_slow_trend_to_store_stats(&mut stats, total_query_num); @@ -1566,8 +1596,14 @@ where } } Ok(None) => { - // splitted Region has not yet reported to PD. - // TODO: handle merge + // Split region has not yet reported to PD. + // + // Or the region has been merged. That case is handled by + // message `MsgCheckStalePeer`; stale peers will be + // removed eventually. + PD_VALIDATE_PEER_COUNTER_VEC + .with_label_values(&["region not found"]) + .inc(); } Err(e) => { error!("get region failed"; "err" => ?e); @@ -1645,6 +1681,7 @@ where split_keys: split_region.take_keys().into(), callback: Callback::None, source: "pd".into(), + share_source_region_size: false, } } else { CasualMessage::HalfSplitRegion { @@ -1656,7 +1693,7 @@ where cb: Callback::None, } }; - if let Err(e) = router.send(region_id, PeerMsg::CasualMessage(msg)) { + if let Err(e) = router.send(region_id, PeerMsg::CasualMessage(Box::new(msg))) { error!("send halfsplit request failed"; "region_id" => region_id, "err" => ?e); } } else if resp.has_merge() { @@ -1852,7 +1889,7 @@ where match resp.await { Ok(Some((region, leader))) => { if leader.get_store_id() != 0 { - let msg = CasualMessage::QueryRegionLeaderResp { region, leader }; + let msg = Box::new(CasualMessage::QueryRegionLeaderResp { region, leader }); if let Err(e) = router.send(region_id, PeerMsg::CasualMessage(msg)) { error!("send region info message failed"; "region_id" => region_id, "err" => ?e); } @@ -2009,6 +2046,121 @@ where } } } + + fn handle_inspect_latency(&mut self, factor: InspectFactor) { + // all_ticks_finished: The last tick of all factors is finished. + // factor_tick_finished: The last tick of the current factor is finished.
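+ // Recovery rule implemented below: the status goes back to Serving only
+ // when the latest inspection of every factor has finished; a single
+ // factor with an unfinished tick keeps the store at ServiceUnknown.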
+ let (all_ticks_finished, factor_tick_finished) = ( + self.slow_score.last_tick_finished(), + self.slow_score.get(factor).last_tick_finished(), + ); + // The health status is recovered to Serving as long as no inspection + // tick has timed out. + if self.curr_health_status == ServingStatus::ServiceUnknown && all_ticks_finished { + self.update_health_status(ServingStatus::Serving); + } + if !all_ticks_finished { + // If the last tick is not finished, the current store might be busy + // handling requests or delayed on I/O operations. Only when the store + // is not busy should the last tick be recorded as a timeout. + if !self.store_stat.maybe_busy() && !factor_tick_finished { + self.slow_score.get_mut(factor).record_timeout(); + } + } + + let slow_score_tick_result = self.slow_score.get_mut(factor).tick(); + if slow_score_tick_result.updated_score.is_some() && !slow_score_tick_result.has_new_record + { + self.update_health_status(ServingStatus::ServiceUnknown); + } + if let Some(score) = slow_score_tick_result.updated_score { + STORE_SLOW_SCORE_GAUGE + .with_label_values(&[factor.as_str()]) + .set(score as i64); + } + + let id = slow_score_tick_result.tick_id; + let scheduler = self.scheduler.clone(); + let inspector = { + match factor { + InspectFactor::RaftDisk => { + // Record a fairly large value on timeout + self.slow_trend_cause.record(500_000, Instant::now()); + + // If the last slow_score already reached an abnormal state and was delayed for + // reporting by `store-heartbeat` to PD, we should report it here manually as + // a FAKE `store-heartbeat`. + if slow_score_tick_result.should_force_report_slow_store + && self.is_store_heartbeat_delayed() + { + self.handle_fake_store_heartbeat(); + } + LatencyInspector::new( + id, + Box::new(move |id, duration| { + // TODO: use sub metric to record different durations.
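+ // Label mapping for the histograms below: the RaftDisk factor observes
+ // store_process / store_wait / store_commit plus the "all" sum, while
+ // the KvDisk branch further down records apply_wait / apply_process.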
+ STORE_INSPECT_DURATION_HISTOGRAM + .with_label_values(&["store_process"]) + .observe(tikv_util::time::duration_to_sec( + duration.store_process_duration.unwrap_or_default(), + )); + STORE_INSPECT_DURATION_HISTOGRAM + .with_label_values(&["store_wait"]) + .observe(tikv_util::time::duration_to_sec( + duration.store_wait_duration.unwrap_or_default(), + )); + STORE_INSPECT_DURATION_HISTOGRAM + .with_label_values(&["store_commit"]) + .observe(tikv_util::time::duration_to_sec( + duration.store_commit_duration.unwrap_or_default(), + )); + + STORE_INSPECT_DURATION_HISTOGRAM + .with_label_values(&["all"]) + .observe(tikv_util::time::duration_to_sec(duration.sum())); + if let Err(e) = scheduler.schedule(Task::UpdateSlowScore { + id, + factor, + duration, + }) { + warn!("schedule pd task failed"; "err" => ?e); + } + }), + ) + } + InspectFactor::KvDisk => LatencyInspector::new( + id, + Box::new(move |id, duration| { + STORE_INSPECT_DURATION_HISTOGRAM + .with_label_values(&["apply_wait"]) + .observe(tikv_util::time::duration_to_sec( + duration.apply_wait_duration.unwrap_or_default(), + )); + STORE_INSPECT_DURATION_HISTOGRAM + .with_label_values(&["apply_process"]) + .observe(tikv_util::time::duration_to_sec( + duration.apply_process_duration.unwrap_or_default(), + )); + if let Err(e) = scheduler.schedule(Task::UpdateSlowScore { + id, + factor, + duration, + }) { + warn!("schedule pd task failed"; "err" => ?e); + } + }), + ), + } + }; + let msg = StoreMsg::LatencyInspect { + factor, + send_time: TiInstant::now(), + inspector, + }; + if let Err(e) = self.router.send_control(msg) { + warn!("pd worker send latency inspector failed"; "err" => ?e); + } + } } fn calculate_region_cpu_records( @@ -2048,12 +2200,14 @@ where split_key, peer, right_derive, + share_source_region_size, callback, } => self.handle_ask_split( region, split_key, peer, right_derive, + share_source_region_size, callback, String::from("ask_split"), ), @@ -2062,6 +2216,7 @@ where split_keys, peer, right_derive, + share_source_region_size, callback, } => Self::handle_ask_batch_split( self.router.clone(), @@ -2071,6 +2226,7 @@ where split_keys, peer, right_derive, + share_source_region_size, callback, String::from("batch_split"), self.remote.clone(), @@ -2095,6 +2251,7 @@ where vec![split_key], split_info.peer, true, + false, Callback::None, String::from("auto_split"), remote.clone(), @@ -2105,14 +2262,14 @@ where let start_key = split_info.start_key.unwrap(); let end_key = split_info.end_key.unwrap(); let region_id = region.get_id(); - let msg = CasualMessage::HalfSplitRegion { + let msg = Box::new(CasualMessage::HalfSplitRegion { region_epoch: region.get_region_epoch().clone(), start_key: Some(start_key.clone()), end_key: Some(end_key.clone()), policy: pdpb::CheckPolicy::Scan, source: "auto_split", cb: Callback::None, - }; + }); if let Err(e) = router.send(region_id, PeerMsg::CasualMessage(msg)) { error!("send auto half split request failed"; "region_id" => region_id, @@ -2247,12 +2404,14 @@ where txn_ext, } => self.handle_update_max_timestamp(region_id, initial_status, txn_ext), Task::QueryRegionLeader { region_id } => self.handle_query_region_leader(region_id), - Task::UpdateSlowScore { id, duration } => { - self.slow_score.record(id, duration.sum()); - self.slow_trend_cause.record( - tikv_util::time::duration_to_us(duration.store_wait_duration.unwrap()), - Instant::now(), - ); + Task::UpdateSlowScore { + id, + factor, + duration, + } => { + // After fine-tuning, `SlowScore` only takes the I/O jitter on the disk into account.
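+ // `record` is fed `delays_on_disk_io(false)` (see UnifiedSlowScore::record
+ // above), so queueing time is excluded and only raw disk I/O latency moves
+ // the score; the `not_busy` flag is assumed to let SlowScore discount
+ // samples taken while the store's CPU is saturated.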
+ self.slow_score + .record(id, factor, &duration, !self.store_stat.maybe_busy()); } Task::RegionCpuRecords(records) => self.handle_region_cpu_records(records), Task::ReportMinResolvedTs { @@ -2265,6 +2424,9 @@ where Task::ControlGrpcServer(event) => { self.handle_control_grpc_server(event); } + Task::InspectLatency { factor } => { + self.handle_inspect_latency(factor); + } }; } @@ -2273,87 +2435,6 @@ where } } -impl RunnableWithTimer for Runner -where - EK: KvEngine, - ER: RaftEngine, - T: PdClient + 'static, -{ - fn on_timeout(&mut self) { - // Record a fairly great value when timeout - self.slow_trend_cause.record(500_000, Instant::now()); - - // The health status is recovered to serving as long as any tick - // does not timeout. - if self.curr_health_status == ServingStatus::ServiceUnknown - && self.slow_score.last_tick_finished - { - self.update_health_status(ServingStatus::Serving); - } - if !self.slow_score.last_tick_finished { - self.slow_score.record_timeout(); - // If the last slow_score already reached abnormal state and was delayed for - // reporting by `store-heartbeat` to PD, we should report it here manually as - // a FAKE `store-heartbeat`. - if self.slow_score.should_force_report_slow_store() && self.is_store_heartbeat_delayed() - { - self.handle_fake_store_heartbeat(); - } - } - let scheduler = self.scheduler.clone(); - let id = self.slow_score.last_tick_id + 1; - self.slow_score.last_tick_id += 1; - self.slow_score.last_tick_finished = false; - - if self.slow_score.last_tick_id % self.slow_score.round_ticks == 0 { - // `last_update_time` is refreshed every round. If no update happens in a whole - // round, we set the status to unknown. - if self.curr_health_status == ServingStatus::Serving - && self.slow_score.last_record_time < self.slow_score.last_update_time - { - self.update_health_status(ServingStatus::ServiceUnknown); - } - let slow_score = self.slow_score.update(); - STORE_SLOW_SCORE_GAUGE.set(slow_score); - } - - let inspector = LatencyInspector::new( - id, - Box::new(move |id, duration| { - let dur = duration.sum(); - - STORE_INSPECT_DURATION_HISTOGRAM - .with_label_values(&["store_process"]) - .observe(tikv_util::time::duration_to_sec( - duration.store_process_duration.unwrap(), - )); - STORE_INSPECT_DURATION_HISTOGRAM - .with_label_values(&["store_wait"]) - .observe(tikv_util::time::duration_to_sec( - duration.store_wait_duration.unwrap(), - )); - STORE_INSPECT_DURATION_HISTOGRAM - .with_label_values(&["all"]) - .observe(tikv_util::time::duration_to_sec(dur)); - if let Err(e) = scheduler.schedule(Task::UpdateSlowScore { id, duration }) { - warn!("schedule pd task failed"; "err" => ?e); - } - }), - ); - let msg = StoreMsg::LatencyInspect { - send_time: TiInstant::now(), - inspector, - }; - if let Err(e) = self.router.send_control(msg) { - warn!("pd worker send latency inspecter failed"; "err" => ?e); - } - } - - fn get_interval(&self) -> Duration { - self.slow_score.inspect_interval - } -} - fn new_change_peer_request(change_type: ConfChangeType, peer: metapb::Peer) -> AdminRequest { let mut req = AdminRequest::default(); req.set_cmd_type(AdminCmdType::ChangePeer); @@ -2385,6 +2466,7 @@ fn new_split_region_request( new_region_id: u64, peer_ids: Vec, right_derive: bool, + share_source_region_size: bool, ) -> AdminRequest { let mut req = AdminRequest::default(); req.set_cmd_type(AdminCmdType::Split); @@ -2392,6 +2474,8 @@ fn new_split_region_request( req.mut_split().set_new_region_id(new_region_id); req.mut_split().set_new_peer_ids(peer_ids); 
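+ // `share_source_region_size`, set just below, is assumed to make the new
+ // regions split the source region's approximate size/keys statistics
+ // between them instead of each inheriting the full stats; its consumer
+ // is outside this diff.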
req.mut_split().set_right_derive(right_derive); + req.mut_split() + .set_share_source_region_size(share_source_region_size); req } @@ -2399,10 +2483,13 @@ fn new_batch_split_region_request( split_keys: Vec>, ids: Vec, right_derive: bool, + share_source_region_size: bool, ) -> AdminRequest { let mut req = AdminRequest::default(); req.set_cmd_type(AdminCmdType::BatchSplit); req.mut_splits().set_right_derive(right_derive); + req.mut_splits() + .set_share_source_region_size(share_source_region_size); let mut requests = Vec::with_capacity(ids.len()); for (mut id, key) in ids.into_iter().zip(split_keys) { let mut split = SplitRequest::default(); @@ -2560,7 +2647,7 @@ fn collect_engine_size( return Some((engine_size.capacity, engine_size.used, engine_size.avail)); } let store_info = store_info.unwrap(); - let disk_stats = match fs2::statvfs(store_info.kv_engine.path()) { + let (disk_cap, disk_avail) = match get_disk_space_stats(store_info.kv_engine.path()) { Err(e) => { error!( "get disk stat for rocksdb failed"; @@ -2569,27 +2656,32 @@ fn collect_engine_size( ); return None; } - Ok(stats) => stats, + Ok((total_size, available_size)) => (total_size, available_size), }; - let disk_cap = disk_stats.total_space(); let capacity = if store_info.capacity == 0 || disk_cap < store_info.capacity { disk_cap } else { store_info.capacity }; - let used_size = snap_mgr_size - + store_info - .kv_engine - .get_engine_used_size() - .expect("kv engine used size") - + store_info - .raft_engine - .get_engine_size() - .expect("raft engine used size"); + let raft_size = store_info + .raft_engine + .get_engine_size() + .expect("raft engine used size"); + + let kv_size = store_info + .kv_engine + .get_engine_used_size() + .expect("kv engine used size"); + + STORE_SIZE_EVENT_INT_VEC.raft_size.set(raft_size as i64); + STORE_SIZE_EVENT_INT_VEC.snap_size.set(snap_mgr_size as i64); + STORE_SIZE_EVENT_INT_VEC.kv_size.set(kv_size as i64); + + let used_size = snap_mgr_size + kv_size + raft_size; let mut available = capacity.checked_sub(used_size).unwrap_or_default(); // We only care about rocksdb SST file size, so we should check disk available // here. 
- available = cmp::min(available, disk_stats.available_space()); + available = cmp::min(available, disk_avail); Some((capacity, used_size, available)) } @@ -2603,8 +2695,10 @@ mod tests { use kvproto::{kvrpcpb, pdpb::QueryKind}; use pd_client::{new_bucket_stats, BucketMeta}; + use tikv_util::worker::LazyWorker; use super::*; + use crate::store::{fsm::StoreMeta, util::build_key_range}; const DEFAULT_TEST_STORE_ID: u64 = 1; @@ -2614,7 +2708,6 @@ mod tests { use std::{sync::Mutex, time::Instant}; use engine_test::{kv::KvTestEngine, raft::RaftTestEngine}; - use tikv_util::worker::LazyWorker; use crate::store::fsm::StoreMeta; @@ -2633,6 +2726,7 @@ mod tests { Duration::from_secs(interval), Duration::from_secs(0), Duration::from_secs(interval), + Duration::default(), WrappedScheduler(scheduler), ); let store_meta = Arc::new(Mutex::new(StoreMeta::new(0))); @@ -2734,59 +2828,6 @@ mod tests { assert_eq!(store_stats.peer_stats.len(), 3) } - #[test] - fn test_slow_score() { - let mut slow_score = SlowScore::new(Duration::from_millis(500)); - slow_score.timeout_requests = 5; - slow_score.total_requests = 100; - assert_eq!( - OrderedFloat(1.5), - slow_score.update_impl(Duration::from_secs(10)) - ); - - slow_score.timeout_requests = 10; - slow_score.total_requests = 100; - assert_eq!( - OrderedFloat(3.0), - slow_score.update_impl(Duration::from_secs(10)) - ); - - slow_score.timeout_requests = 20; - slow_score.total_requests = 100; - assert_eq!( - OrderedFloat(6.0), - slow_score.update_impl(Duration::from_secs(10)) - ); - - slow_score.timeout_requests = 100; - slow_score.total_requests = 100; - assert_eq!( - OrderedFloat(12.0), - slow_score.update_impl(Duration::from_secs(10)) - ); - - slow_score.timeout_requests = 11; - slow_score.total_requests = 100; - assert_eq!( - OrderedFloat(24.0), - slow_score.update_impl(Duration::from_secs(10)) - ); - - slow_score.timeout_requests = 0; - slow_score.total_requests = 100; - assert_eq!( - OrderedFloat(19.0), - slow_score.update_impl(Duration::from_secs(15)) - ); - - slow_score.timeout_requests = 0; - slow_score.total_requests = 100; - assert_eq!( - OrderedFloat(1.0), - slow_score.update_impl(Duration::from_secs(57)) - ); - } - use engine_test::{kv::KvTestEngine, raft::RaftTestEngine}; use metapb::Peer; use resource_metering::{RawRecord, TagInfos}; @@ -2928,4 +2969,70 @@ mod tests { assert_eq!(used, 111); assert_eq!(avail, 333); } + + #[test] + fn test_pd_worker_send_stats_on_read_and_cpu() { + let mut pd_worker: LazyWorker> = + LazyWorker::new("test-pd-worker-collect-stats"); + // Set the interval long enough for mocking the channel full state. + let interval = 600_u64; + let mut stats_monitor = StatsMonitor::new( + Duration::from_secs(interval), + Duration::from_secs(0), + Duration::from_secs(interval), + Duration::default(), + WrappedScheduler(pd_worker.scheduler()), + ); + let store_meta = Arc::new(Mutex::new(StoreMeta::new(0))); + let region_read_progress = store_meta.lock().unwrap().region_read_progress.clone(); + stats_monitor + .start( + AutoSplitController::default(), + region_read_progress, + CollectorRegHandle::new_for_test(), + 0, + ) + .unwrap(); + // Add some read stats and cpu stats to the stats monitor. 
+ { + for _ in 0..=STATS_CHANNEL_CAPACITY_LIMIT + 10 { + let mut read_stats = ReadStats::with_sample_num(1); + read_stats.add_query_num( + 1, + &Peer::default(), + build_key_range(b"a", b"b", false), + QueryKind::Get, + ); + stats_monitor.maybe_send_read_stats(read_stats); + } + + let raw_records = Arc::new(RawRecords { + begin_unix_time_secs: UnixSecs::now().into_inner(), + duration: Duration::default(), + records: { + let mut records = HashMap::default(); + records.insert( + Arc::new(TagInfos { + store_id: 0, + region_id: 1, + peer_id: 0, + key_ranges: vec![], + extra_attachment: b"a".to_vec(), + }), + RawRecord { + cpu_time: 111, + read_keys: 1, + write_keys: 0, + }, + ); + records + }, + }); + for _ in 0..=STATS_CHANNEL_CAPACITY_LIMIT + 10 { + stats_monitor.maybe_send_cpu_stats(&raw_records); + } + } + + pd_worker.stop(); + } } diff --git a/components/raftstore/src/store/worker/read.rs b/components/raftstore/src/store/worker/read.rs index 0c4641770be..2694481494f 100644 --- a/components/raftstore/src/store/worker/read.rs +++ b/components/raftstore/src/store/worker/read.rs @@ -30,7 +30,7 @@ use tikv_util::{ }; use time::Timespec; use tracker::GLOBAL_TRACKERS; -use txn_types::TimeStamp; +use txn_types::{TimeStamp, WriteBatchFlags}; use super::metrics::*; use crate::{ @@ -440,7 +440,11 @@ impl ReadDelegate { read_progress: peer.read_progress.clone(), pending_remove: false, wait_data: false, - bucket_meta: peer.region_buckets.as_ref().map(|b| b.meta.clone()), + bucket_meta: peer + .region_buckets_info() + .bucket_stat() + .as_ref() + .map(|b| b.meta.clone()), track_ver: TrackVer::new(), } } @@ -824,10 +828,21 @@ where return Ok(None); } - // Check witness - if find_peer_by_id(&delegate.region, delegate.peer_id).map_or(true, |p| p.is_witness) { - TLS_LOCAL_READ_METRICS.with(|m| m.borrow_mut().reject_reason.witness.inc()); - return Err(Error::IsWitness(region_id)); + match find_peer_by_id(&delegate.region, delegate.peer_id) { + // Check witness + Some(peer) => { + if peer.is_witness { + TLS_LOCAL_READ_METRICS.with(|m| m.borrow_mut().reject_reason.witness.inc()); + return Err(Error::IsWitness(region_id)); + } + } + // This rarely happens in witness-disabled clusters when the conf change has been + // applied but the region has not been removed yet. We shouldn't return `IsWitness` + // here, because clients back off for a long time after receiving that error. + None => { + TLS_LOCAL_READ_METRICS.with(|m| m.borrow_mut().reject_reason.no_region.inc()); + return Err(Error::RegionNotFound(region_id)); + } } // Check non-witness hasn't finish applying snapshot yet. @@ -963,80 +978,155 @@ where cmd.callback.set_result(read_resp); }
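The refactor below extracts the inlined lease check into a reusable helper that returns `Option`, so the caller can fall back cleanly when the lease does not hold. The gating idea reduced to its core (the types and names here are illustrative, not the raftstore ones):

use std::time::{Duration, Instant};

struct ToyLease {
    expires_at: Instant,
}

impl ToyLease {
    fn is_valid(&self, ts: Instant) -> bool {
        ts < self.expires_at
    }
}

// Serve from the local snapshot only while the leader lease covers `ts`;
// a `None` tells the caller to redirect the command to raftstore.
fn try_local_read(lease: &ToyLease, ts: Instant) -> Option<&'static str> {
    lease.is_valid(ts).then(|| "local snapshot read")
}

fn main() {
    let lease = ToyLease { expires_at: Instant::now() + Duration::from_secs(1) };
    assert!(try_local_read(&lease, Instant::now()).is_some());
}

+ /// Tries to serve the read request locally. If the leader's lease is valid,
+ /// the read response is returned; otherwise `None` is returned.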
+ fn try_local_leader_read( + &mut self, + req: &RaftCmdRequest, + delegate: &mut CachedReadDelegate<E>, + read_id: Option<ThreadReadId>, + snap_updated: &mut bool, + last_valid_ts: Timespec, + ) -> Option<ReadResponse<E::Snapshot>> { + let mut local_read_ctx = LocalReadContext::new(&mut self.snap_cache, read_id); + + (*snap_updated) = + local_read_ctx.maybe_update_snapshot(delegate.get_tablet(), last_valid_ts); + + let snapshot_ts = local_read_ctx.snapshot_ts().unwrap(); + if !delegate.is_in_leader_lease(snapshot_ts) { + return None; + } + + let region = Arc::clone(&delegate.region); + let mut response = delegate.execute(req, &region, None, Some(local_read_ctx)); + if let Some(snap) = response.snapshot.as_mut() { + snap.bucket_meta = delegate.bucket_meta.clone(); + } + // Try to renew the lease in advance + delegate.maybe_renew_lease_advance(&self.router, snapshot_ts); + Some(response) + } + + /// Tries to handle the stale read request. If `read_ts` <= `safe_ts`, the read + /// response is returned; otherwise a raft command response carrying a + /// `DataIsNotReady` error is returned. + fn try_local_stale_read( + &mut self, + req: &RaftCmdRequest, + delegate: &mut CachedReadDelegate<E>, + snap_updated: &mut bool, + last_valid_ts: Timespec, + ) -> std::result::Result<ReadResponse<E::Snapshot>, RaftCmdResponse> { + let read_ts = decode_u64(&mut req.get_header().get_flag_data()).unwrap(); + delegate.check_stale_read_safe(read_ts)?; + + // Stale read does not use cache, so we pass None for read_id + let mut local_read_ctx = LocalReadContext::new(&mut self.snap_cache, None); + (*snap_updated) = + local_read_ctx.maybe_update_snapshot(delegate.get_tablet(), last_valid_ts); + + let region = Arc::clone(&delegate.region); + // Getting the snapshot + let mut response = delegate.execute(req, &region, None, Some(local_read_ctx)); + if let Some(snap) = response.snapshot.as_mut() { + snap.bucket_meta = delegate.bucket_meta.clone(); + } + // Double check in case `safe_ts` changes after the first check and before + // getting the snapshot + delegate.check_stale_read_safe(read_ts)?; + + TLS_LOCAL_READ_METRICS.with(|m| m.borrow_mut().local_executed_stale_read_requests.inc()); + Ok(response) + } + pub fn propose_raft_command( &mut self, read_id: Option, - req: RaftCmdRequest, + mut req: RaftCmdRequest, cb: Callback, ) { match self.pre_propose_raft_command(&req) { Ok(Some((mut delegate, policy))) => { - let snap_updated; + let mut snap_updated = false; let last_valid_ts = delegate.last_valid_ts; let mut response = match policy { // Leader can read local if and only if it is in lease. RequestPolicy::ReadLocal => { - let mut local_read_ctx = - LocalReadContext::new(&mut self.snap_cache, read_id); - - snap_updated = local_read_ctx - .maybe_update_snapshot(delegate.get_tablet(), last_valid_ts); - - let snapshot_ts = local_read_ctx.snapshot_ts().unwrap(); - if !delegate.is_in_leader_lease(snapshot_ts) { + if let Some(read_resp) = self.try_local_leader_read( + &req, + &mut delegate, + read_id, + &mut snap_updated, + last_valid_ts, + ) { + read_resp + } else { + fail_point!("localreader_before_redirect", |_| {});
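The stale-read helper above validates `safe_ts` both before and after taking the snapshot, since the delegate's `safe_ts` can change between the two steps. The race-guard pattern in a few lines (toy types, with an atomic standing in for the read progress):

use std::sync::atomic::{AtomicU64, Ordering};

// A stale read at `read_ts` is only safe while `read_ts` <= `safe_ts`.
fn stale_read(safe_ts: &AtomicU64, read_ts: u64) -> Result<&'static str, &'static str> {
    if read_ts > safe_ts.load(Ordering::Acquire) {
        return Err("DataIsNotReady");
    }
    let snap = "snapshot"; // the engine snapshot would be taken here
    // Double check: only if `safe_ts` still covers `read_ts` is `snap` usable.
    if read_ts > safe_ts.load(Ordering::Acquire) {
        return Err("DataIsNotReady");
    }
    Ok(snap)
}

fn main() {
    let safe_ts = AtomicU64::new(10);
    assert!(stale_read(&safe_ts, 5).is_ok());
    assert!(stale_read(&safe_ts, 11).is_err());
}

// Forward to raftstore.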
self.redirect(RaftCommand::new(req, cb)); return; } - - let region = Arc::clone(&delegate.region); - let mut response = - delegate.execute(&req, &region, None, Some(local_read_ctx)); - if let Some(snap) = response.snapshot.as_mut() { - snap.bucket_meta = delegate.bucket_meta.clone(); - } - // Try renew lease in advance - delegate.maybe_renew_lease_advance(&self.router, snapshot_ts); - response } // Replica can serve stale read if and only if its `safe_ts` >= `read_ts` RequestPolicy::StaleRead => { - let read_ts = decode_u64(&mut req.get_header().get_flag_data()).unwrap(); - if let Err(resp) = delegate.check_stale_read_safe(read_ts) { - cb.set_result(ReadResponse { - response: resp, - snapshot: None, - txn_extra_op: TxnExtraOp::Noop, - }); - return; + match self.try_local_stale_read( + &req, + &mut delegate, + &mut snap_updated, + last_valid_ts, + ) { + Ok(read_resp) => read_resp, + Err(err_resp) => { + // It's safe to change the header of the `RaftCmdRequest`, as it + // would not affect the `SnapCtx` used in the upper layers. + let unset_stale_flag = req.get_header().get_flags() + & (!WriteBatchFlags::STALE_READ.bits()); + req.mut_header().set_flags(unset_stale_flag); + let mut inspector = Inspector { + delegate: &delegate, + }; + // The read request could be handled using snapshot read if the + // local peer is a valid leader. + let allow_fallback_leader_read = inspector + .inspect(&req) + .map_or(false, |r| r == RequestPolicy::ReadLocal); + if !allow_fallback_leader_read { + cb.set_result(ReadResponse { + response: err_resp, + snapshot: None, + txn_extra_op: TxnExtraOp::Noop, + }); + return; + } + if let Some(read_resp) = self.try_local_leader_read( + &req, + &mut delegate, + None, + &mut snap_updated, + last_valid_ts, + ) { + TLS_LOCAL_READ_METRICS.with(|m| { + m.borrow_mut() + .local_executed_stale_read_fallback_success_requests + .inc() + }); + read_resp + } else { + TLS_LOCAL_READ_METRICS.with(|m| { + m.borrow_mut() + .local_executed_stale_read_fallback_failure_requests + .inc() + }); + cb.set_result(ReadResponse { + response: err_resp, + snapshot: None, + txn_extra_op: TxnExtraOp::Noop, + }); + return; + } + } } - - // Stale read does not use cache, so we pass None for read_id - let mut local_read_ctx = LocalReadContext::new(&mut self.snap_cache, None); - snap_updated = local_read_ctx - .maybe_update_snapshot(delegate.get_tablet(), last_valid_ts); - - let region = Arc::clone(&delegate.region); - // Getting the snapshot - let mut response = - delegate.execute(&req, &region, None, Some(local_read_ctx)); - if let Some(snap) = response.snapshot.as_mut() { - snap.bucket_meta = delegate.bucket_meta.clone(); - } - // Double check in case `safe_ts` change after the first check and before - // getting snapshot - if let Err(resp) = delegate.check_stale_read_safe(read_ts) { - cb.set_result(ReadResponse { - response: resp, - snapshot: None, - txn_extra_op: TxnExtraOp::Noop, - }); - return; - } - TLS_LOCAL_READ_METRICS - .with(|m| m.borrow_mut().local_executed_stale_read_requests.inc()); - response } _ => unreachable!(), }; @@ -1587,6 +1677,8 @@ mod tests { read_progress.update_safe_ts(1, 1); assert_eq!(read_progress.safe_ts(), 1);
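The fallback path above rewrites the header so the retried request is judged as an ordinary lease read. Clearing the flag is plain bit arithmetic; a sketch with an assumed bit value (the real constant is `WriteBatchFlags::STALE_READ` in `txn_types`):

// STALE_READ's actual bit position is defined in txn_types; 0b0100 is only a
// stand-in for this sketch.
const STALE_READ: u64 = 0b0100;

fn unset_stale_flag(flags: u64) -> u64 {
    flags & !STALE_READ
}

fn main() {
    assert_eq!(unset_stale_flag(0b0111), 0b0011);
    // Clearing twice is idempotent.
    assert_eq!(unset_stale_flag(0b0011), 0b0011);
}

+ // Expire the lease manually to avoid a local retry on the leader peer.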
+ lease.expire(); let data = { let mut d = [0u8; 8]; (&mut d[..]).encode_u64(2).unwrap(); @@ -1744,13 +1836,14 @@ mod tests { assert_eq!(kv_engine.path(), tablet.path()); } - fn prepare_read_delegate( + fn prepare_read_delegate_with_lease( store_id: u64, region_id: u64, term: u64, pr_ids: Vec, region_epoch: RegionEpoch, store_meta: Arc>, + max_lease: Duration, ) { let mut region = metapb::Region::default(); region.set_id(region_id); @@ -1759,7 +1852,7 @@ mod tests { let leader = prs[0].clone(); region.set_region_epoch(region_epoch); - let mut lease = Lease::new(Duration::seconds(1), Duration::milliseconds(250)); // 1s is long enough. + let mut lease = Lease::new(max_lease, Duration::milliseconds(250)); // 1s is long enough. let read_progress = Arc::new(RegionReadProgress::new(®ion, 1, 1, 1)); // Register region @@ -1788,6 +1881,25 @@ mod tests { } } + fn prepare_read_delegate( + store_id: u64, + region_id: u64, + term: u64, + pr_ids: Vec, + region_epoch: RegionEpoch, + store_meta: Arc>, + ) { + prepare_read_delegate_with_lease( + store_id, + region_id, + term, + pr_ids, + region_epoch, + store_meta, + Duration::seconds(1), + ) + } + #[test] fn test_snap_across_regions() { let store_id = 2; @@ -2154,4 +2266,123 @@ mod tests { must_not_redirect(&mut reader, &rx, task); notify_rx.recv().unwrap(); } + + #[test] + fn test_stale_read_local_leader_fallback() { + let store_id = 2; + let store_meta = Arc::new(Mutex::new(StoreMeta::new(0))); + let (_tmp, mut reader, rx) = new_reader( + "test-stale-local-leader-fallback", + store_id, + store_meta.clone(), + ); + reader.kv_engine.put(b"key", b"value").unwrap(); + + let epoch13 = { + let mut ep = metapb::RegionEpoch::default(); + ep.set_conf_ver(1); + ep.set_version(3); + ep + }; + let term6 = 6; + + // Register region1. + let pr_ids1 = vec![2, 3, 4]; + let prs1 = new_peers(store_id, pr_ids1.clone()); + // Ensure the leader lease is long enough so the fallback would work. + prepare_read_delegate_with_lease( + store_id, + 1, + term6, + pr_ids1.clone(), + epoch13.clone(), + store_meta.clone(), + Duration::seconds(10), + ); + let leader1 = prs1[0].clone(); + + // Local read. + let mut cmd = RaftCmdRequest::default(); + let mut header = RaftRequestHeader::default(); + header.set_region_id(1); + header.set_peer(leader1); + header.set_region_epoch(epoch13.clone()); + header.set_term(term6); + header.set_flags(header.get_flags() | WriteBatchFlags::STALE_READ.bits()); + cmd.set_header(header.clone()); + let mut req = Request::default(); + req.set_cmd_type(CmdType::Snap); + cmd.set_requests(vec![req].into()); + + // A peer can serve read_ts < safe_ts. + let safe_ts = TimeStamp::compose(2, 0); + { + let mut meta = store_meta.lock().unwrap(); + let delegate = meta.readers.get_mut(&1).unwrap(); + delegate + .read_progress + .update_safe_ts(1, safe_ts.into_inner()); + assert_eq!(delegate.read_progress.safe_ts(), safe_ts.into_inner()); + } + let read_ts_1 = TimeStamp::compose(1, 0); + let mut data = [0u8; 8]; + (&mut data[..]).encode_u64(read_ts_1.into_inner()).unwrap(); + header.set_flag_data(data.into()); + cmd.set_header(header.clone()); + let (snap_tx, snap_rx) = channel(); + let task = RaftCommand::::new( + cmd.clone(), + Callback::read(Box::new(move |resp: ReadResponse| { + snap_tx.send(resp).unwrap(); + })), + ); + must_not_redirect(&mut reader, &rx, task); + snap_rx.recv().unwrap().snapshot.unwrap(); + + // When read_ts > safe_ts, the leader peer could still serve if its lease is + // valid. 
+ let read_ts_2 = TimeStamp::compose(safe_ts.physical() + 201, 0); + let mut data = [0u8; 8]; + (&mut data[..]).encode_u64(read_ts_2.into_inner()).unwrap(); + header.set_flag_data(data.into()); + cmd.set_header(header.clone()); + let (snap_tx, snap_rx) = channel(); + let task = RaftCommand::<KvTestEngine>::new( + cmd.clone(), + Callback::read(Box::new(move |resp: ReadResponse<KvTestSnapshot>| { + snap_tx.send(resp).unwrap(); + })), + ); + must_not_redirect(&mut reader, &rx, task); + snap_rx.recv().unwrap().snapshot.unwrap(); + + // The fallback would not happen if the lease is not valid. + prepare_read_delegate_with_lease( + store_id, + 1, + term6, + pr_ids1, + epoch13, + store_meta, + Duration::milliseconds(1), + ); + thread::sleep(std::time::Duration::from_millis(50)); + let (snap_tx, snap_rx) = channel(); + let task2 = RaftCommand::<KvTestEngine>::new( + cmd.clone(), + Callback::read(Box::new(move |resp: ReadResponse<KvTestSnapshot>| { + snap_tx.send(resp).unwrap(); + })), + ); + must_not_redirect(&mut reader, &rx, task2); + assert!( + snap_rx + .recv() + .unwrap() + .response + .get_header() + .get_error() + .has_data_is_not_ready() + ); + } } diff --git a/components/raftstore/src/store/worker/region.rs b/components/raftstore/src/store/worker/region.rs index 068904b2a67..fe5ef64b9cd 100644 --- a/components/raftstore/src/store/worker/region.rs +++ b/components/raftstore/src/store/worker/region.rs @@ -46,7 +46,7 @@ use crate::{ }, snap::{plain_file_used, Error, Result, SNAPSHOT_CFS}, transport::CasualRouter, - ApplyOptions, CasualMessage, Config, SnapEntry, SnapKey, SnapManager, + ApplyOptions, CasualMessage, Config, SnapEntry, SnapError, SnapKey, SnapManager, }, }; @@ -454,6 +454,9 @@ where fn apply_snap(&mut self, region_id: u64, peer_id: u64, abort: Arc) -> Result<()> { info!("begin apply snap data"; "region_id" => region_id, "peer_id" => peer_id); fail_point!("region_apply_snap", |_| { Ok(()) }); + fail_point!("region_apply_snap_io_err", |_| { + Err(SnapError::Other(box_err!("io error"))) + }); check_abort(&abort)?; let mut region_state = self.region_state(region_id)?; @@ -521,10 +524,11 @@ where let start = Instant::now(); - match self.apply_snap(region_id, peer_id, Arc::clone(&status)) { + let tombstone = match self.apply_snap(region_id, peer_id, Arc::clone(&status)) { Ok(()) => { status.swap(JOB_STATUS_FINISHED, Ordering::SeqCst); SNAP_COUNTER.apply.success.inc(); + false } Err(Error::Abort) => { warn!("applying snapshot is aborted"; "region_id" => region_id); @@ -535,18 +539,29 @@ where JOB_STATUS_CANCELLING ); SNAP_COUNTER.apply.abort.inc(); + // Applying the snapshot was aborted; it's not necessary to tombstone the peer. + false } Err(e) => { - error!(%e; "failed to apply snap!!!"); + warn!("failed to apply snap!!!"; "region_id" => region_id, "err" => %e); + self.coprocessor_host + .cancel_apply_snapshot(region_id, peer_id); status.swap(JOB_STATUS_FAILED, Ordering::SeqCst); SNAP_COUNTER.apply.fail.inc();
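The three apply outcomes now fold into a single `tombstone` flag that rides along with `SnapshotApplied`. The mapping, spelled out with illustrative enum names:

// Whether the peer should be tombstoned after a snapshot-apply attempt.
enum ApplyOutcome {
    Finished,
    Aborted,
    Failed,
}

fn should_tombstone(outcome: ApplyOutcome) -> bool {
    match outcome {
        // Success, or an abort requested by the peer itself, keeps the peer.
        ApplyOutcome::Finished | ApplyOutcome::Aborted => false,
        // An apply error (e.g. an I/O error) leaves the region data unusable.
        ApplyOutcome::Failed => true,
    }
}

fn main() {
    assert!(should_tombstone(ApplyOutcome::Failed));
    assert!(!should_tombstone(ApplyOutcome::Aborted));
}

+ // As applying the snapshot failed, the related peer should be marked as
+ // tombstone. The abnormal snapshot itself will be cleaned up automatically
+ // by the CleanupWorker later.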
+ true } - } + }; SNAP_HISTOGRAM .apply .observe(start.saturating_elapsed_secs()); - let _ = self.router.send(region_id, CasualMessage::SnapshotApplied); + let _ = self.router.send( + region_id, + CasualMessage::SnapshotApplied { peer_id, tombstone }, + ); } /// Tries to clean up files in pending ranges overlapping with the given @@ -1275,7 +1290,7 @@ pub(crate) mod tests { let wait_apply_finish = |ids: &[u64]| { for id in ids { match receiver.recv_timeout(Duration::from_secs(5)) { - Ok((region_id, CasualMessage::SnapshotApplied)) => { + Ok((region_id, CasualMessage::SnapshotApplied { .. })) => { assert_eq!(region_id, *id); } msg => panic!("expected {} SnapshotApplied, but got {:?}", id, msg), diff --git a/components/raftstore/src/store/worker/split_check.rs b/components/raftstore/src/store/worker/split_check.rs index 4ff853f70a0..e3c0042acf0 100644 --- a/components/raftstore/src/store/worker/split_check.rs +++ b/components/raftstore/src/store/worker/split_check.rs @@ -5,6 +5,7 @@ use std::{ collections::BinaryHeap, fmt::{self, Display, Formatter}, mem, + sync::Arc, }; use engine_traits::{ @@ -12,21 +13,23 @@ use engine_traits::{ }; use file_system::{IoType, WithIoType}; use itertools::Itertools; -use kvproto::{metapb::Region, pdpb::CheckPolicy}; +use kvproto::{ + metapb::{Region, RegionEpoch}, + pdpb::CheckPolicy, +}; use online_config::{ConfigChange, OnlineConfig}; +use pd_client::{BucketMeta, BucketStat}; use tikv_util::{ box_err, debug, error, info, keybuilder::KeyBuilder, warn, worker::Runnable, Either, }; use txn_types::Key; use super::metrics::*; -#[cfg(any(test, feature = "testexport"))] -use crate::coprocessor::Config; use crate::{ coprocessor::{ dispatcher::StoreHandle, split_observer::{is_valid_split_key, strip_timestamp_if_exists}, - CoprocessorHost, SplitCheckerHost, + Config, CoprocessorHost, SplitCheckerHost, }, Result, }; @@ -144,6 +147,216 @@ pub struct Bucket { pub size: u64, } +#[derive(Debug, Clone, Default)] +pub struct BucketStatsInfo { + // The stats track the incremental flow. + bucket_stat: Option<BucketStat>, + // The report bucket stat records the incremental stats since the last report + // to PD. It is reset after each report. + report_bucket_stat: Option<BucketStat>, + // To avoid version rollback, this records the last bucket version whenever + // the bucket stat is not none. + last_bucket_version: u64, +} + +impl BucketStatsInfo {
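Bucket metadata keeps n + 1 boundary keys for n buckets, so bucket i spans keys[i]..keys[i+1]. A standalone version of the oversize scan the method below performs (names simplified, keys as strings for readability):

// Buckets are delimited by n + 1 keys; bucket i covers keys[i]..keys[i+1].
fn oversized_buckets(keys: &[&str], write_bytes: &[u64], sizes: &[u64], max: u64) -> Vec<(String, String)> {
    assert_eq!(keys.len(), write_bytes.len() + 1);
    (0..write_bytes.len())
        .filter(|&i| write_bytes[i] + sizes[i] >= max)
        .map(|i| (keys[i].to_string(), keys[i + 1].to_string()))
        .collect()
}

fn main() {
    // Only the first bucket's flow plus size crosses the threshold.
    let ranges = oversized_buckets(&["", "100", "200", ""], &[50, 5, 80], &[60, 10, 10], 100);
    assert_eq!(ranges, vec![("".into(), "100".into())]);
}

+ /// Returns all bucket ranges whose accumulated write bytes plus current size
+ /// exceed the given `region_bucket_max_size`.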
+ pub fn gen_bucket_range_for_update( + &self, + region_bucket_max_size: u64, + ) -> Option<Vec<BucketRange>> { + let region_buckets = self.bucket_stat.as_ref()?; + let stats = &region_buckets.stats; + let keys = &region_buckets.meta.keys; + let sizes = &region_buckets.meta.sizes; + + let mut suspect_bucket_ranges = vec![]; + assert_eq!(keys.len(), stats.write_bytes.len() + 1); + for i in 0..stats.write_bytes.len() { + let estimated_bucket_size = stats.write_bytes[i] + sizes[i]; + if estimated_bucket_size >= region_bucket_max_size { + suspect_bucket_ranges.push(BucketRange(keys[i].clone(), keys[i + 1].clone())); + } + } + Some(suspect_bucket_ranges) + } + + #[inline] + pub fn version(&self) -> u64 { + self.bucket_stat + .as_ref() + .map_or(self.last_bucket_version, |b| b.meta.version) + } + + #[inline] + pub fn add_bucket_flow(&mut self, delta: &Option<BucketStat>) { + if let (Some(buckets), Some(report_buckets), Some(delta)) = ( + self.bucket_stat.as_mut(), + self.report_bucket_stat.as_mut(), + delta, + ) { + buckets.merge(delta); + report_buckets.merge(delta); + } + } + + #[inline] + pub fn set_bucket_stat(&mut self, buckets: Option<BucketStat>) { + self.bucket_stat = buckets.clone(); + if let Some(new_buckets) = buckets { + self.last_bucket_version = new_buckets.meta.version; + let mut new_report_buckets = BucketStat::from_meta(new_buckets.meta); + if let Some(old) = &mut self.report_bucket_stat { + new_report_buckets.merge(old); + *old = new_report_buckets; + } else { + self.report_bucket_stat = Some(new_report_buckets); + } + } else { + self.report_bucket_stat = None; + } + } + + #[inline] + pub fn report_bucket_stat(&mut self) -> BucketStat { + let current = self.report_bucket_stat.as_mut().unwrap(); + let delta = current.clone(); + current.clear_stats(); + delta + } + + #[inline] + pub fn bucket_stat(&self) -> &Option<BucketStat> { + &self.bucket_stat + } + + #[inline] + pub fn bucket_stat_mut(&mut self) -> Option<&mut BucketStat> { + self.bucket_stat.as_mut() + } + + pub fn on_refresh_region_buckets( + &mut self, + cfg: &Config, + next_bucket_version: u64, + buckets: Vec<Bucket>, + region_epoch: RegionEpoch, + region: &Region, + bucket_ranges: Option<Vec<BucketRange>>, + ) -> bool { + let change_bucket_version: bool; + // The region buckets are reset after this region has split or merged. + // The message should be dropped if its epoch is lower than the region's. + // The bucket ranges are none when the region buckets are also none. + // So this condition indicates that the region buckets need to be refreshed, + // not renewed. + if let Some(bucket_ranges) = bucket_ranges && self.bucket_stat.is_some() { + assert_eq!(buckets.len(), bucket_ranges.len()); + change_bucket_version = self.update_buckets(cfg, next_bucket_version, buckets, region_epoch, &bucket_ranges); + } else { + change_bucket_version = true; + // When the region buckets are none, the single bucket covers all the + // bucket keys. + self.init_buckets(cfg, next_bucket_version, buckets, region_epoch, region); + } + change_bucket_version + } + + fn update_buckets( + &mut self, + cfg: &Config, + next_bucket_version: u64, + buckets: Vec<Bucket>, + region_epoch: RegionEpoch, + bucket_ranges: &Vec<BucketRange>, + ) -> bool { + let origin_region_buckets = self.bucket_stat.as_ref().unwrap(); + let mut change_bucket_version = false; + let mut meta_idx = 0; + let mut region_buckets = origin_region_buckets.clone(); + let mut meta = (*region_buckets.meta).clone(); + meta.region_epoch = region_epoch; + + // Bucket stats will be cleaned if the bucket size is updated.
+ for (bucket, bucket_range) in buckets.into_iter().zip(bucket_ranges) { + // The bucket ranges may split or merge only part of the meta keys, so we + // need to find the first matching key. + while meta_idx < meta.keys.len() && meta.keys[meta_idx] != bucket_range.0 { + meta_idx += 1; + } + // meta_idx must not be the last entry (which is the end key) + if meta_idx >= meta.keys.len() - 1 { + break; + } + // If the bucket is small and has no split keys, it should be merged + // with its left neighbor. + let region_bucket_merge_size = + cfg.region_bucket_merge_size_ratio * (cfg.region_bucket_size.0 as f64); + if bucket.keys.is_empty() && bucket.size <= (region_bucket_merge_size as u64) { + meta.sizes[meta_idx] = bucket.size; + region_buckets.clean_stats(meta_idx); + // The region has more than one bucket, and the left neighbor plus + // the current bucket is not too big. + if meta.keys.len() > 2 + && meta_idx != 0 + && meta.sizes[meta_idx - 1] + bucket.size < cfg.region_bucket_size.0 * 2 + { + // bucket is too small + region_buckets.left_merge(meta_idx); + meta.left_merge(meta_idx); + change_bucket_version = true; + continue; + } + } else { + // update size + meta.sizes[meta_idx] = bucket.size / (bucket.keys.len() + 1) as u64; + region_buckets.clean_stats(meta_idx); + // insert new bucket keys (split the original bucket) + for bucket_key in bucket.keys { + meta_idx += 1; + region_buckets.split(meta_idx); + meta.split(meta_idx, bucket_key); + change_bucket_version = true; + } + } + meta_idx += 1; + } + if change_bucket_version { + meta.version = next_bucket_version; + } + region_buckets.meta = Arc::new(meta); + self.set_bucket_stat(Some(region_buckets)); + change_bucket_version + } + + fn init_buckets( + &mut self, + cfg: &Config, + next_bucket_version: u64, + mut buckets: Vec<Bucket>, + region_epoch: RegionEpoch, + region: &Region, + ) { + // When the region buckets are none, the single bucket covers all the + // bucket keys. + assert_eq!(buckets.len(), 1); + let bucket_keys = buckets.pop().unwrap().keys; + let bucket_count = bucket_keys.len() + 1; + let mut meta = BucketMeta { + region_id: region.get_id(), + region_epoch, + version: next_bucket_version, + keys: bucket_keys, + sizes: vec![cfg.region_bucket_size.0; bucket_count], + }; + // Pad the boundary keys and initialize the flow. + meta.keys.insert(0, region.get_start_key().to_vec()); + meta.keys.push(region.get_end_key().to_vec()); + let bucket_stats = BucketStat::from_meta(Arc::new(meta)); + self.set_bucket_stat(Some(bucket_stats)); + } +} + pub enum Task { SplitCheckTask { region: Region, @@ -482,6 +695,19 @@ impl Runner { }; if !split_keys.is_empty() {
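left_merge drops one boundary key and folds a bucket's size into its left neighbor; the operation in miniature (plain integers standing in for keys and sizes, not the real `BucketMeta` API):

// Merge bucket i into its left neighbor: drop boundary key i, add sizes.
fn left_merge(keys: &mut Vec<u64>, sizes: &mut Vec<u64>, i: usize) {
    assert!(i >= 1 && i < sizes.len());
    keys.remove(i);
    let s = sizes.remove(i);
    sizes[i - 1] += s;
}

fn main() {
    let mut keys = vec![0, 50, 100, u64::MAX]; // n + 1 boundaries
    let mut sizes = vec![10, 5, 90]; // n buckets
    left_merge(&mut keys, &mut sizes, 1); // bucket [50, 100) is tiny: merge left
    assert_eq!(keys, vec![0, 100, u64::MAX]);
    assert_eq!(sizes, vec![15, 90]);
}

+ // Notify the peer whether the region is truly splittable.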
+ // If it is truly splittable, `skip_split_check` should be false. + self.router.update_approximate_size( + region.get_id(), + None, + Some(!split_keys.is_empty()), + ); + self.router.update_approximate_keys( + region.get_id(), + None, + Some(!split_keys.is_empty()), + ); + let region_epoch = region.get_region_epoch().clone(); self.router .ask_split(region_id, region_epoch, split_keys, "split checker".into()); @@ -523,6 +749,7 @@ impl Runner { } else { (!host.enable_region_bucket(), &empty_bucket) }; + let mut split_keys = vec![]; MergedIterator::<::Iterator>::new( tablet, LARGE_CFS, start_key, end_key, false, @@ -535,6 +762,7 @@ let mut skip_on_kv = false; while let Some(e) = iter.next() { if skip_on_kv && skip_check_bucket { + split_keys = host.split_keys(); return; } if !skip_on_kv && host.on_kv(region, &e) { @@ -597,6 +825,8 @@ } } + split_keys = host.split_keys(); + // if we scan the whole range, we can update approximate size and keys with // accurate value. if is_key_range { @@ -610,8 +840,17 @@ "bucket_count" => buckets.len(), "bucket_size" => bucket_size, ); - self.router.update_approximate_size(region.get_id(), size); - self.router.update_approximate_keys(region.get_id(), keys); + + self.router.update_approximate_size( + region.get_id(), + Some(size), + Some(!split_keys.is_empty()), + ); + self.router.update_approximate_keys( + region.get_id(), + Some(keys), + Some(!split_keys.is_empty()), + ); })?; if host.enable_region_bucket() { @@ -626,7 +865,7 @@ } timer.observe_duration(); - Ok(host.split_keys()) + Ok(split_keys) } fn change_cfg(&mut self, change: ConfigChange) { @@ -702,3 +941,178 @@ where } } } + +#[cfg(test)] +mod tests { + use super::*; +
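The mock below exercises `init_buckets`, which pads the caller's split keys with the region's own start and end keys. The padding step in isolation (hypothetical helper name):

fn pad_bucket_keys(mut split_keys: Vec<Vec<u8>>, start: &[u8], end: &[u8]) -> Vec<Vec<u8>> {
    // n split keys become n + 2 boundaries, delimiting n + 1 buckets.
    split_keys.insert(0, start.to_vec());
    split_keys.push(end.to_vec());
    split_keys
}

fn main() {
    let keys = pad_bucket_keys(vec![b"100".to_vec(), b"200".to_vec()], b"", b"");
    assert_eq!(keys.len(), 4); // ["", "100", "200", ""] => three buckets
}

+ // Creates a BucketStatsInfo whose meta keys are ["", "100", "200", ""], i.e. three buckets.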
+ fn mock_bucket_stats_info() -> BucketStatsInfo { + let mut bucket_stats_info = BucketStatsInfo::default(); + let cfg = Config::default(); + let next_bucket_version = 1; + let bucket_ranges = None; + let mut region_epoch = RegionEpoch::default(); + region_epoch.set_conf_ver(1); + region_epoch.set_version(1); + let mut region = Region::default(); + region.set_id(1); + + let mut buckets = vec![]; + let mut bucket = Bucket::default(); + bucket.keys.push(vec![100]); + bucket.keys.push(vec![200]); + buckets.insert(0, bucket); + + let _ = bucket_stats_info.on_refresh_region_buckets( + &cfg, + next_bucket_version, + buckets, + region_epoch, + &region, + bucket_ranges, + ); + bucket_stats_info + } + + #[test] + pub fn test_version() { + let mut bucket_stats_info = mock_bucket_stats_info(); + assert_eq!(1, bucket_stats_info.version()); + bucket_stats_info.set_bucket_stat(None); + assert_eq!(1, bucket_stats_info.version()); + + let mut meta = BucketMeta::default(); + meta.version = 2; + meta.keys.push(vec![]); + meta.keys.push(vec![]); + let bucket_stat = BucketStat::from_meta(Arc::new(meta)); + bucket_stats_info.set_bucket_stat(Some(bucket_stat)); + assert_eq!(2, bucket_stats_info.version()); + } + + #[test] + pub fn test_insert_new_buckets() { + let bucket_stats_info = mock_bucket_stats_info(); + + let cfg = Config::default(); + let bucket_stat = bucket_stats_info.bucket_stat.unwrap(); + assert_eq!( + vec![vec![], vec![100], vec![200], vec![]], + bucket_stat.meta.keys + ); + for i in 0..bucket_stat.stats.write_bytes.len() { + assert_eq!(cfg.region_bucket_size.0, bucket_stat.meta.sizes[i]); + assert_eq!(0, bucket_stat.stats.write_bytes[i]); + } + } + + #[test] + pub fn test_report_buckets() { + let mut bucket_stats_info = mock_bucket_stats_info(); + let bucket_stats = bucket_stats_info.bucket_stat().as_ref().unwrap(); + let mut delta_bucket_stats = bucket_stats.clone(); + delta_bucket_stats.write_key(&[1], 1); + delta_bucket_stats.write_key(&[201], 1); + bucket_stats_info.add_bucket_flow(&Some(delta_bucket_stats.clone())); + let bucket_stats = bucket_stats_info.report_bucket_stat(); + assert_eq!(vec![2, 0, 2], bucket_stats.stats.write_bytes); + + let report_bucket_stats = bucket_stats_info.report_bucket_stat(); + assert_eq!(vec![0, 0, 0], report_bucket_stats.stats.write_bytes); + bucket_stats_info.add_bucket_flow(&Some(delta_bucket_stats)); + assert_eq!(vec![2, 0, 2], bucket_stats.stats.write_bytes); + } + + #[test] + pub fn test_split_and_merge_buckets() { + let mut bucket_stats_info = mock_bucket_stats_info(); + let next_bucket_version = 2; + let mut region = Region::default(); + region.set_id(1); + let cfg = Config::default(); + let bucket_size = cfg.region_bucket_size.0; + let bucket_stats = bucket_stats_info.bucket_stat().as_ref().unwrap(); + let region_epoch = bucket_stats.meta.region_epoch.clone(); + + // step1: update buckets flow + let mut delta_bucket_stats = bucket_stats.clone(); + delta_bucket_stats.write_key(&[1], 1); + delta_bucket_stats.write_key(&[201], 1); + bucket_stats_info.add_bucket_flow(&Some(delta_bucket_stats)); + let bucket_stats = bucket_stats_info.bucket_stat().as_ref().unwrap(); + assert_eq!(vec![2, 0, 2], bucket_stats.stats.write_bytes); + + // step2: a tick does not affect anything + let bucket_ranges = Some(vec![]); + let buckets = vec![]; + let mut change_bucket_version = bucket_stats_info.on_refresh_region_buckets( + &cfg, + next_bucket_version, + buckets, + region_epoch.clone(), + &region, + bucket_ranges, + ); + let bucket_stats = bucket_stats_info.bucket_stat().as_ref().unwrap();
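Several assertions in these tests lean on `report_bucket_stat`'s take-and-reset contract: hand out the accumulated delta, keep a zeroed accumulator behind. The same idea standalone (illustrative helper name):

fn take_delta(acc: &mut Vec<u64>) -> Vec<u64> {
    let zeroed = vec![0; acc.len()];
    // Hand the delta to the caller and keep zeroed slots for the next round.
    std::mem::replace(acc, zeroed)
}

fn main() {
    let mut acc = vec![2, 0, 2];
    assert_eq!(take_delta(&mut acc), vec![2, 0, 2]);
    assert_eq!(acc, vec![0, 0, 0]);
}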
+ assert!(!change_bucket_version); + assert_eq!(vec![2, 0, 2], bucket_stats.stats.write_bytes); + + // step3: split key 50 + let mut bucket_ranges = Some(vec![BucketRange(vec![], vec![100])]); + let mut bucket = Bucket::default(); + bucket.keys = vec![vec![50]]; + bucket.size = bucket_size; + let mut buckets = vec![bucket]; + change_bucket_version = bucket_stats_info.on_refresh_region_buckets( + &cfg, + next_bucket_version, + buckets.clone(), + region_epoch.clone(), + &region, + bucket_ranges.clone(), + ); + assert!(change_bucket_version); + let bucket_stats = bucket_stats_info.bucket_stat().as_ref().unwrap(); + assert_eq!( + vec![vec![], vec![50], vec![100], vec![200], vec![]], + bucket_stats.meta.keys + ); + assert_eq!( + vec![bucket_size / 2, bucket_size / 2, bucket_size, bucket_size], + bucket_stats.meta.sizes + ); + assert_eq!(vec![0, 0, 0, 2], bucket_stats.stats.write_bytes); + + // step4: merge [50, 100) into [0, 50) + bucket_ranges = Some(vec![BucketRange(vec![50], vec![100])]); + let mut bucket = Bucket::default(); + bucket.keys = vec![]; + bucket.size = 0; + buckets = vec![bucket]; + change_bucket_version = bucket_stats_info.on_refresh_region_buckets( + &cfg, + next_bucket_version, + buckets, + region_epoch, + &region, + bucket_ranges, + ); + assert!(change_bucket_version); + + let bucket_stats = bucket_stats_info.bucket_stat().as_ref().unwrap(); + assert_eq!( + vec![vec![], vec![100], vec![200], vec![]], + bucket_stats.meta.keys + ); + assert_eq!( + vec![bucket_size / 2, bucket_size, bucket_size], + bucket_stats.meta.sizes + ); + assert_eq!(vec![0, 0, 2], bucket_stats.stats.write_bytes); + + // The report buckets are not affected by the split and merge. + let report_bucket_stats = bucket_stats_info.report_bucket_stat(); + assert_eq!(vec![4, 0, 2], report_bucket_stats.stats.write_bytes); + } +} diff --git a/components/raftstore/src/store/worker/split_config.rs b/components/raftstore/src/store/worker/split_config.rs index 8fec853bb00..2d29bd21a89 100644 --- a/components/raftstore/src/store/worker/split_config.rs +++ b/components/raftstore/src/store/worker/split_config.rs @@ -68,18 +68,18 @@ pub fn get_sample_num() -> usize { #[serde(default)] #[serde(rename_all = "kebab-case")] pub struct SplitConfig { - pub qps_threshold: usize, + pub qps_threshold: Option<usize>, pub split_balance_score: f64, pub split_contained_score: f64, pub detect_times: u64, pub sample_num: usize, pub sample_threshold: u64, - pub byte_threshold: usize, + pub byte_threshold: Option<usize>, #[doc(hidden)] pub grpc_thread_cpu_overload_threshold_ratio: f64, #[doc(hidden)] pub unified_read_pool_thread_cpu_overload_threshold_ratio: f64, - pub region_cpu_overload_threshold_ratio: f64, + pub region_cpu_overload_threshold_ratio: Option<f64>,
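Making the thresholds `Option` lets `optimize_for` distinguish "user-set" from "unset": only unset values get a size-dependent default, and reads resolve through accessors. The resolution pattern in miniature (the constant values below are illustrative, not the real defaults):

struct Cfg {
    qps_threshold: Option<usize>,
}

const DEFAULT: usize = 3000; // illustrative value
const BIG_REGION_DEFAULT: usize = 7000; // illustrative value

impl Cfg {
    fn qps_threshold(&self) -> usize {
        self.qps_threshold.unwrap_or(DEFAULT)
    }
    fn optimize_for(&mut self, big_region: bool) {
        // Only fill in a value when the user has not set one explicitly.
        if self.qps_threshold.is_none() {
            self.qps_threshold = Some(if big_region { BIG_REGION_DEFAULT } else { DEFAULT });
        }
    }
}

fn main() {
    let mut cfg = Cfg { qps_threshold: None };
    cfg.optimize_for(true);
    assert_eq!(cfg.qps_threshold(), BIG_REGION_DEFAULT);
}

// deprecated.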
#[online_config(skip)] #[doc(hidden)] @@ -95,18 +95,18 @@ pub struct SplitConfig { impl Default for SplitConfig { fn default() -> SplitConfig { SplitConfig { - qps_threshold: DEFAULT_QPS_THRESHOLD, + qps_threshold: None, split_balance_score: DEFAULT_SPLIT_BALANCE_SCORE, split_contained_score: DEFAULT_SPLIT_CONTAINED_SCORE, detect_times: DEFAULT_DETECT_TIMES, sample_num: DEFAULT_SAMPLE_NUM, sample_threshold: DEFAULT_SAMPLE_THRESHOLD, - byte_threshold: DEFAULT_BYTE_THRESHOLD, + byte_threshold: None, grpc_thread_cpu_overload_threshold_ratio: DEFAULT_GRPC_THREAD_CPU_OVERLOAD_THRESHOLD_RATIO, unified_read_pool_thread_cpu_overload_threshold_ratio: DEFAULT_UNIFIED_READ_POOL_THREAD_CPU_OVERLOAD_THRESHOLD_RATIO, - region_cpu_overload_threshold_ratio: REGION_CPU_OVERLOAD_THRESHOLD_RATIO, + region_cpu_overload_threshold_ratio: None, size_threshold: None, // deprecated. key_threshold: None, // deprecated. } @@ -124,7 +124,7 @@ impl SplitConfig { ("split_balance_score or split_contained_score should be between 0 and 1.").into(), ); } - if self.sample_num >= self.qps_threshold { + if self.sample_num >= self.qps_threshold() { return Err( ("sample_num should be less than qps_threshold for load-base-split.").into(), ); @@ -133,20 +133,52 @@ impl SplitConfig { || self.grpc_thread_cpu_overload_threshold_ratio < 0.0 || self.unified_read_pool_thread_cpu_overload_threshold_ratio > 1.0 || self.unified_read_pool_thread_cpu_overload_threshold_ratio < 0.0 - || self.region_cpu_overload_threshold_ratio > 1.0 - || self.region_cpu_overload_threshold_ratio < 0.0 + || self.region_cpu_overload_threshold_ratio() > 1.0 + || self.region_cpu_overload_threshold_ratio() < 0.0 { return Err(("threshold ratio should be between 0 and 1.").into()); } Ok(()) } + pub fn qps_threshold(&self) -> usize { + self.qps_threshold.unwrap_or(DEFAULT_QPS_THRESHOLD) + } + + pub fn byte_threshold(&self) -> usize { + self.byte_threshold.unwrap_or(DEFAULT_BYTE_THRESHOLD) + } + + pub fn region_cpu_overload_threshold_ratio(&self) -> f64 { + self.region_cpu_overload_threshold_ratio + .unwrap_or(REGION_CPU_OVERLOAD_THRESHOLD_RATIO) + } + pub fn optimize_for(&mut self, region_size: ReadableSize) { const LARGE_REGION_SIZE_IN_MB: u64 = 4096; - if region_size.as_mb() >= LARGE_REGION_SIZE_IN_MB { - self.qps_threshold = DEFAULT_BIG_REGION_QPS_THRESHOLD; - self.region_cpu_overload_threshold_ratio = BIG_REGION_CPU_OVERLOAD_THRESHOLD_RATIO; - self.byte_threshold = DEFAULT_BIG_REGION_BYTE_THRESHOLD; + let big_size = region_size.as_mb() >= LARGE_REGION_SIZE_IN_MB; + if self.qps_threshold.is_none() { + self.qps_threshold = Some(if big_size { + DEFAULT_BIG_REGION_QPS_THRESHOLD + } else { + DEFAULT_QPS_THRESHOLD + }); + } + + if self.byte_threshold.is_none() { + self.byte_threshold = Some(if big_size { + DEFAULT_BIG_REGION_BYTE_THRESHOLD + } else { + DEFAULT_BYTE_THRESHOLD + }); + } + + if self.region_cpu_overload_threshold_ratio.is_none() { + self.region_cpu_overload_threshold_ratio = Some(if big_size { + BIG_REGION_CPU_OVERLOAD_THRESHOLD_RATIO + } else { + REGION_CPU_OVERLOAD_THRESHOLD_RATIO + }); } } } diff --git a/components/raftstore/src/store/worker/split_controller.rs b/components/raftstore/src/store/worker/split_controller.rs index d432f264e01..5fef6fd76a0 100644 --- a/components/raftstore/src/store/worker/split_controller.rs +++ b/components/raftstore/src/store/worker/split_controller.rs @@ -2,12 +2,13 @@ use std::{ cmp::{min, Ordering}, - collections::{BinaryHeap, HashMap, HashSet}, + collections::{BinaryHeap, HashSet}, slice::{Iter, IterMut}, 
sync::{mpsc::Receiver, Arc}, time::{Duration, SystemTime}, }; +use collections::HashMap; use kvproto::{ kvrpcpb::KeyRange, metapb::{self, Peer}, @@ -21,6 +22,7 @@ use tikv_util::{ debug, info, metrics::ThreadInfoStatistics, store::{is_read_query, QueryStats}, + time::Instant, warn, }; @@ -608,7 +610,7 @@ impl AutoSplitController { } fn should_check_region_cpu(&self) -> bool { - self.cfg.region_cpu_overload_threshold_ratio > 0.0 + self.cfg.region_cpu_overload_threshold_ratio() > 0.0 } fn is_grpc_poll_busy(&self, avg_grpc_thread_usage: f64) -> bool { @@ -643,16 +645,20 @@ impl AutoSplitController { return false; } region_cpu_usage / unified_read_pool_thread_usage - >= self.cfg.region_cpu_overload_threshold_ratio + >= self.cfg.region_cpu_overload_threshold_ratio() } // collect the read stats from read_stats_vec and dispatch them to a Region // HashMap. - fn collect_read_stats(read_stats_vec: Vec) -> HashMap> { + fn collect_read_stats( + ctx: &mut AutoSplitControllerContext, + read_stats_receiver: &Receiver, + ) -> HashMap> { + let read_stats_vec = ctx.batch_recv_read_stats(read_stats_receiver); // RegionID -> Vec, collect the RegionInfo from different threads. let mut region_infos_map = HashMap::default(); let capacity = read_stats_vec.len(); - for read_stats in read_stats_vec { + for read_stats in read_stats_vec.drain(..) { for (region_id, region_info) in read_stats.region_infos { let region_infos = region_infos_map .entry(region_id) @@ -665,19 +671,27 @@ impl AutoSplitController { // collect the CPU stats from cpu_stats_vec and dispatch them to a Region // HashMap. - fn collect_cpu_stats( - &self, - cpu_stats_vec: Vec>, - ) -> HashMap)> { + fn collect_cpu_stats<'c>( + &mut self, + ctx: &'c mut AutoSplitControllerContext, + cpu_stats_receiver: &Receiver>, + ) -> &'c HashMap)> { // RegionID -> (CPU usage, Hottest Key Range), calculate the CPU usage and its // hottest key range. - let mut region_cpu_map = HashMap::default(); if !self.should_check_region_cpu() { - return region_cpu_map; + return ctx.empty_region_cpu_map(); } + + let ( + cpu_stats_vec, + CpuStatsCache { + region_cpu_map, + hottest_key_range_cpu_time_map, + }, + ) = ctx.batch_recv_cpu_stats(cpu_stats_receiver); // Calculate the Region CPU usage. let mut collect_interval_ms = 0; - let mut region_key_range_cpu_time_map = HashMap::new(); + let mut region_key_range_cpu_time_map = HashMap::default(); cpu_stats_vec.iter().for_each(|cpu_stats| { cpu_stats.records.iter().for_each(|(tag, record)| { // Calculate the Region ID -> CPU Time. @@ -704,7 +718,6 @@ impl AutoSplitController { } }); // Choose the hottest key range for each Region. - let mut hottest_key_range_cpu_time_map = HashMap::with_capacity(region_cpu_map.len()); region_key_range_cpu_time_map .iter() .for_each(|((region_id, key_range), cpu_time)| { @@ -740,15 +753,17 @@ impl AutoSplitController { // be split according to all the stats info the recorder has collected before. 
pub fn flush( &mut self, - read_stats_vec: Vec, - cpu_stats_vec: Vec>, - thread_stats: &ThreadInfoStatistics, + ctx: &mut AutoSplitControllerContext, + read_stats_receiver: &Receiver, + cpu_stats_receiver: &Receiver>, + thread_stats: &mut ThreadInfoStatistics, ) -> (Vec, Vec) { let mut top_cpu_usage = vec![]; let mut top_qps = BinaryHeap::with_capacity(TOP_N); - let region_infos_map = Self::collect_read_stats(read_stats_vec); - let region_cpu_map = self.collect_cpu_stats(cpu_stats_vec); + let region_infos_map = Self::collect_read_stats(ctx, read_stats_receiver); + let region_cpu_map = self.collect_cpu_stats(ctx, cpu_stats_receiver); // Prepare some diagnostic info. + thread_stats.record(); let (grpc_thread_usage, unified_read_pool_thread_usage) = ( Self::collect_thread_usage(thread_stats, "grpc-server"), Self::collect_thread_usage(thread_stats, "unified-read-po"), @@ -787,9 +802,9 @@ impl AutoSplitController { debug!("load base split params"; "region_id" => region_id, "qps" => qps, - "qps_threshold" => self.cfg.qps_threshold, + "qps_threshold" => self.cfg.qps_threshold(), "byte" => byte, - "byte_threshold" => self.cfg.byte_threshold, + "byte_threshold" => self.cfg.byte_threshold(), "cpu_usage" => cpu_usage, "is_region_busy" => is_region_busy, ); @@ -800,8 +815,8 @@ impl AutoSplitController { // 1. If the QPS or the byte does not meet the threshold, skip. // 2. If the Unified Read Pool or the region is not hot enough, skip. - if qps < self.cfg.qps_threshold - && byte < self.cfg.byte_threshold + if qps < self.cfg.qps_threshold() + && byte < self.cfg.byte_threshold() && (!is_unified_read_pool_busy || !is_region_busy) { self.recorders.remove_entry(®ion_id); @@ -917,13 +932,13 @@ impl AutoSplitController { pub fn refresh_and_check_cfg(&mut self) -> SplitConfigChange { let mut cfg_change = SplitConfigChange::Noop; if let Some(incoming) = self.cfg_tracker.any_new() { - if self.cfg.region_cpu_overload_threshold_ratio <= 0.0 - && incoming.region_cpu_overload_threshold_ratio > 0.0 + if self.cfg.region_cpu_overload_threshold_ratio() <= 0.0 + && incoming.region_cpu_overload_threshold_ratio() > 0.0 { cfg_change = SplitConfigChange::UpdateRegionCpuCollector(true); } - if self.cfg.region_cpu_overload_threshold_ratio > 0.0 - && incoming.region_cpu_overload_threshold_ratio <= 0.0 + if self.cfg.region_cpu_overload_threshold_ratio() > 0.0 + && incoming.region_cpu_overload_threshold_ratio() <= 0.0 { cfg_change = SplitConfigChange::UpdateRegionCpuCollector(false); } @@ -939,16 +954,97 @@ impl AutoSplitController { } } +#[derive(Default)] +pub struct CpuStatsCache { + region_cpu_map: HashMap)>, + hottest_key_range_cpu_time_map: HashMap, +} + +pub struct AutoSplitControllerContext { + read_stats_vec: Vec, + cpu_stats_vec: Vec>, + cpu_stats_cache: CpuStatsCache, + batch_recv_len: usize, + + last_gc_time: Instant, + gc_duration: Duration, +} + +impl AutoSplitControllerContext { + pub fn new(batch_recv_len: usize) -> Self { + AutoSplitControllerContext { + read_stats_vec: Vec::default(), + cpu_stats_vec: Vec::default(), + cpu_stats_cache: CpuStatsCache::default(), + batch_recv_len, + last_gc_time: Instant::now_coarse(), + // 30 seconds is a balance between efficient memory usage and + // maintaining performance under load. 
+ gc_duration: Duration::from_secs(30), + } + } + + pub fn batch_recv_read_stats( + &mut self, + read_stats_receiver: &Receiver, + ) -> &mut Vec { + self.read_stats_vec.clear(); + + while let Ok(read_stats) = read_stats_receiver.try_recv() { + self.read_stats_vec.push(read_stats); + if self.read_stats_vec.len() == self.batch_recv_len { + break; + } + } + &mut self.read_stats_vec + } + + pub fn batch_recv_cpu_stats( + &mut self, + cpu_stats_receiver: &Receiver>, + ) -> (&mut Vec>, &mut CpuStatsCache) { + self.cpu_stats_vec.clear(); + self.cpu_stats_cache.region_cpu_map.clear(); + self.cpu_stats_cache.hottest_key_range_cpu_time_map.clear(); + + while let Ok(cpu_stats) = cpu_stats_receiver.try_recv() { + self.cpu_stats_vec.push(cpu_stats); + if self.cpu_stats_vec.len() == self.batch_recv_len { + break; + } + } + (&mut self.cpu_stats_vec, &mut self.cpu_stats_cache) + } + + pub fn empty_region_cpu_map(&mut self) -> &HashMap)> { + self.cpu_stats_cache.region_cpu_map.clear(); + &self.cpu_stats_cache.region_cpu_map + } + + pub fn maybe_gc(&mut self) { + let now = Instant::now_coarse(); + if now.saturating_duration_since(self.last_gc_time) > self.gc_duration { + self.read_stats_vec = Vec::default(); + self.cpu_stats_vec = Vec::default(); + self.cpu_stats_cache = CpuStatsCache::default(); + + self.last_gc_time = now; + } + } +} + #[cfg(test)] mod tests { + use std::sync::mpsc::{self, TryRecvError}; + use online_config::{ConfigChange, ConfigManager, ConfigValue}; use resource_metering::{RawRecord, TagInfos}; - use tikv_util::config::VersionTrack; + use tikv_util::config::{ReadableSize, VersionTrack}; use txn_types::Key; use super::*; use crate::store::worker::split_config::{ - DEFAULT_SAMPLE_NUM, REGION_CPU_OVERLOAD_THRESHOLD_RATIO, + BIG_REGION_CPU_OVERLOAD_THRESHOLD_RATIO, DEFAULT_SAMPLE_NUM, }; enum Position { @@ -1190,15 +1286,45 @@ mod tests { fail::remove("mock_region_is_busy"); } + fn new_auto_split_controller_ctx( + read_stats: Vec, + cpu_stats: Vec>, + ) -> ( + AutoSplitControllerContext, + Receiver, + Receiver>, + ) { + let len = std::cmp::max(read_stats.len(), cpu_stats.len()); + let (read_stats_sender, read_stats_receiver) = mpsc::sync_channel(len); + let (cpu_stats_sender, cpu_stats_receiver) = mpsc::sync_channel(len); + for s in cpu_stats { + cpu_stats_sender.try_send(s).unwrap(); + } + for s in read_stats { + read_stats_sender.try_send(s).unwrap(); + } + ( + AutoSplitControllerContext::new(len), + read_stats_receiver, + cpu_stats_receiver, + ) + } + fn check_split_key(mode: &[u8], qps_stats: Vec, split_keys: Vec<&[u8]>) { let mode = String::from_utf8(Vec::from(mode)).unwrap(); let mut hub = AutoSplitController::default(); - hub.cfg.qps_threshold = 1; + hub.cfg.qps_threshold = Some(1); hub.cfg.sample_threshold = 0; for i in 0..10 { - let (_, split_infos) = - hub.flush(qps_stats.clone(), vec![], &ThreadInfoStatistics::default()); + let (mut ctx, read_stats_receiver, cpu_stats_receiver) = + new_auto_split_controller_ctx(qps_stats.clone(), vec![]); + let (_, split_infos) = hub.flush( + &mut ctx, + &read_stats_receiver, + &cpu_stats_receiver, + &mut ThreadInfoStatistics::default(), + ); if (i + 1) % hub.cfg.detect_times != 0 { continue; } @@ -1226,14 +1352,17 @@ mod tests { ) { let mode = String::from_utf8(Vec::from(mode)).unwrap(); let mut hub = AutoSplitController::default(); - hub.cfg.qps_threshold = 1; + hub.cfg.qps_threshold = Some(1); hub.cfg.sample_threshold = 0; for i in 0..10 { + let (mut ctx, read_stats_receiver, cpu_stats_receiver) = + 
new_auto_split_controller_ctx(qps_stats.clone(), cpu_stats.clone()); let (_, split_infos) = hub.flush( - qps_stats.clone(), - cpu_stats.clone(), - &ThreadInfoStatistics::default(), + &mut ctx, + &read_stats_receiver, + &cpu_stats_receiver, + &mut ThreadInfoStatistics::default(), ); if (i + 1) % hub.cfg.detect_times != 0 { continue; @@ -1291,7 +1420,7 @@ mod tests { #[test] fn test_sample_key_num() { let mut hub = AutoSplitController::default(); - hub.cfg.qps_threshold = 2000; + hub.cfg.qps_threshold = Some(2000); hub.cfg.sample_num = 2000; hub.cfg.sample_threshold = 0; @@ -1318,7 +1447,15 @@ mod tests { ); } qps_stats_vec.push(qps_stats); - hub.flush(qps_stats_vec, vec![], &ThreadInfoStatistics::default()); + + let (mut ctx, read_stats_receiver, cpu_stats_receiver) = + new_auto_split_controller_ctx(qps_stats_vec.clone(), vec![]); + hub.flush( + &mut ctx, + &read_stats_receiver, + &cpu_stats_receiver, + &mut ThreadInfoStatistics::default(), + ); } // Test the empty key ranges. @@ -1331,7 +1468,15 @@ mod tests { qps_stats.add_query_num(1, &Peer::default(), KeyRange::default(), QueryKind::Get); } qps_stats_vec.push(qps_stats); - hub.flush(qps_stats_vec, vec![], &ThreadInfoStatistics::default()); + + let (mut ctx, read_stats_receiver, cpu_stats_receiver) = + new_auto_split_controller_ctx(qps_stats_vec, vec![]); + hub.flush( + &mut ctx, + &read_stats_receiver, + &cpu_stats_receiver, + &mut ThreadInfoStatistics::default(), + ); } fn check_sample_length(key_ranges: Vec>) { @@ -1608,7 +1753,8 @@ mod tests { #[test] fn test_refresh_and_check_cfg() { - let split_config = SplitConfig::default(); + let mut split_config = SplitConfig::default(); + split_config.optimize_for(ReadableSize::mb(5000)); let mut split_cfg_manager = SplitConfigManager::new(Arc::new(VersionTrack::new(split_config))); let mut auto_split_controller = @@ -1620,8 +1766,8 @@ mod tests { assert_eq!( auto_split_controller .cfg - .region_cpu_overload_threshold_ratio, - REGION_CPU_OVERLOAD_THRESHOLD_RATIO + .region_cpu_overload_threshold_ratio(), + BIG_REGION_CPU_OVERLOAD_THRESHOLD_RATIO ); // Set to zero. 
dispatch_split_cfg_change( @@ -1636,7 +1782,7 @@ mod tests { assert_eq!( auto_split_controller .cfg - .region_cpu_overload_threshold_ratio, + .region_cpu_overload_threshold_ratio(), 0.0 ); assert_eq!( @@ -1647,7 +1793,7 @@ mod tests { dispatch_split_cfg_change( &mut split_cfg_manager, "region_cpu_overload_threshold_ratio", - ConfigValue::F64(REGION_CPU_OVERLOAD_THRESHOLD_RATIO), + ConfigValue::F64(0.1), ); assert_eq!( auto_split_controller.refresh_and_check_cfg(), @@ -1656,8 +1802,8 @@ mod tests { assert_eq!( auto_split_controller .cfg - .region_cpu_overload_threshold_ratio, - REGION_CPU_OVERLOAD_THRESHOLD_RATIO + .region_cpu_overload_threshold_ratio(), + 0.1 ); assert_eq!( auto_split_controller.refresh_and_check_cfg(), @@ -1677,8 +1823,10 @@ mod tests { #[test] fn test_collect_cpu_stats() { - let auto_split_controller = AutoSplitController::default(); - let region_cpu_map = auto_split_controller.collect_cpu_stats(vec![]); + let mut auto_split_controller = AutoSplitController::default(); + + let (mut ctx, _, cpu_stats_receiver) = new_auto_split_controller_ctx(vec![], vec![]); + let region_cpu_map = auto_split_controller.collect_cpu_stats(&mut ctx, &cpu_stats_receiver); assert!(region_cpu_map.is_empty()); let ab_key_range_tag = Arc::new(TagInfos { @@ -1766,8 +1914,11 @@ mod tests { write_keys: 0, }, ); + + let (mut ctx, _, cpu_stats_receiver) = + new_auto_split_controller_ctx(vec![], vec![Arc::new(raw_records)]); let region_cpu_map = - auto_split_controller.collect_cpu_stats(vec![Arc::new(raw_records)]); + auto_split_controller.collect_cpu_stats(&mut ctx, &cpu_stats_receiver); assert_eq!( region_cpu_map.len(), 1, @@ -1868,12 +2019,21 @@ mod tests { for _i in 0..10 { other_qps_stats.push(default_qps_stats()); } + let (read_stats_sender, read_stats_receiver) = mpsc::sync_channel(other_qps_stats.len()); + let (_, cpu_stats_receiver) = mpsc::sync_channel(other_qps_stats.len()); + let mut ctx = AutoSplitControllerContext::new(other_qps_stats.len()); + let mut threads = ThreadInfoStatistics::default(); + b.iter(|| { let mut hub = AutoSplitController::default(); + for s in other_qps_stats.clone() { + read_stats_sender.send(s).unwrap(); + } hub.flush( - other_qps_stats.clone(), - vec![], - &ThreadInfoStatistics::default(), + &mut ctx, + &read_stats_receiver, + &cpu_stats_receiver, + &mut threads, ); }); } @@ -1914,4 +2074,102 @@ mod tests { ); }); } + + #[test] + fn test_auto_split_controller_ctx_batch_recv() { + let batch_limit = 3; + let mut ctx = AutoSplitControllerContext::new(batch_limit); + for len in [0, 2, 3, 5, 6] { + let (read_stats_sender, read_stats_receiver) = mpsc::sync_channel(len); + let (cpu_stats_sender, cpu_stats_receiver) = mpsc::sync_channel(len); + + let read_stats = ReadStats::default(); + let cpu_stats = Arc::new(RawRecords::default()); + for _ in 0..len { + read_stats_sender.try_send(read_stats.clone()).unwrap(); + cpu_stats_sender.try_send(cpu_stats.clone()).unwrap(); + } + // If channel is full, should return error. 
+ assert!(read_stats_sender.try_send(read_stats.clone()).is_err()); + assert!(cpu_stats_sender.try_send(cpu_stats.clone()).is_err()); + loop { + let batch = ctx.batch_recv_read_stats(&read_stats_receiver); + if batch.is_empty() { + break; + } + assert!( + batch.len() == batch_limit || batch.len() == len % batch_limit, + "{:?}", + (len, batch.len()) + ); + } + assert_eq!( + read_stats_receiver.try_recv().unwrap_err(), + TryRecvError::Empty + ); + + loop { + let (batch, cache) = ctx.batch_recv_cpu_stats(&cpu_stats_receiver); + if batch.is_empty() { + break; + } + assert!( + batch.len() == batch_limit || batch.len() == len % batch_limit, + "{:?}", + (len, batch.len()) + ); + assert!(cache.region_cpu_map.is_empty()); + assert!(cache.hottest_key_range_cpu_time_map.is_empty()); + // The cache should be empty after the batch_recv_cpu_stats. + cache.region_cpu_map.insert(1, (0.0, None)); + cache.hottest_key_range_cpu_time_map.insert(1, 1); + } + assert_eq!( + read_stats_receiver.try_recv().unwrap_err(), + TryRecvError::Empty + ); + } + } + + #[test] + fn test_auto_split_controller_empty_region_cpu_map() { + let mut ctx = AutoSplitControllerContext::new(1); + ctx.cpu_stats_cache.region_cpu_map.insert(1, (0.0, None)); + assert!(ctx.empty_region_cpu_map().is_empty()); + } + + #[test] + fn test_auto_split_controller_empty_gc() { + let mut ctx = AutoSplitControllerContext::new(1); + ctx.cpu_stats_cache.region_cpu_map.insert(1, (0.0, None)); + ctx.cpu_stats_cache + .hottest_key_range_cpu_time_map + .insert(1, 1); + ctx.cpu_stats_vec.push(Arc::new(RawRecords::default())); + ctx.read_stats_vec.push(ReadStats::default()); + + ctx.last_gc_time = Instant::now_coarse(); + ctx.maybe_gc(); + + assert!(!ctx.cpu_stats_cache.region_cpu_map.is_empty()); + assert!( + !ctx.cpu_stats_cache + .hottest_key_range_cpu_time_map + .is_empty() + ); + assert!(!ctx.cpu_stats_vec.is_empty()); + assert!(!ctx.read_stats_vec.is_empty()); + + ctx.last_gc_time = Instant::now_coarse() - 2 * ctx.gc_duration; + ctx.maybe_gc(); + + assert!(ctx.cpu_stats_cache.region_cpu_map.is_empty()); + assert!( + ctx.cpu_stats_cache + .hottest_key_range_cpu_time_map + .is_empty() + ); + assert!(ctx.cpu_stats_vec.is_empty()); + assert!(ctx.read_stats_vec.is_empty()); + } } diff --git a/components/resolved_ts/src/advance.rs b/components/resolved_ts/src/advance.rs index 4428ed01a35..2667a43a304 100644 --- a/components/resolved_ts/src/advance.rs +++ b/components/resolved_ts/src/advance.rs @@ -43,7 +43,7 @@ use tokio::{ }; use txn_types::TimeStamp; -use crate::{endpoint::Task, metrics::*}; +use crate::{endpoint::Task, metrics::*, TsSource}; pub(crate) const DEFAULT_CHECK_LEADER_TIMEOUT_DURATION: Duration = Duration::from_secs(5); // 5s const DEFAULT_GRPC_GZIP_COMPRESSION_LEVEL: usize = 2; @@ -51,13 +51,12 @@ const DEFAULT_GRPC_MIN_MESSAGE_SIZE_TO_COMPRESS: usize = 4096; pub struct AdvanceTsWorker { pd_client: Arc, - advance_ts_interval: Duration, timer: SteadyTimer, worker: Runtime, scheduler: Scheduler, /// The concurrency manager for transactions. It's needed for CDC to check /// locks when calculating resolved_ts. 
- concurrency_manager: ConcurrencyManager, + pub(crate) concurrency_manager: ConcurrencyManager, // cache the last pd tso, used to approximate the next timestamp w/o an actual TSO RPC pub(crate) last_pd_tso: Arc>>, @@ -65,7 +64,6 @@ impl AdvanceTsWorker { pub fn new( - advance_ts_interval: Duration, pd_client: Arc, scheduler: Scheduler, concurrency_manager: ConcurrencyManager, @@ -81,7 +79,6 @@ impl AdvanceTsWorker { scheduler, pd_client, worker, - advance_ts_interval, timer: SteadyTimer::default(), concurrency_manager, last_pd_tso: Arc::new(std::sync::Mutex::new(None)), @@ -104,7 +101,7 @@ impl AdvanceTsWorker { let timeout = self.timer.delay(advance_ts_interval); let min_timeout = self.timer.delay(cmp::min( DEFAULT_CHECK_LEADER_TIMEOUT_DURATION, - self.advance_ts_interval, + advance_ts_interval, )); let last_pd_tso = self.last_pd_tso.clone(); @@ -114,23 +111,28 @@ impl AdvanceTsWorker { if let Ok(mut last_pd_tso) = last_pd_tso.try_lock() { *last_pd_tso = Some((min_ts, Instant::now())); } + let mut ts_source = TsSource::PdTso; // Sync with concurrency manager so that it can work correctly when // optimizations like async commit are enabled. // Note: This step must be done before scheduling `Task::MinTs` task, and the // resolver must be checked in or after `Task::MinTs`' execution. cm.update_max_ts(min_ts); - if let Some(min_mem_lock_ts) = cm.global_min_lock_ts() { + if let Some((min_mem_lock_ts, lock)) = cm.global_min_lock() { if min_mem_lock_ts < min_ts { min_ts = min_mem_lock_ts; + ts_source = TsSource::MemoryLock(lock); } } - let regions = leader_resolver.resolve(regions, min_ts).await; + let regions = leader_resolver + .resolve(regions, min_ts, Some(advance_ts_interval)) + .await; if !regions.is_empty() { if let Err(e) = scheduler.schedule(Task::ResolvedTsAdvanced { regions, ts: min_ts, + ts_source, }) { info!("failed to schedule advance event"; "err" => ?e); } @@ -164,10 +166,7 @@ pub struct LeadershipResolver { // store_id -> check leader request, record the request to each store. store_req_map: HashMap, - // region_id -> region, cache the information of regions. - region_map: HashMap>, - // region_id -> peers id, record the responses. - resp_map: HashMap>, + progresses: HashMap, checking_regions: HashSet, valid_regions: HashSet, @@ -193,8 +192,7 @@ impl LeadershipResolver { region_read_progress, store_req_map: HashMap::default(), - region_map: HashMap::default(), - resp_map: HashMap::default(), + progresses: HashMap::default(), valid_regions: HashSet::default(), checking_regions: HashSet::default(), last_gc_time: Instant::now_coarse(), @@ -206,8 +204,7 @@ impl LeadershipResolver { let now = Instant::now_coarse(); if now - self.last_gc_time > self.gc_interval { self.store_req_map = HashMap::default(); - self.region_map = HashMap::default(); - self.resp_map = HashMap::default(); + self.progresses = HashMap::default(); self.valid_regions = HashSet::default(); self.checking_regions = HashSet::default(); self.last_gc_time = now; @@ -219,10 +216,7 @@ impl LeadershipResolver { v.regions.clear(); v.ts = 0; } - for v in self.region_map.values_mut() { - v.clear(); - } - for v in self.resp_map.values_mut() { + for v in self.progresses.values_mut() { v.clear(); } self.checking_regions.clear(); @@ -233,7 +227,12 @@ impl LeadershipResolver { // This function broadcasts a special message to all stores and collects their // leader ids to confirm whether the current peer has a quorum which accepts its // leadership.
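// Only regions whose leadership is confirmed by a quorum are returned, and the // new `timeout` argument caps each per-store RPC (see `get_min_timeout` below).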
- pub async fn resolve(&mut self, regions: Vec, min_ts: TimeStamp) -> Vec { + pub async fn resolve( + &mut self, + regions: Vec, + min_ts: TimeStamp, + timeout: Option, + ) -> Vec { if regions.is_empty() { return regions; } @@ -249,8 +248,7 @@ impl LeadershipResolver { let store_id = self.store_id; let valid_regions = &mut self.valid_regions; - let region_map = &mut self.region_map; - let resp_map = &mut self.resp_map; + let progresses = &mut self.progresses; let store_req_map = &mut self.store_req_map; let checking_regions = &mut self.checking_regions; for region_id in &regions { @@ -272,13 +270,13 @@ } let leader_info = core.get_leader_info(); + let prog = progresses + .entry(*region_id) + .or_insert_with(|| RegionProgress::new(peer_list.len())); let mut unvotes = 0; for peer in peer_list { if peer.store_id == store_id && peer.id == leader_id { - resp_map - .entry(*region_id) - .or_insert_with(|| Vec::with_capacity(peer_list.len())) - .push(store_id); + prog.resps.push(store_id); } else { // It's still necessary to check leader on learners even if they don't vote // because performing stale read on learners requires it. @@ -296,15 +294,14 @@ } } } + // Check `region_has_quorum` here because `store_map` can be empty, // in which case `region_has_quorum` won't be called any more. - if unvotes == 0 && region_has_quorum(peer_list, &resp_map[region_id]) { + if unvotes == 0 && region_has_quorum(peer_list, &prog.resps) { + prog.resolved = true; valid_regions.insert(*region_id); } else { - region_map - .entry(*region_id) - .or_insert_with(|| Vec::with_capacity(peer_list.len())) - .extend_from_slice(peer_list); + prog.peers.extend_from_slice(peer_list); } } }); @@ -318,8 +315,9 @@ .values() .find(|req| !req.regions.is_empty()) .map_or(0, |req| req.regions[0].compute_size()); - let store_count = store_req_map.len(); let mut check_leader_rpcs = Vec::with_capacity(store_req_map.len()); + let timeout = get_min_timeout(timeout, DEFAULT_CHECK_LEADER_TIMEOUT_DURATION); + for (store_id, req) in store_req_map { if req.regions.is_empty() { continue; } @@ -334,9 +332,16 @@ let rpc = async move { PENDING_CHECK_LEADER_REQ_COUNT.inc(); defer!(PENDING_CHECK_LEADER_REQ_COUNT.dec()); - let client = get_tikv_client(to_store, pd_client, security_mgr, env, tikv_clients) - .await - .map_err(|e| (to_store, e.retryable(), format!("[get tikv client] {}", e)))?; + let client = get_tikv_client( + to_store, + pd_client, + security_mgr, + env, + tikv_clients, + timeout, + ) + .await + .map_err(|e| (to_store, e.retryable(), format!("[get tikv client] {}", e)))?; // Set min_ts in the request. req.set_ts(min_ts.into_inner()); @@ -367,7 +372,6 @@ PENDING_CHECK_LEADER_REQ_SENT_COUNT.inc(); defer!(PENDING_CHECK_LEADER_REQ_SENT_COUNT.dec()); - let timeout = DEFAULT_CHECK_LEADER_TIMEOUT_DURATION; let resp = tokio::time::timeout(timeout, rpc) .map_err(|e| (to_store, true, format!("[timeout] {}", e))) .await?
@@ -384,6 +388,7 @@ impl LeadershipResolver { .with_label_values(&["all"]) .observe(start.saturating_elapsed_secs()); }); + let rpc_count = check_leader_rpcs.len(); for _ in 0..rpc_count { // Use `select_all` to avoid the process getting blocked when some // TiKVs were down. let (res, _, remains) = select_all(check_leader_rpcs).await; check_leader_rpcs = remains; match res { Ok((to_store, resp)) => { for region_id in resp.regions { - resp_map - .entry(region_id) - .or_insert_with(|| Vec::with_capacity(store_count)) - .push(to_store); + if let Some(prog) = progresses.get_mut(&region_id) { + if prog.resolved { + continue; + } + prog.resps.push(to_store); + if region_has_quorum(&prog.peers, &prog.resps) { + prog.resolved = true; + valid_regions.insert(region_id); + } + } } } Err((to_store, reconnect, err)) => { @@ -406,24 +417,19 @@ } } } - } - for (region_id, prs) in region_map { - if prs.is_empty() { - // The peer had the leadership before, but now it's no longer - // the case. Skip checking the region. - continue; - } - if let Some(resp) = resp_map.get(region_id) { - if resp.is_empty() { - // No response, maybe the peer lost leadership. - continue; - } - if region_has_quorum(prs, resp) { - valid_regions.insert(*region_id); - } + if valid_regions.len() >= progresses.len() { + break; } } - self.valid_regions.drain().collect() + let res: Vec = self.valid_regions.drain().collect(); + if res.len() != checking_regions.len() { + warn!( + "check leader returns valid regions different from checking regions"; + "valid_regions" => res.len(), + "checking_regions" => checking_regions.len(), + ); + } + res } } @@ -460,6 +466,11 @@ where resps.into_iter().flatten().collect::>() } +#[inline] +fn get_min_timeout(timeout: Option, default: Duration) -> Duration { + timeout.unwrap_or(default).min(default) +} + fn region_has_quorum(peers: &[Peer], stores: &[u64]) -> bool { let mut voters = 0; let mut incoming_voters = 0; @@ -516,6 +527,7 @@ async fn get_tikv_client( security_mgr: &SecurityManager, env: Arc, tikv_clients: &Mutex>, + timeout: Duration, ) -> pd_client::Result { { let clients = tikv_clients.lock().await; @@ -523,7 +535,6 @@ async fn get_tikv_client( return Ok(client); } } - let timeout = DEFAULT_CHECK_LEADER_TIMEOUT_DURATION; let store = tokio::time::timeout(timeout, pd_client.get_store_async(store_id)) .await .map_err(|e| pd_client::Error::Other(Box::new(e))) @@ -549,6 +560,27 @@ async fn get_tikv_client( Ok(cli) } +struct RegionProgress { + resolved: bool, + peers: Vec, + resps: Vec, +} + +impl RegionProgress { + fn new(len: usize) -> Self { + RegionProgress { + resolved: false, + peers: Vec::with_capacity(len), + resps: Vec::with_capacity(len), + } + } + fn clear(&mut self) { + self.resolved = false; + self.peers.clear(); + self.resps.clear(); + } +} + #[cfg(test)] mod tests { use std::{ @@ -641,19 +673,45 @@ mod tests { .region_read_progress .insert(2, Arc::new(progress2)); - leader_resolver.resolve(vec![1, 2], TimeStamp::new(1)).await; + leader_resolver + .resolve(vec![1, 2], TimeStamp::new(1), None) + .await; let req = rx.recv_timeout(Duration::from_secs(1)).unwrap(); assert_eq!(req.regions.len(), 2); // Checking one region only sends 1 region in request. - leader_resolver.resolve(vec![1], TimeStamp::new(1)).await; + leader_resolver + .resolve(vec![1], TimeStamp::new(1), None) + .await; let req = rx.recv_timeout(Duration::from_secs(1)).unwrap(); assert_eq!(req.regions.len(), 1); // Checking zero region does not send request.
- leader_resolver.resolve(vec![], TimeStamp::new(1)).await; + leader_resolver + .resolve(vec![], TimeStamp::new(1), None) + .await; rx.recv_timeout(Duration::from_secs(1)).unwrap_err(); let _ = server.shutdown().await; } + + #[test] + fn test_get_min_timeout() { + assert_eq!( + get_min_timeout(None, Duration::from_secs(5)), + Duration::from_secs(5) + ); + assert_eq!( + get_min_timeout(None, Duration::from_secs(2)), + Duration::from_secs(2) + ); + assert_eq!( + get_min_timeout(Some(Duration::from_secs(1)), Duration::from_secs(5)), + Duration::from_secs(1) + ); + assert_eq!( + get_min_timeout(Some(Duration::from_secs(20)), Duration::from_secs(5)), + Duration::from_secs(5) + ); + } } diff --git a/components/resolved_ts/src/cmd.rs b/components/resolved_ts/src/cmd.rs index 47d14304112..ec88bf45149 100644 --- a/components/resolved_ts/src/cmd.rs +++ b/components/resolved_ts/src/cmd.rs @@ -143,7 +143,8 @@ pub(crate) fn decode_write(key: &[u8], value: &[u8], is_apply: bool) -> Option = engine .take_last_modifies() @@ -398,6 +405,26 @@ mod tests { commit_ts: None, write_type: WriteType::Rollback, }, + ChangeRow::Prewrite { + key: k1.clone(), + start_ts: 6.into(), + value: Some(b"v4".to_vec()), + lock_type: LockType::Put, + }, + ChangeRow::Commit { + key: k1.clone(), + start_ts: Some(6.into()), + commit_ts: Some(7.into()), + write_type: WriteType::Put, + }, + ChangeRow::Prewrite { + key: k1.clone(), + start_ts: 7.into(), + value: Some(b"v5".to_vec()), + lock_type: LockType::Put, + }, + // Rollback of the txn@start_ts=7 will be missing as overlapped rollback is not + // handled. ]; assert_eq!(rows, expected); diff --git a/components/resolved_ts/src/endpoint.rs b/components/resolved_ts/src/endpoint.rs index 5d0dbdcd689..6a227716a3d 100644 --- a/components/resolved_ts/src/endpoint.rs +++ b/components/resolved_ts/src/endpoint.rs @@ -1,20 +1,19 @@ // Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0.
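A minimal sketch of the incremental-quorum pattern that the `RegionProgress` refactor in advance.rs introduces: instead of reconciling the old `region_map`/`resp_map` pair after all RPCs finish, each store response is folded into one per-region record, a region is marked valid the moment a quorum confirms it, and the loop can stop early. A plain majority count stands in for TiKV's voter-aware `region_has_quorum`, and all names below are illustrative:

```rust
use std::collections::HashMap;

// Per-region check-leader progress (simplified: peers are bare store ids).
struct RegionProgress {
    resolved: bool,
    peers: Vec<u64>, // stores that must confirm our leadership
    resps: Vec<u64>, // stores that have confirmed it so far
}

// Simplified quorum: a strict majority of peers responded. The real
// region_has_quorum also weighs voter roles and joint-consensus configs.
fn has_quorum(peers: &[u64], resps: &[u64]) -> bool {
    let votes = peers.iter().filter(|p| resps.contains(*p)).count();
    votes > peers.len() / 2
}

// Fold one store's response (the region ids it confirmed) into the table,
// returning the regions that just reached quorum.
fn on_store_resp(
    progresses: &mut HashMap<u64, RegionProgress>,
    store_id: u64,
    confirmed: &[u64],
) -> Vec<u64> {
    let mut newly_resolved = Vec::new();
    for region_id in confirmed {
        if let Some(prog) = progresses.get_mut(region_id) {
            if prog.resolved {
                continue; // already has quorum; skip further bookkeeping
            }
            prog.resps.push(store_id);
            if has_quorum(&prog.peers, &prog.resps) {
                prog.resolved = true;
                newly_resolved.push(*region_id);
            }
        }
    }
    newly_resolved
}

fn main() {
    let mut progresses = HashMap::new();
    progresses.insert(
        1,
        RegionProgress { resolved: false, peers: vec![1, 2, 3], resps: vec![1] },
    );
    // Store 2's confirmation gives region 1 two of three votes: quorum reached.
    assert_eq!(on_store_resp(&mut progresses, 2, &[1]), vec![1]);
}
```

Because `resolved` flips exactly once per region, duplicate responses cost nothing, which is what lets the real loop break as soon as `valid_regions.len() >= progresses.len()`.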
use std::{ + cmp::min, collections::HashMap, fmt, marker::PhantomData, - sync::{ - atomic::{AtomicBool, Ordering}, - Arc, Mutex, - }, + sync::{Arc, Mutex, MutexGuard}, time::Duration, }; use concurrency_manager::ConcurrencyManager; use engine_traits::KvEngine; +use futures::channel::oneshot::{channel, Receiver, Sender}; use grpcio::Environment; -use kvproto::{metapb::Region, raft_cmdpb::AdminCmdType}; +use kvproto::{kvrpcpb::LeaderInfo, metapb::Region, raft_cmdpb::AdminCmdType}; use online_config::{self, ConfigChange, ConfigManager, OnlineConfig}; use pd_client::PdClient; use raftstore::{ @@ -22,38 +21,137 @@ use raftstore::{ router::CdcHandle, store::{ fsm::store::StoreRegionMeta, - util::{self, RegionReadProgress, RegionReadProgressRegistry}, + util::{ + self, ReadState, RegionReadProgress, RegionReadProgressCore, RegionReadProgressRegistry, + }, }, }; use security::SecurityManager; use tikv::config::ResolvedTsConfig; use tikv_util::{ + memory::{HeapSize, MemoryQuota}, warn, worker::{Runnable, RunnableWithTimer, Scheduler}, }; -use tokio::sync::Notify; +use tokio::sync::{Notify, Semaphore}; use txn_types::{Key, TimeStamp}; use crate::{ advance::{AdvanceTsWorker, LeadershipResolver, DEFAULT_CHECK_LEADER_TIMEOUT_DURATION}, cmd::{ChangeLog, ChangeRow}, metrics::*, - resolver::Resolver, - scanner::{ScanEntry, ScanMode, ScanTask, ScannerPool}, + resolver::{LastAttempt, Resolver}, + scanner::{ScanEntries, ScanTask, ScannerPool}, + Error, Result, TsSource, TxnLocks, ON_DROP_WARN_HEAP_SIZE, }; -/// grace period for logging safe-ts and resolved-ts gap in slow log +/// grace period for identifying slow resolved-ts and safe-ts. const SLOW_LOG_GRACE_PERIOD_MS: u64 = 1000; +const MEMORY_QUOTA_EXCEEDED_BACKOFF: Duration = Duration::from_secs(30); enum ResolverStatus { Pending { tracked_index: u64, locks: Vec, - cancelled: Arc, + cancelled: Option>, + memory_quota: Arc, }, Ready, } +impl Drop for ResolverStatus { + fn drop(&mut self) { + let ResolverStatus::Pending { + locks, + memory_quota, + .. + } = self else { + return; + }; + if locks.is_empty() { + return; + } + + // Free memory quota used by pending locks and unlocks. + let mut bytes = 0; + let num_locks = locks.len(); + for lock in locks { + bytes += lock.approximate_heap_size(); + } + if bytes > ON_DROP_WARN_HEAP_SIZE { + warn!("drop huge ResolverStatus"; + "bytes" => bytes, + "num_locks" => num_locks, + "memory_quota_in_use" => memory_quota.in_use(), + "memory_quota_capacity" => memory_quota.capacity(), + ); + } + memory_quota.free(bytes); + } +} + +impl ResolverStatus { + fn push_pending_lock(&mut self, lock: PendingLock, region_id: u64) -> Result<()> { + let ResolverStatus::Pending { + locks, + memory_quota, + .. + } = self else { + panic!("region {:?} resolver has ready", region_id) + }; + // Check if adding a new lock or unlock will exceed the memory + // quota. + memory_quota + .alloc(lock.approximate_heap_size()) + .map_err(|e| { + fail::fail_point!("resolved_ts_on_pending_locks_memory_quota_exceeded"); + Error::MemoryQuotaExceeded(e) + })?; + locks.push(lock); + Ok(()) + } + + fn update_tracked_index(&mut self, index: u64, region_id: u64) { + let ResolverStatus::Pending { + tracked_index, + ..
+ } = self else { + panic!("region {:?} resolver has ready", region_id) + }; + assert!( + *tracked_index < index, + "region {}, tracked_index: {}, incoming index: {}", + region_id, + *tracked_index, + index + ); + *tracked_index = index; + } + + fn drain_pending_locks( + &mut self, + region_id: u64, + ) -> (u64, impl Iterator + '_) { + let ResolverStatus::Pending { + locks, + memory_quota, + tracked_index, + .. + } = self else { + panic!("region {:?} resolver has ready", region_id) + }; + // Must take locks, otherwise it may double free memory quota on drop. + let locks = std::mem::take(locks); + ( + *tracked_index, + locks.into_iter().map(|lock| { + memory_quota.free(lock.approximate_heap_size()); + lock + }), + ) + } +} + #[allow(dead_code)] enum PendingLock { Track { @@ -67,6 +165,16 @@ enum PendingLock { }, } +impl HeapSize for PendingLock { + fn approximate_heap_size(&self) -> usize { + match self { + PendingLock::Track { key, .. } | PendingLock::Untrack { key, .. } => { + key.as_encoded().approximate_heap_size() + } + } + } +} + // Records information related to observed region. // observe_id is used for avoiding ABA problems in incremental scan task, // advance resolved ts task, and command observing. @@ -80,122 +188,121 @@ struct ObserveRegion { } impl ObserveRegion { - fn new(meta: Region, rrp: Arc) -> Self { + fn new( + meta: Region, + rrp: Arc, + memory_quota: Arc, + cancelled: Sender<()>, + ) -> Self { ObserveRegion { - resolver: Resolver::with_read_progress(meta.id, Some(rrp)), + resolver: Resolver::with_read_progress(meta.id, Some(rrp), memory_quota.clone()), meta, handle: ObserveHandle::new(), resolver_status: ResolverStatus::Pending { tracked_index: 0, locks: vec![], - cancelled: Arc::new(AtomicBool::new(false)), + cancelled: Some(cancelled), + memory_quota, }, } } - fn read_progress(&self) -> &RegionReadProgress { - self.resolver.read_progress.as_ref().unwrap() + fn read_progress(&self) -> &Arc { + self.resolver.read_progress().unwrap() } - fn track_change_log(&mut self, change_logs: &[ChangeLog]) -> std::result::Result<(), String> { - match &mut self.resolver_status { - ResolverStatus::Pending { - locks, - tracked_index, - .. - } => { - for log in change_logs { - match log { - ChangeLog::Error(e) => { - debug!( - "skip change log error"; - "region" => self.meta.id, - "error" => ?e, - ); - continue; - } - ChangeLog::Admin(req_type) => { - // TODO: for admin cmd that won't change the region meta like peer list - // and key range (i.e. `CompactLog`, `ComputeHash`) we may not need to - // return error - return Err(format!( - "region met admin command {:?} while initializing resolver", - req_type - )); - } - ChangeLog::Rows { rows, index } => { - rows.iter().for_each(|row| match row { - ChangeRow::Prewrite { key, start_ts, .. } => { - locks.push(PendingLock::Track { - key: key.clone(), - start_ts: *start_ts, - }) - } + fn track_change_log(&mut self, change_logs: &[ChangeLog]) -> Result<()> { + if matches!(self.resolver_status, ResolverStatus::Pending { .. }) { + for log in change_logs { + match log { + ChangeLog::Error(e) => { + debug!( + "skip change log error"; + "region" => self.meta.id, + "error" => ?e, + ); + continue; + } + ChangeLog::Admin(req_type) => { + // TODO: for admin cmd that won't change the region meta like peer list + // and key range (i.e. 
`CompactLog`, `ComputeHash`) we may not need to + // return error + return Err(box_err!( + "region met admin command {:?} while initializing resolver", + req_type + )); + } + ChangeLog::Rows { rows, index } => { + for row in rows { + let lock = match row { + ChangeRow::Prewrite { key, start_ts, .. } => PendingLock::Track { + key: key.clone(), + start_ts: *start_ts, + }, ChangeRow::Commit { key, start_ts, commit_ts, .. - } => locks.push(PendingLock::Untrack { + } => PendingLock::Untrack { key: key.clone(), start_ts: *start_ts, commit_ts: *commit_ts, - }), + }, // One pc command does not contain any lock, so just skip it - ChangeRow::OnePc { .. } => {} - ChangeRow::IngestSsT => {} - }); - assert!( - *tracked_index < *index, - "region {}, tracked_index: {}, incoming index: {}", - self.meta.id, - *tracked_index, - *index - ); - *tracked_index = *index; + ChangeRow::OnePc { .. } | ChangeRow::IngestSsT => continue, + }; + self.resolver_status.push_pending_lock(lock, self.meta.id)?; } + self.resolver_status + .update_tracked_index(*index, self.meta.id); } } } - ResolverStatus::Ready => { - for log in change_logs { - match log { - ChangeLog::Error(e) => { + } else { + for log in change_logs { + match log { + ChangeLog::Error(e) => { + debug!( + "skip change log error"; + "region" => self.meta.id, + "error" => ?e, + ); + continue; + } + ChangeLog::Admin(req_type) => match req_type { + AdminCmdType::Split + | AdminCmdType::BatchSplit + | AdminCmdType::PrepareMerge + | AdminCmdType::RollbackMerge + | AdminCmdType::CommitMerge => { + info!( + "region met split/merge command, stop tracking since key range changed, wait for re-register"; + "req_type" => ?req_type, + ); + // Stop tracking so that `tracked_index` larger than the split/merge + // command index won't be published until `RegionUpdate` event + // triggers the region re-register and re-scan the new key range + self.resolver.stop_tracking(); + } + _ => { debug!( - "skip change log error"; + "skip change log admin"; "region" => self.meta.id, - "error" => ?e, + "req_type" => ?req_type, ); - continue; } - ChangeLog::Admin(req_type) => match req_type { - AdminCmdType::Split - | AdminCmdType::BatchSplit - | AdminCmdType::PrepareMerge - | AdminCmdType::RollbackMerge - | AdminCmdType::CommitMerge => { - info!( - "region met split/merge command, stop tracking since key range changed, wait for re-register"; - "req_type" => ?req_type, - ); - // Stop tracking so that `tracked_index` larger than the split/merge - // command index won't be published until `RegionUpdate` event - // trigger the region re-register and re-scan the new key range - self.resolver.stop_tracking(); - } - _ => { - debug!( - "skip change log admin"; - "region" => self.meta.id, - "req_type" => ?req_type, - ); - } - }, - ChangeLog::Rows { rows, index } => { - rows.iter().for_each(|row| match row { - ChangeRow::Prewrite { key, start_ts, .. } => self - .resolver - .track_lock(*start_ts, key.to_raw().unwrap(), Some(*index)), + }, + ChangeLog::Rows { rows, index } => { + for row in rows { + match row { + ChangeRow::Prewrite { key, start_ts, .. } => { + self.resolver.track_lock( + *start_ts, + key.to_raw().unwrap(), + Some(*index), + )?; + } ChangeRow::Commit { key, ..
} => self .resolver .untrack_lock(&key.to_raw().unwrap(), Some(*index)), @@ -206,7 +313,7 @@ impl ObserveRegion { ChangeRow::IngestSsT => { self.resolver.update_tracked_index(*index); } - }); + } } } } @@ -215,73 +322,324 @@ impl ObserveRegion { Ok(()) } - fn track_scan_locks(&mut self, entries: Vec, apply_index: u64) { - for es in entries { - match es { - ScanEntry::Lock(locks) => { - if let ResolverStatus::Ready = self.resolver_status { - panic!("region {:?} resolver has ready", self.meta.id) - } - for (key, lock) in locks { - self.resolver - .track_lock(lock.ts, key.to_raw().unwrap(), Some(apply_index)); - } + /// Track locks in incoming scan entries. + fn track_scan_locks(&mut self, entries: ScanEntries, apply_index: u64) -> Result<()> { + match entries { + ScanEntries::Lock(locks) => { + if let ResolverStatus::Ready = self.resolver_status { + panic!("region {:?} resolver has ready", self.meta.id) } - ScanEntry::None => { - // Update the `tracked_index` to the snapshot's `apply_index` - self.resolver.update_tracked_index(apply_index); - let pending_tracked_index = - match std::mem::replace(&mut self.resolver_status, ResolverStatus::Ready) { - ResolverStatus::Pending { - locks, - tracked_index, - .. - } => { - locks.into_iter().for_each(|lock| match lock { - PendingLock::Track { key, start_ts } => { - self.resolver.track_lock( - start_ts, - key.to_raw().unwrap(), - Some(tracked_index), - ) - } - PendingLock::Untrack { key, .. } => self - .resolver - .untrack_lock(&key.to_raw().unwrap(), Some(tracked_index)), - }); - tracked_index - } - ResolverStatus::Ready => { - panic!("region {:?} resolver has ready", self.meta.id) - } - }; - info!( - "Resolver initialized"; - "region" => self.meta.id, - "observe_id" => ?self.handle.id, - "snapshot_index" => apply_index, - "pending_data_index" => pending_tracked_index, - ); + for (key, lock) in locks { + self.resolver + .track_lock(lock.ts, key.to_raw().unwrap(), Some(apply_index))?; + } + } + ScanEntries::None => { + // Update the `tracked_index` to the snapshot's `apply_index` + self.resolver.update_tracked_index(apply_index); + let mut resolver_status = + std::mem::replace(&mut self.resolver_status, ResolverStatus::Ready); + let (pending_tracked_index, pending_locks) = + resolver_status.drain_pending_locks(self.meta.id); + for lock in pending_locks { + match lock { + PendingLock::Track { key, start_ts } => { + self.resolver.track_lock( + start_ts, + key.to_raw().unwrap(), + Some(pending_tracked_index), + )?; + } + PendingLock::Untrack { key, .. 
} => self + .resolver + .untrack_lock(&key.to_raw().unwrap(), Some(pending_tracked_index)), + } } - ScanEntry::TxnEntry(_) => panic!("unexpected entry type"), + info!( + "Resolver initialized"; + "region" => self.meta.id, + "observe_id" => ?self.handle.id, + "snapshot_index" => apply_index, + "pending_data_index" => pending_tracked_index, + ); } } + Ok(()) } } pub struct Endpoint { store_id: Option, cfg: ResolvedTsConfig, + memory_quota: Arc, advance_notify: Arc, store_meta: Arc>, region_read_progress: RegionReadProgressRegistry, regions: HashMap, scanner_pool: ScannerPool, + scan_concurrency_semaphore: Arc, scheduler: Scheduler, advance_worker: AdvanceTsWorker, _phantom: PhantomData<(T, E)>, } +// methods that are used for metrics and logging +impl Endpoint +where + T: 'static + CdcHandle, + E: KvEngine, + S: StoreRegionMeta, +{ + fn collect_stats(&mut self) -> Stats { + fn is_leader(store_id: Option, leader_store_id: Option) -> bool { + store_id.is_some() && store_id == leader_store_id + } + + let store_id = self.get_or_init_store_id(); + let mut stats = Stats::default(); + self.region_read_progress.with(|registry| { + for (region_id, read_progress) in registry { + let (leader_info, leader_store_id) = read_progress.dump_leader_info(); + let core = read_progress.get_core(); + let resolved_ts = leader_info.get_read_state().get_safe_ts(); + let safe_ts = core.read_state().ts; + + if resolved_ts == 0 { + stats.zero_ts_count += 1; + continue; + } + + if is_leader(store_id, leader_store_id) { + // leader resolved-ts + if resolved_ts < stats.min_leader_resolved_ts.resolved_ts { + let resolver = self.regions.get_mut(region_id).map(|x| &mut x.resolver); + stats + .min_leader_resolved_ts + .set(*region_id, resolver, &core, &leader_info); + } + } else { + // follower safe-ts + if safe_ts > 0 && safe_ts < stats.min_follower_safe_ts.safe_ts { + stats.min_follower_safe_ts.set(*region_id, &core); + } + + // follower resolved-ts + if resolved_ts < stats.min_follower_resolved_ts.resolved_ts { + stats.min_follower_resolved_ts.set(*region_id, &core); + } + } + } + }); + + stats.resolver = self.collect_resolver_stats(); + stats.cm_min_lock = self.advance_worker.concurrency_manager.global_min_lock(); + stats + } + + fn collect_resolver_stats(&mut self) -> ResolverStats { + let mut stats = ResolverStats::default(); + for observed_region in self.regions.values() { + match &observed_region.resolver_status { + ResolverStatus::Pending { locks, .. } => { + for l in locks { + stats.heap_size += l.approximate_heap_size() as i64; + } + stats.unresolved_count += 1; + } + ResolverStatus::Ready { .. 
} => { + stats.heap_size += observed_region.resolver.approximate_heap_bytes() as i64; + stats.resolved_count += 1; + } + } + } + stats + } + + fn update_metrics(&self, stats: &Stats) { + let now = self.approximate_now_tso(); + // general + if stats.min_follower_resolved_ts.resolved_ts < stats.min_leader_resolved_ts.resolved_ts { + RTS_MIN_RESOLVED_TS.set(stats.min_follower_resolved_ts.resolved_ts as i64); + RTS_MIN_RESOLVED_TS_GAP.set(now.saturating_sub( + TimeStamp::from(stats.min_follower_resolved_ts.resolved_ts).physical(), + ) as i64); + RTS_MIN_RESOLVED_TS_REGION.set(stats.min_follower_resolved_ts.region_id as i64); + } else { + RTS_MIN_RESOLVED_TS.set(stats.min_leader_resolved_ts.resolved_ts as i64); + RTS_MIN_RESOLVED_TS_GAP.set(now.saturating_sub( + TimeStamp::from(stats.min_leader_resolved_ts.resolved_ts).physical(), + ) as i64); + RTS_MIN_RESOLVED_TS_REGION.set(stats.min_leader_resolved_ts.region_id as i64); + } + RTS_ZERO_RESOLVED_TS.set(stats.zero_ts_count); + + RTS_LOCK_HEAP_BYTES_GAUGE.set(stats.resolver.heap_size); + RTS_LOCK_QUOTA_IN_USE_BYTES_GAUGE.set(self.memory_quota.in_use() as i64); + RTS_REGION_RESOLVE_STATUS_GAUGE_VEC + .with_label_values(&["resolved"]) + .set(stats.resolver.resolved_count); + RTS_REGION_RESOLVE_STATUS_GAUGE_VEC + .with_label_values(&["unresolved"]) + .set(stats.resolver.unresolved_count); + + CONCURRENCY_MANAGER_MIN_LOCK_TS.set( + stats + .cm_min_lock + .clone() + .map(|(ts, _)| ts.into_inner()) + .unwrap_or_default() as i64, + ); + + // min follower safe ts + RTS_MIN_FOLLOWER_SAFE_TS_REGION.set(stats.min_follower_safe_ts.region_id as i64); + RTS_MIN_FOLLOWER_SAFE_TS.set(stats.min_follower_safe_ts.safe_ts as i64); + RTS_MIN_FOLLOWER_SAFE_TS_GAP.set( + now.saturating_sub(TimeStamp::from(stats.min_follower_safe_ts.safe_ts).physical()) + as i64, + ); + RTS_MIN_FOLLOWER_SAFE_TS_DURATION_TO_LAST_CONSUME_LEADER.set( + stats + .min_follower_safe_ts + .duration_to_last_consume_leader + .map(|x| x as i64) + .unwrap_or(-1), + ); + + // min leader resolved ts + RTS_MIN_LEADER_RESOLVED_TS.set(stats.min_leader_resolved_ts.resolved_ts as i64); + RTS_MIN_LEADER_RESOLVED_TS_REGION.set(stats.min_leader_resolved_ts.region_id as i64); + RTS_MIN_LEADER_RESOLVED_TS_REGION_MIN_LOCK_TS.set( + stats + .min_leader_resolved_ts + .min_lock + .as_ref() + .map(|(ts, _)| (*ts).into_inner() as i64) + .unwrap_or(-1), + ); + RTS_MIN_LEADER_RESOLVED_TS_GAP + .set(now.saturating_sub( + TimeStamp::from(stats.min_leader_resolved_ts.resolved_ts).physical(), + ) as i64); + RTS_MIN_LEADER_DUATION_TO_LAST_UPDATE_SAFE_TS.set( + stats + .min_leader_resolved_ts + .duration_to_last_update_ms + .map(|x| x as i64) + .unwrap_or(-1), + ); + + // min follower resolved ts + RTS_MIN_FOLLOWER_RESOLVED_TS.set(stats.min_follower_resolved_ts.resolved_ts as i64); + RTS_MIN_FOLLOWER_RESOLVED_TS_REGION.set(stats.min_follower_resolved_ts.region_id as i64); + RTS_MIN_FOLLOWER_RESOLVED_TS_GAP.set( + now.saturating_sub( + TimeStamp::from(stats.min_follower_resolved_ts.resolved_ts).physical(), + ) as i64, + ); + RTS_MIN_FOLLOWER_RESOLVED_TS_DURATION_TO_LAST_CONSUME_LEADER.set( + stats + .min_follower_resolved_ts + .duration_to_last_consume_leader + .map(|x| x as i64) + .unwrap_or(-1), + ); + } + + // Approximate a TSO from PD. It is better than local timestamp when clock skew + // exists. + // Returns the physical part. 
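+ // Falls back to the local clock when no PD TSO has been observed yet, or when + // the mutex is briefly contended, so the result may still carry local clock skew.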
+ fn approximate_now_tso(&self) -> u64 { + self.advance_worker + .last_pd_tso + .try_lock() + .map(|opt| { + opt.map(|(pd_ts, instant)| { + pd_ts.physical() + instant.saturating_elapsed().as_millis() as u64 + }) + .unwrap_or_else(|| TimeStamp::physical_now()) + }) + .unwrap_or_else(|_| TimeStamp::physical_now()) + } + + fn log_slow_regions(&self, stats: &Stats) { + let expected_interval = min( + self.cfg.advance_ts_interval.as_millis(), + DEFAULT_CHECK_LEADER_TIMEOUT_DURATION.as_millis() as u64, + ) + self.cfg.advance_ts_interval.as_millis(); + let leader_threshold = expected_interval + SLOW_LOG_GRACE_PERIOD_MS; + let follower_threshold = 2 * expected_interval + SLOW_LOG_GRACE_PERIOD_MS; + let now = self.approximate_now_tso(); + + // min leader resolved ts + let min_leader_resolved_ts_gap = now + .saturating_sub(TimeStamp::from(stats.min_leader_resolved_ts.resolved_ts).physical()); + if min_leader_resolved_ts_gap > leader_threshold { + info!( + "the max gap of leader resolved-ts is large"; + "region_id" => stats.min_leader_resolved_ts.region_id, + "gap" => format!("{}ms", min_leader_resolved_ts_gap), + "read_state" => ?stats.min_leader_resolved_ts.read_state, + "applied_index" => stats.min_leader_resolved_ts.applied_index, + "min_lock" => ?stats.min_leader_resolved_ts.min_lock, + "lock_num" => stats.min_leader_resolved_ts.lock_num, + "txn_num" => stats.min_leader_resolved_ts.txn_num, + "min_memory_lock" => ?stats.cm_min_lock, + "duration_to_last_update_safe_ts" => match stats.min_leader_resolved_ts.duration_to_last_update_ms { + Some(d) => format!("{}ms", d), + None => "none".to_owned(), + }, + "last_resolve_attempt" => &stats.min_leader_resolved_ts.last_resolve_attempt, + ); + } + + // min follower safe ts + let min_follower_safe_ts_gap = + now.saturating_sub(TimeStamp::from(stats.min_follower_safe_ts.safe_ts).physical()); + if min_follower_safe_ts_gap > follower_threshold { + info!( + "the max gap of follower safe-ts is large"; + "region_id" => stats.min_follower_safe_ts.region_id, + "gap" => format!("{}ms", min_follower_safe_ts_gap), + "safe_ts" => stats.min_follower_safe_ts.safe_ts, + "resolved_ts" => stats.min_follower_safe_ts.resolved_ts, + "duration_to_last_consume_leader" => match stats.min_follower_safe_ts.duration_to_last_consume_leader { + Some(d) => format!("{}ms", d), + None => "none".to_owned(), + }, + "applied_index" => stats.min_follower_safe_ts.applied_index, + "latest_candidate" => ?stats.min_follower_safe_ts.latest_candidate, + "oldest_candidate" => ?stats.min_follower_safe_ts.oldest_candidate, + ); + } + + // min follower resolved ts + let min_follower_resolved_ts_gap = now + .saturating_sub(TimeStamp::from(stats.min_follower_resolved_ts.resolved_ts).physical()); + if min_follower_resolved_ts_gap > follower_threshold { + if stats.min_follower_resolved_ts.region_id == stats.min_follower_safe_ts.region_id { + info!( + "the max gap of follower resolved-ts is large; it's the same region that has the min safe-ts" + ); + } else { + info!( + "the max gap of follower resolved-ts is large"; + "region_id" => stats.min_follower_resolved_ts.region_id, + "gap" => format!("{}ms", min_follower_resolved_ts_gap), + "safe_ts" => stats.min_follower_resolved_ts.safe_ts, + "resolved_ts" => stats.min_follower_resolved_ts.resolved_ts, + "duration_to_last_consume_leader" => match stats.min_follower_resolved_ts.duration_to_last_consume_leader { + Some(d) => format!("{}ms", d), + None => "none".to_owned(), + }, + "applied_index" => stats.min_follower_resolved_ts.applied_index,
"latest_candidate" => ?stats.min_follower_resolved_ts.latest_candidate, + "oldest_candidate" => ?stats.min_follower_resolved_ts.oldest_candidate, + ); + } + } + } +} + impl Endpoint where T: 'static + CdcHandle, @@ -302,12 +660,8 @@ where let meta = store_meta.lock().unwrap(); (meta.region_read_progress().clone(), meta.store_id()) }; - let advance_worker = AdvanceTsWorker::new( - cfg.advance_ts_interval.0, - pd_client.clone(), - scheduler.clone(), - concurrency_manager, - ); + let advance_worker = + AdvanceTsWorker::new(pd_client.clone(), scheduler.clone(), concurrency_manager); let scanner_pool = ScannerPool::new(cfg.scan_lock_pool_size, cdc_handle); let store_resolver_gc_interval = Duration::from_secs(60); let leader_resolver = LeadershipResolver::new( @@ -318,15 +672,18 @@ where region_read_progress.clone(), store_resolver_gc_interval, ); + let scan_concurrency_semaphore = Arc::new(Semaphore::new(cfg.incremental_scan_concurrency)); let ep = Self { store_id: Some(store_id), cfg: cfg.clone(), + memory_quota: Arc::new(MemoryQuota::new(cfg.memory_quota.0 as usize)), advance_notify: Arc::new(Notify::new()), scheduler, store_meta, region_read_progress, advance_worker, scanner_pool, + scan_concurrency_semaphore, regions: HashMap::default(), _phantom: PhantomData::default(), }; @@ -334,36 +691,31 @@ where ep } - fn register_region(&mut self, region: Region) { + fn register_region(&mut self, region: Region, backoff: Option) { let region_id = region.get_id(); assert!(self.regions.get(®ion_id).is_none()); - let observe_region = { - if let Some(read_progress) = self.region_read_progress.get(®ion_id) { - info!( - "register observe region"; - "region" => ?region - ); - ObserveRegion::new(region.clone(), read_progress) - } else { - warn!( - "try register unexit region"; - "region" => ?region, - ); - return; - } + let Some(read_progress) = self.region_read_progress.get(®ion_id) else { + warn!("try register nonexistent region"; "region" => ?region); + return; }; + info!("register observe region"; "region" => ?region); + let (cancelled_tx, cancelled_rx) = channel(); + let observe_region = ObserveRegion::new( + region.clone(), + read_progress, + self.memory_quota.clone(), + cancelled_tx, + ); let observe_handle = observe_region.handle.clone(); - let cancelled = match observe_region.resolver_status { - ResolverStatus::Pending { ref cancelled, .. 
} => cancelled.clone(), - ResolverStatus::Ready => panic!("resolved ts illeagal created observe region"), - }; observe_region .read_progress() .update_advance_resolved_ts_notify(self.advance_notify.clone()); self.regions.insert(region_id, observe_region); - let scan_task = self.build_scan_task(region, observe_handle, cancelled); - self.scanner_pool.spawn_task(scan_task); + let scan_task = self.build_scan_task(region, observe_handle, cancelled_rx, backoff); + let concurrency_semaphore = self.scan_concurrency_semaphore.clone(); + self.scanner_pool + .spawn_task(scan_task, concurrency_semaphore); RTS_SCAN_TASKS.with_label_values(&["total"]).inc(); } @@ -371,40 +723,17 @@ where &self, region: Region, observe_handle: ObserveHandle, - cancelled: Arc, + cancelled: Receiver<()>, + backoff: Option, ) -> ScanTask { let scheduler = self.scheduler.clone(); - let scheduler_error = self.scheduler.clone(); - let region_id = region.id; - let observe_id = observe_handle.id; ScanTask { handle: observe_handle, - tag: String::new(), - mode: ScanMode::LockOnly, region, checkpoint_ts: TimeStamp::zero(), - is_cancelled: Box::new(move || cancelled.load(Ordering::Acquire)), - send_entries: Box::new(move |entries, apply_index| { - scheduler - .schedule(Task::ScanLocks { - region_id, - observe_id, - entries, - apply_index, - }) - .unwrap_or_else(|e| warn!("schedule resolved ts task failed"; "err" => ?e)); - RTS_SCAN_TASKS.with_label_values(&["finish"]).inc(); - }), - on_error: Some(Box::new(move |observe_id, _region, e| { - scheduler_error - .schedule(Task::ReRegisterRegion { - region_id, - observe_id, - cause: format!("met error while handle scan task {:?}", e), - }) - .unwrap_or_else(|schedule_err| warn!("schedule re-register task failed"; "err" => ?schedule_err, "re_register_cause" => ?e)); - RTS_SCAN_TASKS.with_label_values(&["abort"]).inc(); - })), + backoff, + cancelled, + scheduler, } } @@ -412,7 +741,7 @@ where if let Some(observe_region) = self.regions.remove(&region_id) { let ObserveRegion { handle, - resolver_status, + mut resolver_status, .. } = observe_region; @@ -425,8 +754,11 @@ where // Stop observing data handle.stop_observing(); // Stop scanning data - if let ResolverStatus::Pending { cancelled, .. } = resolver_status { - cancelled.store(true, Ordering::Release); + if let ResolverStatus::Pending { + ref mut cancelled, .. + } = resolver_status + { + let _ = cancelled.take(); } } else { debug!("deregister unregister region"; "region_id" => region_id); @@ -448,7 +780,7 @@ where // the `Resolver`'s lock heap // - `PrepareMerge` and `RollbackMerge`, the key range is unchanged self.deregister_region(region_id); - self.register_region(incoming_region); + self.register_region(incoming_region, None); } } @@ -479,7 +811,13 @@ where } // Deregister current observed region and try to register it again.
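// A `backoff` delays the rescan (used after the memory quota is exhausted), and a // stale `observe_id` aborts the attempt because the region was already re-registered.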
- fn re_register_region(&mut self, region_id: u64, observe_id: ObserveId, cause: String) { + fn re_register_region( + &mut self, + region_id: u64, + observe_id: ObserveId, + cause: Error, + backoff: Option, + ) { if let Some(observe_region) = self.regions.get(&region_id) { if observe_region.handle.id != observe_id { warn!("resolved ts deregister region failed due to observe_id not match"); @@ -490,7 +828,7 @@ where "register region again"; "region_id" => region_id, "observe_id" => ?observe_id, - "cause" => cause + "cause" => ?cause ); self.deregister_region(region_id); let region; @@ -501,13 +839,18 @@ where None => return, } } - self.register_region(region); + self.register_region(region, backoff); } } // Update advanced resolved ts. // Must ensure all regions are leaders at the point of ts. - fn handle_resolved_ts_advanced(&mut self, regions: Vec, ts: TimeStamp) { + fn handle_resolved_ts_advanced( + &mut self, + regions: Vec, + ts: TimeStamp, + ts_source: TsSource, + ) { if regions.is_empty() { return; } @@ -515,7 +858,9 @@ where for region_id in regions.iter() { if let Some(observe_region) = self.regions.get_mut(region_id) { if let ResolverStatus::Ready = observe_region.resolver_status { - let _ = observe_region.resolver.resolve(ts, Some(now)); + let _ = observe_region + .resolver + .resolve(ts, Some(now), ts_source.clone()); } } } @@ -538,7 +883,11 @@ where let logs = ChangeLog::encode_change_log(region_id, batch); if let Err(e) = observe_region.track_change_log(&logs) { drop(observe_region); - self.re_register_region(region_id, observe_id, e); + let backoff = match e { + Error::MemoryQuotaExceeded(_) => Some(MEMORY_QUOTA_EXCEEDED_BACKOFF), + Error::Other(_) => None, + }; + self.re_register_region(region_id, observe_id, e, backoff); } } else { debug!("resolved ts CmdBatch discarded"; @@ -555,18 +904,26 @@ where &mut self, region_id: u64, observe_id: ObserveId, - entries: Vec, + entries: ScanEntries, apply_index: u64, ) { - match self.regions.get_mut(&region_id) { - Some(observe_region) => { - if observe_region.handle.id == observe_id { - observe_region.track_scan_locks(entries, apply_index); + let mut memory_quota_exceeded = None; + if let Some(observe_region) = self.regions.get_mut(&region_id) { + if observe_region.handle.id == observe_id { + if let Err(Error::MemoryQuotaExceeded(e)) = + observe_region.track_scan_locks(entries, apply_index) + { + memory_quota_exceeded = Some(Error::MemoryQuotaExceeded(e)); } } - None => { - debug!("scan locks region not exist"; "region_id" => region_id, "observe_id" => ?observe_id); - } + } else { + debug!("scan locks region not exist"; + "region_id" => region_id, + "observe_id" => ?observe_id); + } + if let Some(e) = memory_quota_exceeded { + let backoff = Some(MEMORY_QUOTA_EXCEEDED_BACKOFF); + self.re_register_region(region_id, observe_id, e, backoff); } } @@ -586,6 +943,10 @@ where warn!("resolved-ts config fails"; "error" => ?e); } else { self.advance_notify.notify_waiters(); + self.memory_quota + .set_capacity(self.cfg.memory_quota.0 as usize); + self.scan_concurrency_semaphore = + Arc::new(Semaphore::new(self.cfg.incremental_scan_concurrency)); info!( "resolved-ts config changed"; "prev" => prev, @@ -638,7 +999,7 @@ pub enum Task { ReRegisterRegion { region_id: u64, observe_id: ObserveId, - cause: String, + cause: Error, }, AdvanceResolvedTs { leader_resolver: LeadershipResolver, }, ResolvedTsAdvanced { regions: Vec, ts: TimeStamp, + ts_source: TsSource, }, ChangeLog { cmd_batch: Vec, },
ScanLocks { region_id: u64, observe_id: ObserveId, - entries: Vec, + entries: ScanEntries, apply_index: u64, }, ChangeConfig { @@ -700,10 +1062,12 @@ impl fmt::Debug for Task { Task::ResolvedTsAdvanced { ref regions, ref ts, + ref ts_source, } => de .field("name", &"advance_resolved_ts") .field("regions", &regions) .field("ts", &ts) + .field("ts_source", &ts_source.label()) .finish(), Task::ChangeLog { .. } => de.field("name", &"change_log").finish(), Task::ScanLocks { @@ -750,19 +1114,21 @@ where match task { Task::RegionDestroyed(region) => self.region_destroyed(region), Task::RegionUpdated(region) => self.region_updated(region), - Task::RegisterRegion { region } => self.register_region(region), + Task::RegisterRegion { region } => self.register_region(region, None), Task::DeRegisterRegion { region_id } => self.deregister_region(region_id), Task::ReRegisterRegion { region_id, observe_id, cause, - } => self.re_register_region(region_id, observe_id, cause), + } => self.re_register_region(region_id, observe_id, cause, None), Task::AdvanceResolvedTs { leader_resolver } => { self.handle_advance_resolved_ts(leader_resolver) } - Task::ResolvedTsAdvanced { regions, ts } => { - self.handle_resolved_ts_advanced(regions, ts) - } + Task::ResolvedTsAdvanced { + regions, + ts, + ts_source, + } => self.handle_resolved_ts_advanced(regions, ts, ts_source), Task::ChangeLog { cmd_batch } => self.handle_change_log(cmd_batch), Task::ScanLocks { region_id, @@ -798,6 +1164,130 @@ impl ConfigManager for ResolvedTsConfigManager { } } +#[derive(Default)] +struct Stats { + // stats for metrics + zero_ts_count: i64, + min_leader_resolved_ts: LeaderStats, + min_follower_safe_ts: FollowerStats, + min_follower_resolved_ts: FollowerStats, + resolver: ResolverStats, + // we don't care about min_safe_ts_leader, because safe_ts should be equal to resolved_ts in + // leaders + // The min memory lock in concurrency manager.
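+ // `None` when the concurrency manager holds no memory locks; surfaced by + // `update_metrics` and `log_slow_regions` when diagnosing a stuck resolved-ts.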
+ cm_min_lock: Option<(TimeStamp, Key)>, +} + +struct LeaderStats { + region_id: u64, + resolved_ts: u64, + read_state: ReadState, + duration_to_last_update_ms: Option, + last_resolve_attempt: Option, + applied_index: u64, + // min lock in LOCK CF + min_lock: Option<(TimeStamp, TxnLocks)>, + lock_num: Option, + txn_num: Option, +} + +impl Default for LeaderStats { + fn default() -> Self { + Self { + region_id: 0, + resolved_ts: u64::MAX, + read_state: ReadState::default(), + duration_to_last_update_ms: None, + applied_index: 0, + last_resolve_attempt: None, + min_lock: None, + lock_num: None, + txn_num: None, + } + } +} + +impl LeaderStats { + fn set( + &mut self, + region_id: u64, + mut resolver: Option<&mut Resolver>, + region_read_progress: &MutexGuard<'_, RegionReadProgressCore>, + leader_info: &LeaderInfo, + ) { + *self = LeaderStats { + region_id, + resolved_ts: leader_info.get_read_state().get_safe_ts(), + read_state: region_read_progress.read_state().clone(), + duration_to_last_update_ms: region_read_progress + .last_instant_of_update_ts() + .map(|i| i.saturating_elapsed().as_millis() as u64), + last_resolve_attempt: resolver.as_mut().and_then(|r| r.take_last_attempt()), + min_lock: resolver + .as_ref() + .and_then(|r| r.oldest_transaction().map(|(t, tk)| (*t, tk.clone()))), + applied_index: region_read_progress.applied_index(), + lock_num: resolver.as_ref().map(|r| r.num_locks()), + txn_num: resolver.as_ref().map(|r| r.num_transactions()), + }; + } +} + +struct FollowerStats { + region_id: u64, + resolved_ts: u64, + safe_ts: u64, + latest_candidate: Option, + oldest_candidate: Option, + applied_index: u64, + duration_to_last_consume_leader: Option, +} + +impl Default for FollowerStats { + fn default() -> Self { + Self { + region_id: 0, + safe_ts: u64::MAX, + resolved_ts: u64::MAX, + latest_candidate: None, + oldest_candidate: None, + applied_index: 0, + duration_to_last_consume_leader: None, + } + } +} + +impl FollowerStats { + fn set( + &mut self, + region_id: u64, + region_read_progress: &MutexGuard<'_, RegionReadProgressCore>, + ) { + let read_state = region_read_progress.read_state(); + *self = FollowerStats { + region_id, + resolved_ts: region_read_progress + .get_leader_info() + .get_read_state() + .get_safe_ts(), + safe_ts: read_state.ts, + applied_index: region_read_progress.applied_index(), + latest_candidate: region_read_progress.pending_items().back().cloned(), + oldest_candidate: region_read_progress.pending_items().front().cloned(), + duration_to_last_consume_leader: region_read_progress + .last_instant_of_consume_leader() + .map(|i| i.saturating_elapsed().as_millis() as u64), + }; + } +} + +#[derive(Default)] +struct ResolverStats { + resolved_count: i64, + unresolved_count: i64, + heap_size: i64, +} + const METRICS_FLUSH_INTERVAL: u64 = 10_000; // 10s impl RunnableWithTimer for Endpoint @@ -807,138 +1297,9 @@ where S: StoreRegionMeta, { fn on_timeout(&mut self) { - let store_id = self.get_or_init_store_id(); - let (mut oldest_ts, mut oldest_region, mut zero_ts_count) = (u64::MAX, 0, 0); - let (mut oldest_leader_ts, mut oldest_leader_region) = (u64::MAX, 0); - let (mut oldest_safe_ts, mut oldest_safe_ts_region) = (u64::MAX, 0); - let mut oldest_duration_to_last_update_ms = 0; - let mut oldest_duration_to_last_consume_leader_ms = 0; - self.region_read_progress.with(|registry| { - for (region_id, read_progress) in registry { - let safe_ts = read_progress.safe_ts(); - if safe_ts > 0 && safe_ts < oldest_safe_ts { - oldest_safe_ts = safe_ts; - oldest_safe_ts_region = 
*region_id; - } - - let (leader_info, leader_store_id) = read_progress.dump_leader_info(); - // this is maximum resolved-ts pushed to region_read_progress, namely candidates - // of safe_ts. It may not be the safe_ts yet - let ts = leader_info.get_read_state().get_safe_ts(); - if ts == 0 { - zero_ts_count += 1; - continue; - } - if ts < oldest_ts { - oldest_ts = ts; - oldest_region = *region_id; - // use -1 to denote none. - oldest_duration_to_last_update_ms = read_progress - .get_core() - .last_instant_of_consume_leader() - .map(|t| t.saturating_elapsed().as_millis() as i64) - .unwrap_or(-1); - oldest_duration_to_last_consume_leader_ms = read_progress - .get_core() - .last_instant_of_consume_leader() - .map(|t| t.saturating_elapsed().as_millis() as i64) - .unwrap_or(-1); - } - - if let (Some(store_id), Some(leader_store_id)) = (store_id, leader_store_id) { - if leader_store_id == store_id && ts < oldest_leader_ts { - oldest_leader_ts = ts; - oldest_leader_region = *region_id; - } - } - } - }); - let mut lock_heap_size = 0; - let (mut resolved_count, mut unresolved_count) = (0, 0); - for observe_region in self.regions.values() { - match &observe_region.resolver_status { - ResolverStatus::Pending { locks, .. } => { - for l in locks { - match l { - PendingLock::Track { key, .. } => lock_heap_size += key.len(), - PendingLock::Untrack { key, .. } => lock_heap_size += key.len(), - } - } - unresolved_count += 1; - } - ResolverStatus::Ready { .. } => { - lock_heap_size += observe_region.resolver.size(); - resolved_count += 1; - } - } - } - // approximate a TSO from PD. It is better than local timestamp when clock skew - // exists. - let now: u64 = self - .advance_worker - .last_pd_tso - .try_lock() - .map(|opt| { - opt.map(|(pd_ts, instant)| { - pd_ts.physical() + instant.saturating_elapsed().as_millis() as u64 - }) - .unwrap_or_else(|| TimeStamp::physical_now()) - }) - .unwrap_or_else(|_| TimeStamp::physical_now()); - - RTS_MIN_SAFE_TS.set(oldest_safe_ts as i64); - RTS_MIN_SAFE_TS_REGION.set(oldest_safe_ts_region as i64); - let safe_ts_gap = now.saturating_sub(TimeStamp::from(oldest_safe_ts).physical()); - if safe_ts_gap - > self.cfg.advance_ts_interval.as_millis() - + DEFAULT_CHECK_LEADER_TIMEOUT_DURATION.as_millis() as u64 - + SLOW_LOG_GRACE_PERIOD_MS - { - let mut lock_num = None; - let mut min_start_ts = None; - if let Some(ob) = self.regions.get(&oldest_safe_ts_region) { - min_start_ts = ob - .resolver - .locks() - .keys() - .next() - .cloned() - .map(TimeStamp::into_inner); - lock_num = Some(ob.resolver.locks_by_key.len()); - } - info!( - "the max gap of safe-ts is large"; - "gap" => safe_ts_gap, - "oldest_safe_ts" => ?oldest_safe_ts, - "region_id" => oldest_safe_ts_region, - "advance_ts_interval" => ?self.cfg.advance_ts_interval, - "lock_num" => lock_num, - "min_start_ts" => min_start_ts, - ); - } - RTS_MIN_SAFE_TS_GAP.set(safe_ts_gap as i64); - RTS_MIN_SAFE_TS_DUATION_TO_UPDATE_SAFE_TS.set(oldest_duration_to_last_update_ms); - RTS_MIN_SAFE_TS_DURATION_TO_LAST_CONSUME_LEADER - .set(oldest_duration_to_last_consume_leader_ms); - - RTS_MIN_RESOLVED_TS_REGION.set(oldest_region as i64); - RTS_MIN_RESOLVED_TS.set(oldest_ts as i64); - RTS_ZERO_RESOLVED_TS.set(zero_ts_count as i64); - RTS_MIN_RESOLVED_TS_GAP - .set(now.saturating_sub(TimeStamp::from(oldest_ts).physical()) as i64); - - RTS_MIN_LEADER_RESOLVED_TS_REGION.set(oldest_leader_region as i64); - RTS_MIN_LEADER_RESOLVED_TS.set(oldest_leader_ts as i64); - RTS_MIN_LEADER_RESOLVED_TS_GAP - 
.set(now.saturating_sub(TimeStamp::from(oldest_leader_ts).physical()) as i64); - - RTS_LOCK_HEAP_BYTES_GAUGE.set(lock_heap_size as i64); - RTS_REGION_RESOLVE_STATUS_GAUGE_VEC - .with_label_values(&["resolved"]) - .set(resolved_count as _); - RTS_REGION_RESOLVE_STATUS_GAUGE_VEC - .with_label_values(&["unresolved"]) - .set(unresolved_count as _); + let stats = self.collect_stats(); + self.update_metrics(&stats); + self.log_slow_regions(&stats); } fn get_interval(&self) -> Duration { diff --git a/components/resolved_ts/src/errors.rs b/components/resolved_ts/src/errors.rs index d9845440c07..4e14c1d78d9 100644 --- a/components/resolved_ts/src/errors.rs +++ b/components/resolved_ts/src/errors.rs @@ -1,62 +1,14 @@ // Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. -use std::io::Error as IoError; - -use engine_traits::Error as EngineTraitsError; -use kvproto::errorpb::Error as ErrorHeader; -use raftstore::Error as RaftstoreError; use thiserror::Error; -use tikv::storage::{ - kv::{Error as KvError, ErrorInner as EngineErrorInner}, - mvcc::{Error as MvccError, ErrorInner as MvccErrorInner}, - txn::{Error as TxnError, ErrorInner as TxnErrorInner}, -}; -use txn_types::Error as TxnTypesError; +use tikv_util::memory::MemoryQuotaExceeded; #[derive(Debug, Error)] pub enum Error { - #[error("IO error {0}")] - Io(#[from] IoError), - #[error("Engine error {0}")] - Kv(#[from] KvError), - #[error("Transaction error {0}")] - Txn(#[from] TxnError), - #[error("Mvcc error {0}")] - Mvcc(#[from] MvccError), - #[error("Request error {0:?}")] - Request(Box), - #[error("Engine traits error {0}")] - EngineTraits(#[from] EngineTraitsError), - #[error("Txn types error {0}")] - TxnTypes(#[from] TxnTypesError), - #[error("Raftstore error {0}")] - Raftstore(#[from] RaftstoreError), + #[error("Memory quota exceeded")] + MemoryQuotaExceeded(#[from] MemoryQuotaExceeded), #[error("Other error {0}")] Other(#[from] Box), } -impl Error { - pub fn request(err: ErrorHeader) -> Error { - Error::Request(Box::new(err)) - } - - pub fn extract_error_header(self) -> ErrorHeader { - match self { - Error::Kv(KvError(box EngineErrorInner::Request(e))) - | Error::Txn(TxnError(box TxnErrorInner::Engine(KvError( - box EngineErrorInner::Request(e), - )))) - | Error::Txn(TxnError(box TxnErrorInner::Mvcc(MvccError(box MvccErrorInner::Kv( - KvError(box EngineErrorInner::Request(e)), - ))))) - | Error::Request(box e) => e, - other => { - let mut e = ErrorHeader::default(); - e.set_message(format!("{:?}", other)); - e - } - } - } -} - pub type Result = std::result::Result; diff --git a/components/resolved_ts/src/lib.rs b/components/resolved_ts/src/lib.rs index eef1211a580..f9eeb7c8b70 100644 --- a/components/resolved_ts/src/lib.rs +++ b/components/resolved_ts/src/lib.rs @@ -14,6 +14,7 @@ #![feature(box_patterns)] #![feature(result_flattening)] +#![feature(let_chains)] #[macro_use] extern crate tikv_util; diff --git a/components/resolved_ts/src/metrics.rs b/components/resolved_ts/src/metrics.rs index 15b3463f70e..fb751491d10 100644 --- a/components/resolved_ts/src/metrics.rs +++ b/components/resolved_ts/src/metrics.rs @@ -38,7 +38,7 @@ lazy_static! { .unwrap(); pub static ref RTS_MIN_RESOLVED_TS_GAP: IntGauge = register_int_gauge!( "tikv_resolved_ts_min_resolved_ts_gap_millis", - "The minimal (non-zero) resolved ts gap for observed regions" + "The gap between now() and the minimal (non-zero) resolved ts" ) .unwrap(); pub static ref RTS_RESOLVED_FAIL_ADVANCE_VEC: IntCounterVec = register_int_counter_vec!( @@ -69,29 +69,29 @@ lazy_static! 
{ "The minimal (non-zero) resolved ts for observed regions" ) .unwrap(); - pub static ref RTS_MIN_SAFE_TS_REGION: IntGauge = register_int_gauge!( - "tikv_resolved_ts_min_safe_ts_region", - "The region which has minimal safe ts" + pub static ref RTS_MIN_FOLLOWER_SAFE_TS_REGION: IntGauge = register_int_gauge!( + "tikv_resolved_ts_min_follower_safe_ts_region", + "The region id of the follower that has minimal safe ts" ) .unwrap(); - pub static ref RTS_MIN_SAFE_TS: IntGauge = register_int_gauge!( - "tikv_resolved_ts_min_safe_ts", - "The minimal (non-zero) safe ts for observed regions" + pub static ref RTS_MIN_FOLLOWER_SAFE_TS: IntGauge = register_int_gauge!( + "tikv_resolved_ts_min_follower_safe_ts", + "The minimal (non-zero) safe ts for followers" ) .unwrap(); - pub static ref RTS_MIN_SAFE_TS_GAP: IntGauge = register_int_gauge!( - "tikv_resolved_ts_min_safe_ts_gap_millis", - "The minimal (non-zero) safe ts gap for observed regions" + pub static ref RTS_MIN_FOLLOWER_SAFE_TS_GAP: IntGauge = register_int_gauge!( + "tikv_resolved_ts_min_follower_safe_ts_gap_millis", + "The gap between now() and the minimal (non-zero) safe ts for followers" ) .unwrap(); - pub static ref RTS_MIN_SAFE_TS_DUATION_TO_UPDATE_SAFE_TS: IntGauge = register_int_gauge!( - "tikv_resolved_ts_min_safe_ts_duration_to_update_safe_ts", - "The duration since last update_safe_ts() called by resolved-ts routine. -1 denotes None." + pub static ref RTS_MIN_LEADER_DUATION_TO_LAST_UPDATE_SAFE_TS: IntGauge = register_int_gauge!( + "tikv_resolved_ts_leader_min_resolved_ts_duration_to_last_update_safe_ts", + "The duration since last update_safe_ts() called by resolved-ts routine in the leader with min resolved ts. -1 denotes None." ) .unwrap(); - pub static ref RTS_MIN_SAFE_TS_DURATION_TO_LAST_CONSUME_LEADER: IntGauge = register_int_gauge!( - "tikv_resolved_ts_min_safe_ts_duration_to_last_consume_leader", - "The duration since last check_leader(). -1 denotes None." + pub static ref RTS_MIN_FOLLOWER_SAFE_TS_DURATION_TO_LAST_CONSUME_LEADER: IntGauge = register_int_gauge!( + "tikv_resolved_ts_min_follower_safe_ts_duration_to_last_consume_leader", + "The duration since last check_leader() in the follower region with min safe ts. -1 denotes None." ) .unwrap(); pub static ref RTS_ZERO_RESOLVED_TS: IntGauge = register_int_gauge!( @@ -104,6 +104,11 @@ lazy_static! { "Total bytes in memory of resolved-ts observed regions's lock heap" ) .unwrap(); + pub static ref RTS_LOCK_QUOTA_IN_USE_BYTES_GAUGE: IntGauge = register_int_gauge!( + "tikv_resolved_ts_memory_quota_in_use_bytes", + "Total bytes in memory of resolved-ts observed regions's lock heap" + ) + .unwrap(); pub static ref RTS_REGION_RESOLVE_STATUS_GAUGE_VEC: IntGaugeVec = register_int_gauge_vec!( "tikv_resolved_ts_region_resolve_status", "The status of resolved-ts observed regions", @@ -125,7 +130,17 @@ lazy_static! { .unwrap(); pub static ref RTS_MIN_LEADER_RESOLVED_TS_REGION: IntGauge = register_int_gauge!( "tikv_resolved_ts_min_leader_resolved_ts_region", - "The region which its leader peer has minimal resolved ts" + "The region whose leader peer has minimal resolved ts" + ) + .unwrap(); + pub static ref RTS_MIN_LEADER_RESOLVED_TS_REGION_MIN_LOCK_TS: IntGauge = register_int_gauge!( + "tikv_resolved_ts_min_leader_resolved_ts_region_min_lock_ts", + "The minimal lock ts for the region whose leader peer has minimal resolved ts. 0 means no lock. -1 means no region found." 
+ ) + .unwrap(); + pub static ref CONCURRENCY_MANAGER_MIN_LOCK_TS: IntGauge = register_int_gauge!( + "tikv_concurrency_manager_min_lock_ts", + "The minimal lock ts in concurrency manager. 0 means no lock." ) .unwrap(); pub static ref RTS_MIN_LEADER_RESOLVED_TS: IntGauge = register_int_gauge!( @@ -135,7 +150,35 @@ lazy_static! { .unwrap(); pub static ref RTS_MIN_LEADER_RESOLVED_TS_GAP: IntGauge = register_int_gauge!( "tikv_resolved_ts_min_leader_resolved_ts_gap_millis", - "The minimal (non-zero) resolved ts gap for observe leader peers" + "The gap between now() and the minimal (non-zero) resolved ts for leader peers" + ) + .unwrap(); + + // for min_follower_resolved_ts + pub static ref RTS_MIN_FOLLOWER_RESOLVED_TS_REGION: IntGauge = register_int_gauge!( + "tikv_resolved_ts_min_follower_resolved_ts_region", + "The region id of the follower has minimal resolved ts" + ) + .unwrap(); + pub static ref RTS_MIN_FOLLOWER_RESOLVED_TS: IntGauge = register_int_gauge!( + "tikv_resolved_ts_min_follower_resolved_ts", + "The minimal (non-zero) resolved ts for follower regions" + ) + .unwrap(); + pub static ref RTS_MIN_FOLLOWER_RESOLVED_TS_GAP: IntGauge = register_int_gauge!( + "tikv_resolved_ts_min_follower_resolved_ts_gap_millis", + "The max gap of now() and the minimal (non-zero) resolved ts for follower regions" + ) + .unwrap(); + pub static ref RTS_MIN_FOLLOWER_RESOLVED_TS_DURATION_TO_LAST_CONSUME_LEADER: IntGauge = register_int_gauge!( + "tikv_resolved_ts_min_follower_resolved_ts_duration_to_last_consume_leader", + "The duration since last check_leader() in the follower region with min resolved ts. -1 denotes None." + ) + .unwrap(); + pub static ref RTS_INITIAL_SCAN_BACKOFF_DURATION_HISTOGRAM: Histogram = register_histogram!( + "tikv_resolved_ts_initial_scan_backoff_duration_seconds", + "Bucketed histogram of resolved-ts initial scan backoff duration", + exponential_buckets(0.1, 2.0, 16).unwrap(), ) .unwrap(); } diff --git a/components/resolved_ts/src/resolver.rs b/components/resolved_ts/src/resolver.rs index 799c5584723..2aec9c336cd 100644 --- a/components/resolved_ts/src/resolver.rs +++ b/components/resolved_ts/src/resolver.rs @@ -1,75 +1,195 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. -use std::{cmp, collections::BTreeMap, sync::Arc}; +use std::{cmp, collections::BTreeMap, sync::Arc, time::Duration}; -use collections::{HashMap, HashSet}; +use collections::{HashMap, HashMapEntry}; use raftstore::store::RegionReadProgress; -use tikv_util::time::Instant; -use txn_types::TimeStamp; +use tikv_util::{ + memory::{MemoryQuota, MemoryQuotaExceeded}, + time::Instant, +}; +use txn_types::{Key, TimeStamp}; + +use crate::metrics::*; + +pub const ON_DROP_WARN_HEAP_SIZE: usize = 64 * 1024 * 1024; // 64MB + +#[derive(Clone)] +pub enum TsSource { + // A lock in LOCK CF + Lock(TxnLocks), + // A memory lock in concurrency manager + MemoryLock(Key), + PdTso, + // The following sources can also come from PD or memory lock, but we care more about sources + // in resolved-ts. 
+ BackupStream, + Cdc, +} + +impl TsSource { + pub fn label(&self) -> &str { + match self { + TsSource::Lock(_) => "lock", + TsSource::MemoryLock(_) => "rts_cm_min_lock", + TsSource::PdTso => "pd_tso", + TsSource::BackupStream => "backup_stream", + TsSource::Cdc => "cdc", + } + } + + pub fn key(&self) -> Option { + match self { + TsSource::Lock(locks) => locks + .sample_lock + .as_ref() + .map(|k| Key::from_encoded_slice(k)), + TsSource::MemoryLock(k) => Some(k.clone()), + _ => None, + } + } +} -use crate::metrics::RTS_RESOLVED_FAIL_ADVANCE_VEC; +#[derive(Default, Clone, PartialEq, Eq)] +pub struct TxnLocks { + pub lock_count: usize, + // A sample key in a transaction. + pub sample_lock: Option>, +} -const MAX_NUMBER_OF_LOCKS_IN_LOG: usize = 10; +impl std::fmt::Debug for TxnLocks { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("TxnLocks") + .field("lock_count", &self.lock_count) + .field( + "sample_lock", + &self + .sample_lock + .as_ref() + .map(|k| log_wrappers::Value::key(k)), + ) + .finish() + } +} // Resolver resolves timestamps that guarantee no more commit will happen before // the timestamp. pub struct Resolver { region_id: u64, // key -> start_ts - pub(crate) locks_by_key: HashMap, TimeStamp>, + locks_by_key: HashMap, TimeStamp>, // start_ts -> locked keys. - lock_ts_heap: BTreeMap>>, + lock_ts_heap: BTreeMap, + // The last shrink time. + last_aggressive_shrink_time: Instant, // The timestamps that guarantees no more commit will happen before. resolved_ts: TimeStamp, // The highest index `Resolver` had been tracked tracked_index: u64, // The region read progress used to utilize `resolved_ts` to serve stale read request - pub(crate) read_progress: Option>, + read_progress: Option>, // The timestamps that advance the resolved_ts when there is no more write. min_ts: TimeStamp, // Whether the `Resolver` is stopped stopped: bool, + // The memory quota for the `Resolver` and its lock keys and timestamps. + memory_quota: Arc, + // The last attempt of resolve(), used for diagnosis. + last_attempt: Option, +} + +#[derive(Clone)] +pub(crate) struct LastAttempt { + success: bool, + ts: TimeStamp, + reason: TsSource, +} + +impl slog::Value for LastAttempt { + fn serialize( + &self, + _record: &slog::Record<'_>, + key: slog::Key, + serializer: &mut dyn slog::Serializer, + ) -> slog::Result { + serializer.emit_arguments( + key, + &format_args!( + "{{ success={}, ts={}, reason={}, key={:?} }}", + self.success, + self.ts, + self.reason.label(), + self.reason.key(), + ), + ) + } } impl std::fmt::Debug for Resolver { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - let far_lock = self.lock_ts_heap.iter().next(); + let far_lock = self.oldest_transaction(); let mut dt = f.debug_tuple("Resolver"); dt.field(&format_args!("region={}", self.region_id)); - if let Some((ts, keys)) = far_lock { + if let Some((ts, txn_locks)) = far_lock { + dt.field(&format_args!( + "oldest_lock_count={:?}", + txn_locks.lock_count + )); dt.field(&format_args!( - "far_lock={:?}", - keys.iter() - // We must use Display format here or the redact won't take effect. - .map(|k| format!("{}", log_wrappers::Value::key(k))) - .collect::>() + "oldest_lock_sample={:?}", + txn_locks.sample_lock )); - dt.field(&format_args!("far_lock_ts={:?}", ts)); + dt.field(&format_args!("oldest_lock_ts={:?}", ts)); } dt.finish() } } +impl Drop for Resolver { + fn drop(&mut self) { + // Free memory quota used by locks_by_key. 
+ let mut bytes = 0; + let num_locks = self.num_locks(); + for key in self.locks_by_key.keys() { + bytes += self.lock_heap_size(key); + } + if bytes > ON_DROP_WARN_HEAP_SIZE { + warn!("drop huge resolver"; + "region_id" => self.region_id, + "bytes" => bytes, + "num_locks" => num_locks, + "memory_quota_in_use" => self.memory_quota.in_use(), + "memory_quota_capacity" => self.memory_quota.capacity(), + ); + } + self.memory_quota.free(bytes); + } +} + impl Resolver { - pub fn new(region_id: u64) -> Resolver { - Resolver::with_read_progress(region_id, None) + pub fn new(region_id: u64, memory_quota: Arc) -> Resolver { + Resolver::with_read_progress(region_id, None, memory_quota) } pub fn with_read_progress( region_id: u64, read_progress: Option>, + memory_quota: Arc, ) -> Resolver { Resolver { region_id, resolved_ts: TimeStamp::zero(), locks_by_key: HashMap::default(), lock_ts_heap: BTreeMap::new(), + last_aggressive_shrink_time: Instant::now_coarse(), read_progress, tracked_index: 0, min_ts: TimeStamp::zero(), stopped: false, + memory_quota, + last_attempt: None, } } @@ -85,16 +205,7 @@ impl Resolver { self.stopped } - pub fn size(&self) -> usize { - self.locks_by_key.keys().map(|k| k.len()).sum::() - + self - .lock_ts_heap - .values() - .map(|h| h.iter().map(|k| k.len()).sum::()) - .sum::() - } - - pub fn locks(&self) -> &BTreeMap>> { + pub fn locks(&self) -> &BTreeMap { &self.lock_ts_heap } @@ -115,19 +226,92 @@ impl Resolver { self.tracked_index = index; } - pub fn track_lock(&mut self, start_ts: TimeStamp, key: Vec, index: Option) { + // Return an approximate heap memory usage in bytes. + pub fn approximate_heap_bytes(&self) -> usize { + if self.locks_by_key.is_empty() { + return 0; + } + + const SAMPLE_COUNT: usize = 8; + let mut key_count = 0; + let mut key_bytes = 0; + for key in self.locks_by_key.keys() { + key_count += 1; + key_bytes += key.len(); + if key_count >= SAMPLE_COUNT { + break; + } + } + self.locks_by_key.len() * (key_bytes / key_count + std::mem::size_of::()) + + self.lock_ts_heap.len() + * (std::mem::size_of::() + std::mem::size_of::()) + } + + fn lock_heap_size(&self, key: &[u8]) -> usize { + // A resolver has + // * locks_by_key: HashMap, TimeStamp> + // * lock_ts_heap: BTreeMap + // + // We only count memory used by locks_by_key. Because the majority of + // memory is consumed by keys, locks_by_key and lock_ts_heap shares + // the same Arc<[u8]>, so lock_ts_heap is negligible. Also, it's hard to + // track accurate memory usage of lock_ts_heap as a timestamp may have + // many keys. + std::mem::size_of_val(key) + std::mem::size_of::() + } + + fn shrink_ratio(&mut self, ratio: usize) { + // HashMap load factor is 87% approximately, leave some margin to avoid + // frequent rehash. 
+ // + // See https://github.com/rust-lang/hashbrown/blob/v0.14.0/src/raw/mod.rs#L208-L220 + const MIN_SHRINK_RATIO: usize = 2; + if self.locks_by_key.capacity() + > self.locks_by_key.len() * cmp::max(MIN_SHRINK_RATIO, ratio) + { + self.locks_by_key.shrink_to_fit(); + } + } + + pub fn track_lock( + &mut self, + start_ts: TimeStamp, + key: Vec, + index: Option, + ) -> Result<(), MemoryQuotaExceeded> { if let Some(index) = index { self.update_tracked_index(index); } + let bytes = self.lock_heap_size(&key); debug!( - "track lock {}@{}, region {}", + "track lock {}@{}", &log_wrappers::Value::key(&key), - start_ts, - self.region_id + start_ts; + "region_id" => self.region_id, + "memory_in_use" => self.memory_quota.in_use(), + "memory_capacity" => self.memory_quota.capacity(), + "key_heap_size" => bytes, ); + self.memory_quota.alloc(bytes)?; let key: Arc<[u8]> = key.into_boxed_slice().into(); - self.locks_by_key.insert(key.clone(), start_ts); - self.lock_ts_heap.entry(start_ts).or_default().insert(key); + match self.locks_by_key.entry(key) { + HashMapEntry::Occupied(_) => { + // Free memory quota because it's already in the map. + self.memory_quota.free(bytes); + } + HashMapEntry::Vacant(entry) => { + // Add lock count for the start ts. + let txn_locks = self.lock_ts_heap.entry(start_ts).or_insert_with(|| { + let mut txn_locks = TxnLocks::default(); + txn_locks.sample_lock = Some(entry.key().clone()); + txn_locks + }); + txn_locks.lock_count += 1; + + entry.insert(start_ts); + } + } + Ok(()) } pub fn untrack_lock(&mut self, key: &[u8], index: Option) { @@ -135,49 +319,92 @@ impl Resolver { self.update_tracked_index(index); } let start_ts = if let Some(start_ts) = self.locks_by_key.remove(key) { + let bytes = self.lock_heap_size(key); + self.memory_quota.free(bytes); start_ts } else { - debug!("untrack a lock that was not tracked before"; "key" => &log_wrappers::Value::key(key)); + debug!("untrack a lock that was not tracked before"; + "key" => &log_wrappers::Value::key(key), + "region_id" => self.region_id, + ); return; }; debug!( - "untrack lock {}@{}, region {}", + "untrack lock {}@{}", &log_wrappers::Value::key(key), - start_ts, - self.region_id, + start_ts; + "region_id" => self.region_id, + "memory_in_use" => self.memory_quota.in_use(), ); - let entry = self.lock_ts_heap.get_mut(&start_ts); - if let Some(locked_keys) = entry { - locked_keys.remove(key); - if locked_keys.is_empty() { + if let Some(txn_locks) = self.lock_ts_heap.get_mut(&start_ts) { + if txn_locks.lock_count > 0 { + txn_locks.lock_count -= 1; + } + if txn_locks.lock_count == 0 { self.lock_ts_heap.remove(&start_ts); } - } + }; + // Use a large ratio to amortize the cost of rehash. + let shrink_ratio = 8; + self.shrink_ratio(shrink_ratio); } /// Try to advance resolved ts. /// /// `min_ts` advances the resolver even if there is no write. /// Return None means the resolver is not initialized. - pub fn resolve(&mut self, min_ts: TimeStamp, now: Option) -> TimeStamp { + pub fn resolve( + &mut self, + min_ts: TimeStamp, + now: Option, + source: TsSource, + ) -> TimeStamp { + // Use a small ratio to shrink the memory usage aggressively. 
+ const AGGRESSIVE_SHRINK_RATIO: usize = 2; + const AGGRESSIVE_SHRINK_INTERVAL: Duration = Duration::from_secs(10); + if self.last_aggressive_shrink_time.saturating_elapsed() > AGGRESSIVE_SHRINK_INTERVAL { + self.shrink_ratio(AGGRESSIVE_SHRINK_RATIO); + self.last_aggressive_shrink_time = Instant::now_coarse(); + } + // The `Resolver` is stopped, not need to advance, just return the current // `resolved_ts` if self.stopped { return self.resolved_ts; } + // Find the min start ts. - let min_lock = self.lock_ts_heap.keys().next().cloned(); + let min_lock = self.oldest_transaction(); let has_lock = min_lock.is_some(); - let min_start_ts = min_lock.unwrap_or(min_ts); + let min_start_ts = min_lock.as_ref().map(|(ts, _)| **ts).unwrap_or(min_ts); // No more commit happens before the ts. let new_resolved_ts = cmp::min(min_start_ts, min_ts); + // reason is the min source of the new resolved ts. + let reason = match (min_lock, min_ts) { + (Some((lock_ts, txn_locks)), min_ts) if *lock_ts < min_ts => { + TsSource::Lock(txn_locks.clone()) + } + (Some(_), _) => source, + (None, _) => source, + }; + if self.resolved_ts >= new_resolved_ts { - let label = if has_lock { "has_lock" } else { "stale_ts" }; RTS_RESOLVED_FAIL_ADVANCE_VEC - .with_label_values(&[label]) + .with_label_values(&[reason.label()]) .inc(); + self.last_attempt = Some(LastAttempt { + success: false, + ts: new_resolved_ts, + reason, + }); + } else { + self.last_attempt = Some(LastAttempt { + success: true, + ts: new_resolved_ts, + reason, + }) } // Resolved ts never decrease. @@ -204,21 +431,16 @@ impl Resolver { pub(crate) fn log_locks(&self, min_start_ts: u64) { // log lock with the minimum start_ts >= min_start_ts - if let Some((start_ts, keys)) = self + if let Some((start_ts, txn_locks)) = self .lock_ts_heap .range(TimeStamp::new(min_start_ts)..) 
.next() { - let keys_for_log = keys - .iter() - .map(|key| log_wrappers::Value::key(key)) - .take(MAX_NUMBER_OF_LOCKS_IN_LOG) - .collect::>(); info!( "locks with the minimum start_ts in resolver"; "region_id" => self.region_id, "start_ts" => start_ts, - "sampled_keys" => ?keys_for_log, + "txn_locks" => ?txn_locks, ); } } @@ -230,6 +452,18 @@ impl Resolver { pub(crate) fn num_transactions(&self) -> u64 { self.lock_ts_heap.len() as u64 } + + pub(crate) fn read_progress(&self) -> Option<&Arc> { + self.read_progress.as_ref() + } + + pub(crate) fn oldest_transaction(&self) -> Option<(&TimeStamp, &TxnLocks)> { + self.lock_ts_heap.iter().next() + } + + pub(crate) fn take_last_attempt(&mut self) -> Option { + self.last_attempt.take() + } } #[cfg(test)] @@ -300,16 +534,19 @@ mod tests { ]; for (i, case) in cases.into_iter().enumerate() { - let mut resolver = Resolver::new(1); + let memory_quota = Arc::new(MemoryQuota::new(std::usize::MAX)); + let mut resolver = Resolver::new(1, memory_quota); for e in case.clone() { match e { Event::Lock(start_ts, key) => { - resolver.track_lock(start_ts.into(), key.into_raw().unwrap(), None) + resolver + .track_lock(start_ts.into(), key.into_raw().unwrap(), None) + .unwrap(); } Event::Unlock(key) => resolver.untrack_lock(&key.into_raw().unwrap(), None), Event::Resolve(min_ts, expect) => { assert_eq!( - resolver.resolve(min_ts.into(), None), + resolver.resolve(min_ts.into(), None, TsSource::PdTso), expect.into(), "case {}", i @@ -319,4 +556,158 @@ mod tests { } } } + + #[test] + fn test_memory_quota() { + let memory_quota = Arc::new(MemoryQuota::new(1024)); + let mut resolver = Resolver::new(1, memory_quota.clone()); + let mut key = vec![0; 77]; + let lock_size = resolver.lock_heap_size(&key); + let mut ts = TimeStamp::default(); + while resolver.track_lock(ts, key.clone(), None).is_ok() { + ts.incr(); + key[0..8].copy_from_slice(&ts.into_inner().to_be_bytes()); + } + let remain = 1024 % lock_size; + assert_eq!(memory_quota.in_use(), 1024 - remain); + + let mut ts = TimeStamp::default(); + for _ in 0..5 { + ts.incr(); + key[0..8].copy_from_slice(&ts.into_inner().to_be_bytes()); + resolver.untrack_lock(&key, None); + } + assert_eq!(memory_quota.in_use(), 1024 - 5 * lock_size - remain); + drop(resolver); + assert_eq!(memory_quota.in_use(), 0); + } + + #[test] + fn test_untrack_lock_shrink_ratio() { + let memory_quota = Arc::new(MemoryQuota::new(std::usize::MAX)); + let mut resolver = Resolver::new(1, memory_quota); + let mut key = vec![0; 16]; + let mut ts = TimeStamp::default(); + for _ in 0..1000 { + ts.incr(); + key[0..8].copy_from_slice(&ts.into_inner().to_be_bytes()); + let _ = resolver.track_lock(ts, key.clone(), None); + } + assert!( + resolver.locks_by_key.capacity() >= 1000, + "{}", + resolver.locks_by_key.capacity() + ); + + let mut ts = TimeStamp::default(); + for _ in 0..901 { + ts.incr(); + key[0..8].copy_from_slice(&ts.into_inner().to_be_bytes()); + resolver.untrack_lock(&key, None); + } + // shrink_to_fit may reserve some space in accordance with the resize + // policy, but it is expected to be less than 500. + assert!( + resolver.locks_by_key.capacity() < 500, + "{}, {}", + resolver.locks_by_key.capacity(), + resolver.locks_by_key.len(), + ); + + for _ in 0..99 { + ts.incr(); + key[0..8].copy_from_slice(&ts.into_inner().to_be_bytes()); + resolver.untrack_lock(&key, None); + } + assert!( + resolver.locks_by_key.capacity() < 100, + "{}, {}", + resolver.locks_by_key.capacity(), + resolver.locks_by_key.len(), + ); + + // Trigger aggressive shrink. 
+ resolver.last_aggressive_shrink_time = Instant::now_coarse() - Duration::from_secs(600); + resolver.resolve(TimeStamp::new(0), None, TsSource::PdTso); + assert!( + resolver.locks_by_key.capacity() == 0, + "{}, {}", + resolver.locks_by_key.capacity(), + resolver.locks_by_key.len(), + ); + } + + #[test] + fn test_idempotent_track_and_untrack_lock() { + let memory_quota = Arc::new(MemoryQuota::new(std::usize::MAX)); + let mut resolver = Resolver::new(1, memory_quota); + let mut key = vec![0; 16]; + + // track_lock + let mut ts = TimeStamp::default(); + for c in 0..10 { + ts.incr(); + for k in 0..100u64 { + key[0..8].copy_from_slice(&k.to_be_bytes()); + key[8..16].copy_from_slice(&ts.into_inner().to_be_bytes()); + let _ = resolver.track_lock(ts, key.clone(), None); + } + let in_use1 = resolver.memory_quota.in_use(); + let key_count1 = resolver.locks_by_key.len(); + let txn_count1 = resolver.lock_ts_heap.len(); + let txn_lock_count1 = resolver.lock_ts_heap[&ts].lock_count; + assert!(in_use1 > 0); + assert_eq!(key_count1, (c + 1) * 100); + assert_eq!(txn_count1, c + 1); + + // Put same keys again, resolver internal state must be idempotent. + for k in 0..100u64 { + key[0..8].copy_from_slice(&k.to_be_bytes()); + key[8..16].copy_from_slice(&ts.into_inner().to_be_bytes()); + let _ = resolver.track_lock(ts, key.clone(), None); + } + let in_use2 = resolver.memory_quota.in_use(); + let key_count2 = resolver.locks_by_key.len(); + let txn_count2 = resolver.lock_ts_heap.len(); + let txn_lock_count2 = resolver.lock_ts_heap[&ts].lock_count; + assert_eq!(in_use1, in_use2); + assert_eq!(key_count1, key_count2); + assert_eq!(txn_count1, txn_count2); + assert_eq!(txn_lock_count1, txn_lock_count2); + } + assert_eq!(resolver.resolve(ts, None, TsSource::PdTso), 1.into()); + + // untrack_lock + let mut ts = TimeStamp::default(); + for _ in 0..10 { + ts.incr(); + for k in 0..100u64 { + key[0..8].copy_from_slice(&k.to_be_bytes()); + key[8..16].copy_from_slice(&ts.into_inner().to_be_bytes()); + resolver.untrack_lock(&key, None); + } + let in_use1 = resolver.memory_quota.in_use(); + let key_count1 = resolver.locks_by_key.len(); + let txn_count1 = resolver.lock_ts_heap.len(); + + // Unlock same keys again, resolver internal state must be idempotent. 
+ for k in 0..100u64 { + key[0..8].copy_from_slice(&k.to_be_bytes()); + key[8..16].copy_from_slice(&ts.into_inner().to_be_bytes()); + resolver.untrack_lock(&key, None); + } + let in_use2 = resolver.memory_quota.in_use(); + let key_count2 = resolver.locks_by_key.len(); + let txn_count2 = resolver.lock_ts_heap.len(); + assert_eq!(in_use1, in_use2); + assert_eq!(key_count1, key_count2); + assert_eq!(txn_count1, txn_count2); + + assert_eq!(resolver.resolve(ts, None, TsSource::PdTso), ts); + } + + assert_eq!(resolver.memory_quota.in_use(), 0); + assert_eq!(resolver.locks_by_key.len(), 0); + assert_eq!(resolver.lock_ts_heap.len(), 0); + } } diff --git a/components/resolved_ts/src/scanner.rs b/components/resolved_ts/src/scanner.rs index 0ca74bda29d..6c8c90dc38f 100644 --- a/components/resolved_ts/src/scanner.rs +++ b/components/resolved_ts/src/scanner.rs @@ -3,56 +3,79 @@ use std::{marker::PhantomData, sync::Arc, time::Duration}; use engine_traits::KvEngine; -use futures::compat::Future01CompatExt; -use kvproto::{kvrpcpb::ExtraOp as TxnExtraOp, metapb::Region}; +use futures::{channel::oneshot::Receiver, compat::Future01CompatExt, FutureExt}; +use kvproto::metapb::Region; use raftstore::{ - coprocessor::{ObserveHandle, ObserveId}, + coprocessor::ObserveHandle, router::CdcHandle, store::{fsm::ChangeObserver, msg::Callback, RegionSnapshot}, }; use tikv::storage::{ kv::{ScanMode as MvccScanMode, Snapshot}, - mvcc::{DeltaScanner, MvccReader, ScannerBuilder}, - txn::{TxnEntry, TxnEntryScanner}, + mvcc::MvccReader, +}; +use tikv_util::{ + sys::thread::ThreadBuildWrapper, time::Instant, timer::GLOBAL_TIMER_HANDLE, worker::Scheduler, +}; +use tokio::{ + runtime::{Builder, Runtime}, + sync::Semaphore, }; -use tikv_util::{sys::thread::ThreadBuildWrapper, time::Instant, timer::GLOBAL_TIMER_HANDLE}; -use tokio::runtime::{Builder, Runtime}; use txn_types::{Key, Lock, LockType, TimeStamp}; use crate::{ errors::{Error, Result}, - metrics::RTS_SCAN_DURATION_HISTOGRAM, + metrics::*, + Task, }; -const DEFAULT_SCAN_BATCH_SIZE: usize = 1024; +const DEFAULT_SCAN_BATCH_SIZE: usize = 128; const GET_SNAPSHOT_RETRY_TIME: u32 = 3; const GET_SNAPSHOT_RETRY_BACKOFF_STEP: Duration = Duration::from_millis(100); -pub type BeforeStartCallback = Box; -pub type OnErrorCallback = Box; -pub type OnEntriesCallback = Box, u64) + Send>; -pub type IsCancelledCallback = Box bool + Send>; - -pub enum ScanMode { - LockOnly, - All, - AllWithOldValue, -} - pub struct ScanTask { pub handle: ObserveHandle, - pub tag: String, - pub mode: ScanMode, pub region: Region, pub checkpoint_ts: TimeStamp, - pub is_cancelled: IsCancelledCallback, - pub send_entries: OnEntriesCallback, - pub on_error: Option, + pub backoff: Option, + pub cancelled: Receiver<()>, + pub scheduler: Scheduler, +} + +impl ScanTask { + async fn send_entries(&self, entries: ScanEntries, apply_index: u64) { + let task = Task::ScanLocks { + region_id: self.region.get_id(), + observe_id: self.handle.id, + entries, + apply_index, + }; + if let Err(e) = self.scheduler.schedule(task) { + warn!("resolved_ts scheduler send entries failed"; "err" => ?e); + } + } + + fn is_cancelled(&mut self) -> bool { + matches!(self.cancelled.try_recv(), Err(_) | Ok(Some(_))) + } + + fn on_error(&self, err: Error) { + if let Err(e) = self.scheduler.schedule(Task::ReRegisterRegion { + region_id: self.region.get_id(), + observe_id: self.handle.id, + cause: err, + }) { + warn!("schedule re-register task failed"; + "region_id" => self.region.get_id(), + "observe_id" => ?self.handle.id, + "error" => ?e); + 
} + RTS_SCAN_TASKS.with_label_values(&["abort"]).inc(); + } } #[derive(Debug)] -pub enum ScanEntry { - TxnEntry(Vec), +pub enum ScanEntries { Lock(Vec<(Key, Lock)>), None, } @@ -81,95 +104,66 @@ impl, E: KvEngine> ScannerPool { } } - pub fn spawn_task(&self, mut task: ScanTask) { + pub fn spawn_task(&self, mut task: ScanTask, concurrency_semaphore: Arc) { let cdc_handle = self.cdc_handle.clone(); let fut = async move { + tikv_util::defer!({ + RTS_SCAN_TASKS.with_label_values(&["finish"]).inc(); + }); + if let Some(backoff) = task.backoff { + RTS_INITIAL_SCAN_BACKOFF_DURATION_HISTOGRAM.observe(backoff.as_secs_f64()); + let mut backoff = GLOBAL_TIMER_HANDLE + .delay(std::time::Instant::now() + backoff) + .compat() + .fuse(); + futures::select! { + res = backoff => if let Err(e) = res { + error!("failed to backoff"; "err" => ?e); + }, + _ = &mut task.cancelled => {} + } + if task.is_cancelled() { + return; + } + } + let _permit = concurrency_semaphore.acquire().await; + if task.is_cancelled() { + return; + } + fail::fail_point!("resolved_ts_before_scanner_get_snapshot"); let snap = match Self::get_snapshot(&mut task, cdc_handle).await { Ok(snap) => snap, Err(e) => { warn!("resolved_ts scan get snapshot failed"; "err" => ?e); - let ScanTask { - on_error, - region, - handle, - .. - } = task; - if let Some(on_error) = on_error { - on_error(handle.id, region, e); - } + task.on_error(e); return; } }; + fail::fail_point!("resolved_ts_after_scanner_get_snapshot"); let start = Instant::now(); let apply_index = snap.get_apply_index().unwrap(); - let mut entries = vec![]; - match task.mode { - ScanMode::All | ScanMode::AllWithOldValue => { - let txn_extra_op = if let ScanMode::AllWithOldValue = task.mode { - TxnExtraOp::ReadOldValue - } else { - TxnExtraOp::Noop - }; - let mut scanner = ScannerBuilder::new(snap, TimeStamp::max()) - .range(None, None) - .build_delta_scanner(task.checkpoint_ts, txn_extra_op) - .unwrap(); - let mut done = false; - while !done && !(task.is_cancelled)() { - let (es, has_remaining) = match Self::scan_delta(&mut scanner) { - Ok(rs) => rs, - Err(e) => { - warn!("resolved_ts scan delta failed"; "err" => ?e); - let ScanTask { - on_error, - region, - handle, - .. - } = task; - if let Some(on_error) = on_error { - on_error(handle.id, region, e); - } - return; - } - }; - done = !has_remaining; - entries.push(ScanEntry::TxnEntry(es)); - } - } - ScanMode::LockOnly => { - let mut reader = MvccReader::new(snap, Some(MvccScanMode::Forward), false); - let mut done = false; - let mut start = None; - while !done && !(task.is_cancelled)() { - let (locks, has_remaining) = - match Self::scan_locks(&mut reader, start.as_ref(), task.checkpoint_ts) - { - Ok(rs) => rs, - Err(e) => { - warn!("resolved_ts scan lock failed"; "err" => ?e); - let ScanTask { - on_error, - region, - handle, - .. 
- } = task; - if let Some(on_error) = on_error { - on_error(handle.id, region, e); - } - return; - } - }; - done = !has_remaining; - if has_remaining { - start = Some(locks.last().unwrap().0.clone()) + let mut reader = MvccReader::new(snap, Some(MvccScanMode::Forward), false); + let mut done = false; + let mut start_key = None; + while !done && !task.is_cancelled() { + let (locks, has_remaining) = + match Self::scan_locks(&mut reader, start_key.as_ref(), task.checkpoint_ts) { + Ok(rs) => rs, + Err(e) => { + warn!("resolved_ts scan lock failed"; "err" => ?e); + task.on_error(e); + return; } - entries.push(ScanEntry::Lock(locks)); - } + }; + done = !has_remaining; + if has_remaining { + start_key = Some(locks.last().unwrap().0.clone()) } + task.send_entries(ScanEntries::Lock(locks), apply_index) + .await; } - entries.push(ScanEntry::None); RTS_SCAN_DURATION_HISTOGRAM.observe(start.saturating_elapsed().as_secs_f64()); - (task.send_entries)(entries, apply_index); + task.send_entries(ScanEntries::None, apply_index).await; }; self.workers.spawn(fut); } @@ -181,49 +175,51 @@ impl, E: KvEngine> ScannerPool { let mut last_err = None; for retry_times in 0..=GET_SNAPSHOT_RETRY_TIME { if retry_times != 0 { - if let Err(e) = GLOBAL_TIMER_HANDLE + let mut backoff = GLOBAL_TIMER_HANDLE .delay( std::time::Instant::now() + GET_SNAPSHOT_RETRY_BACKOFF_STEP .mul_f64(10_f64.powi(retry_times as i32 - 1)), ) .compat() - .await - { - error!("failed to backoff"; "err" => ?e); + .fuse(); + futures::select! { + res = backoff => if let Err(e) = res { + error!("failed to backoff"; "err" => ?e); + }, + _ = &mut task.cancelled => {} } - if (task.is_cancelled)() { - return Err(Error::Other("scan task cancelled".into())); + if task.is_cancelled() { + return Err(box_err!("scan task cancelled")); } } let (cb, fut) = tikv_util::future::paired_future_callback(); let change_cmd = ChangeObserver::from_rts(task.region.id, task.handle.clone()); - cdc_handle.capture_change( - task.region.id, - task.region.get_region_epoch().clone(), - change_cmd, - Callback::read(Box::new(cb)), - )?; + cdc_handle + .capture_change( + task.region.id, + task.region.get_region_epoch().clone(), + change_cmd, + Callback::read(Box::new(cb)), + ) + .map_err(|e| Error::Other(box_err!("{:?}", e)))?; let mut resp = box_try!(fut.await); if resp.response.get_header().has_error() { let err = resp.response.take_header().take_error(); // These two errors can't handled by retrying since the epoch and observe id is // unchanged if err.has_epoch_not_match() || err.get_message().contains("stale observe id") { - return Err(Error::request(err)); + return Err(box_err!("get snapshot failed: {:?}", err)); } last_err = Some(err) } else { return Ok(resp.snapshot.unwrap()); } } - Err(Error::Other( - format!( - "backoff timeout after {} try, last error: {:?}", - GET_SNAPSHOT_RETRY_TIME, - last_err.unwrap() - ) - .into(), + Err(box_err!( + "backoff timeout after {} try, last error: {:?}", + GET_SNAPSHOT_RETRY_TIME, + last_err.unwrap() )) } @@ -232,29 +228,14 @@ impl, E: KvEngine> ScannerPool { start: Option<&Key>, _checkpoint_ts: TimeStamp, ) -> Result<(Vec<(Key, Lock)>, bool)> { - let (locks, has_remaining) = reader.scan_locks( - start, - None, - |lock| matches!(lock.lock_type, LockType::Put | LockType::Delete), - DEFAULT_SCAN_BATCH_SIZE, - )?; + let (locks, has_remaining) = reader + .scan_locks( + start, + None, + |lock| matches!(lock.lock_type, LockType::Put | LockType::Delete), + DEFAULT_SCAN_BATCH_SIZE, + ) + .map_err(|e| Error::Other(box_err!("{:?}", e)))?; Ok((locks, 
has_remaining)) } - - fn scan_delta(scanner: &mut DeltaScanner) -> Result<(Vec, bool)> { - let mut entries = Vec::with_capacity(DEFAULT_SCAN_BATCH_SIZE); - let mut has_remaining = true; - while entries.len() < entries.capacity() { - match scanner.next_entry()? { - Some(entry) => { - entries.push(entry); - } - None => { - has_remaining = false; - break; - } - } - } - Ok((entries, has_remaining)) - } } diff --git a/components/resolved_ts/tests/failpoints/mod.rs b/components/resolved_ts/tests/failpoints/mod.rs index 808f5ed62ff..0c594ab1d1d 100644 --- a/components/resolved_ts/tests/failpoints/mod.rs +++ b/components/resolved_ts/tests/failpoints/mod.rs @@ -2,6 +2,11 @@ #[path = "../mod.rs"] mod testsuite; +use std::{ + sync::{mpsc::channel, Mutex}, + time::Duration, +}; + use futures::executor::block_on; use kvproto::kvrpcpb::*; use pd_client::PdClient; @@ -128,3 +133,43 @@ fn test_report_min_resolved_ts_disable() { fail::remove("mock_min_resolved_ts_interval_disable"); suite.stop(); } + +#[test] +fn test_pending_locks_memory_quota_exceeded() { + // Pause scan lock so that locks will be put in pending locks. + fail::cfg("resolved_ts_after_scanner_get_snapshot", "pause").unwrap(); + // Check if memory quota exceeded is triggered. + let (tx, rx) = channel(); + let tx = Mutex::new(tx); + fail::cfg_callback( + "resolved_ts_on_pending_locks_memory_quota_exceeded", + move || { + let sender = tx.lock().unwrap(); + sender.send(()).unwrap(); + }, + ) + .unwrap(); + + let mut suite = TestSuite::new(1); + let region = suite.cluster.get_region(&[]); + + // Must not trigger memory quota exceeded. + rx.recv_timeout(Duration::from_millis(100)).unwrap_err(); + + // Set a small memory quota to trigger memory quota exceeded. + suite.must_change_memory_quota(1, 1); + let (k, v) = (b"k1", b"v"); + let start_ts = block_on(suite.cluster.pd_client.get_tso()).unwrap(); + let mut mutation = Mutation::default(); + mutation.set_op(Op::Put); + mutation.key = k.to_vec(); + mutation.value = v.to_vec(); + suite.must_kv_prewrite(region.id, vec![mutation], k.to_vec(), start_ts, false); + + // Must trigger memory quota exceeded. + rx.recv_timeout(Duration::from_secs(5)).unwrap(); + + fail::remove("resolved_ts_after_scanner_get_snapshot"); + fail::remove("resolved_ts_on_pending_locks_memory_quota_exceeded"); + suite.stop(); +} diff --git a/components/resolved_ts/tests/integrations/mod.rs b/components/resolved_ts/tests/integrations/mod.rs index 7802108b92b..881d0b299f1 100644 --- a/components/resolved_ts/tests/integrations/mod.rs +++ b/components/resolved_ts/tests/integrations/mod.rs @@ -2,15 +2,17 @@ #[path = "../mod.rs"] mod testsuite; -use std::time::Duration; +use std::{sync::mpsc::channel, time::Duration}; use futures::executor::block_on; use kvproto::{kvrpcpb::*, metapb::RegionEpoch}; use pd_client::PdClient; +use resolved_ts::Task; use tempfile::Builder; -use test_raftstore::sleep_ms; +use test_raftstore::{sleep_ms, IsolationFilterFactory}; use test_sst_importer::*; pub use testsuite::*; +use tikv_util::store::new_peer; #[test] fn test_resolved_ts_basic() { @@ -141,3 +143,120 @@ fn test_dynamic_change_advance_ts_interval() { suite.stop(); } + +#[test] +fn test_change_log_memory_quota_exceeded() { + let mut suite = TestSuite::new(1); + let region = suite.cluster.get_region(&[]); + + suite.must_get_rts_ge( + region.id, + block_on(suite.cluster.pd_client.get_tso()).unwrap(), + ); + + // Set a small memory quota to trigger memory quota exceeded. 
+ suite.must_change_memory_quota(1, 1); + let (k, v) = (b"k1", b"v"); + let start_ts = block_on(suite.cluster.pd_client.get_tso()).unwrap(); + let mut mutation = Mutation::default(); + mutation.set_op(Op::Put); + mutation.key = k.to_vec(); + mutation.value = v.to_vec(); + suite.must_kv_prewrite(region.id, vec![mutation], k.to_vec(), start_ts, false); + + // Resolved ts should not advance. + let (tx, rx) = channel(); + suite.must_schedule_task( + 1, + Task::GetDiagnosisInfo { + region_id: 1, + log_locks: false, + min_start_ts: u64::MAX, + callback: Box::new(move |res| { + tx.send(res).unwrap(); + }), + }, + ); + let res = rx.recv_timeout(Duration::from_secs(5)).unwrap(); + assert_eq!(res.unwrap().1, 0, "{:?}", res); + + suite.stop(); +} + +#[test] +fn test_scan_log_memory_quota_exceeded() { + let mut suite = TestSuite::new(1); + let region = suite.cluster.get_region(&[]); + + suite.must_get_rts_ge( + region.id, + block_on(suite.cluster.pd_client.get_tso()).unwrap(), + ); + + let (k, v) = (b"k1", b"v"); + let start_ts = block_on(suite.cluster.pd_client.get_tso()).unwrap(); + let mut mutation = Mutation::default(); + mutation.set_op(Op::Put); + mutation.key = k.to_vec(); + mutation.value = v.to_vec(); + suite.must_kv_prewrite(region.id, vec![mutation], k.to_vec(), start_ts, false); + + // Set a small memory quota to trigger memory quota exceeded. + suite.must_change_memory_quota(1, 1); + // Split region + suite.cluster.must_split(®ion, k); + + let r1 = suite.cluster.get_region(&[]); + let r2 = suite.cluster.get_region(k); + let current_ts = block_on(suite.cluster.pd_client.get_tso()).unwrap(); + // Wait for scan log. + sleep_ms(500); + // Resolved ts of region1 should be advanced + suite.must_get_rts_ge(r1.id, current_ts); + + // Resolved ts should not advance. + let (tx, rx) = channel(); + suite.must_schedule_task( + r2.id, + Task::GetDiagnosisInfo { + region_id: r2.id, + log_locks: false, + min_start_ts: u64::MAX, + callback: Box::new(move |res| { + tx.send(res).unwrap(); + }), + }, + ); + let res = rx.recv_timeout(Duration::from_secs(5)).unwrap(); + assert_eq!(res.unwrap().1, 0, "{:?}", res); + + suite.stop(); +} + +// This case checks resolved ts can still be advanced quickly even if some TiKV +// stores are partitioned. 
+#[test] +fn test_store_partitioned() { + let mut suite = TestSuite::new(3); + let r = suite.cluster.get_region(&[]); + suite.cluster.must_transfer_leader(r.id, new_peer(1, 1)); + suite.must_get_rts_ge(r.id, block_on(suite.cluster.pd_client.get_tso()).unwrap()); + + suite + .cluster + .add_send_filter(IsolationFilterFactory::new(3)); + let tso = block_on(suite.cluster.pd_client.get_tso()).unwrap(); + for _ in 0..50 { + let rts = suite.region_resolved_ts(r.id).unwrap(); + if rts > tso { + if rts.physical() - tso.physical() < 3000 { + break; + } else { + panic!("resolved ts doesn't advance in time") + } + } + sleep_ms(100); + } + + suite.stop(); +} diff --git a/components/resolved_ts/tests/mod.rs b/components/resolved_ts/tests/mod.rs index 4e6226f8935..830e2156e9f 100644 --- a/components/resolved_ts/tests/mod.rs +++ b/components/resolved_ts/tests/mod.rs @@ -122,8 +122,21 @@ impl TestSuite { ); c }; + self.must_schedule_task(store_id, Task::ChangeConfig { change }); + } + + pub fn must_change_memory_quota(&self, store_id: u64, bytes: u64) { + let change = { + let mut c = std::collections::HashMap::default(); + c.insert("memory_quota".to_owned(), ConfigValue::Size(bytes)); + c + }; + self.must_schedule_task(store_id, Task::ChangeConfig { change }); + } + + pub fn must_schedule_task(&self, store_id: u64, task: Task) { let scheduler = self.endpoints.get(&store_id).unwrap().scheduler(); - scheduler.schedule(Task::ChangeConfig { change }).unwrap(); + scheduler.schedule(task).unwrap(); } pub fn must_kv_prewrite( diff --git a/components/resource_control/src/future.rs b/components/resource_control/src/future.rs index fd98fc9a092..3a45e086ba2 100644 --- a/components/resource_control/src/future.rs +++ b/components/resource_control/src/future.rs @@ -16,7 +16,7 @@ use tokio_timer::Delay; use crate::{ resource_group::{ResourceConsumeType, ResourceController}, - resource_limiter::ResourceLimiter, + resource_limiter::{ResourceLimiter, ResourceType}, }; const MAX_WAIT_DURATION: Duration = Duration::from_secs(10); @@ -92,7 +92,9 @@ pub struct LimitedFuture { #[pin] post_delay: OptionalFuture>, resource_limiter: Arc, - res: Poll, + // if the future is first polled, we need to let it consume a 0 value + // to compensate the debt of previously finished tasks. 
+ is_first_poll: bool, } impl LimitedFuture { @@ -102,7 +104,7 @@ impl LimitedFuture { pre_delay: None.into(), post_delay: None.into(), resource_limiter, - res: Poll::Pending, + is_first_poll: true, } } } @@ -112,26 +114,50 @@ impl Future for LimitedFuture { fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { let mut this = self.project(); - if !this.post_delay.is_done() { - assert!(this.pre_delay.is_done()); + if *this.is_first_poll { + debug_assert!(this.pre_delay.finished && this.post_delay.finished); + *this.is_first_poll = false; + let wait_dur = this + .resource_limiter + .consume(Duration::ZERO, IoBytes::default(), true) + .min(MAX_WAIT_DURATION); + if wait_dur > Duration::ZERO { + *this.pre_delay = Some( + GLOBAL_TIMER_HANDLE + .delay(std::time::Instant::now() + wait_dur) + .compat(), + ) + .into(); + } + } + if !this.post_delay.finished { + assert!(this.pre_delay.finished); std::mem::swap(&mut *this.pre_delay, &mut *this.post_delay); } - if !this.pre_delay.is_done() { + if !this.pre_delay.finished { let res = this.pre_delay.poll(cx); if res.is_pending() { return Poll::Pending; } } - if this.res.is_ready() { - return std::mem::replace(this.res, Poll::Pending); - } - let last_io_bytes = match get_thread_io_bytes_stats() { - Ok(b) => Some(b), - Err(e) => { - warn!("load thread io bytes failed"; "err" => e); - None + // get io stats is very expensive, so we only do so if only io control is + // enabled. + let mut last_io_bytes = None; + if this + .resource_limiter + .get_limiter(ResourceType::Io) + .get_rate_limit() + .is_finite() + { + match get_thread_io_bytes_stats() { + Ok(b) => { + last_io_bytes = Some(b); + } + Err(e) => { + warn!("load thread io bytes failed"; "err" => e); + } } - }; + } let start = Instant::now(); let res = this.f.poll(cx); let dur = start.saturating_elapsed(); @@ -146,8 +172,10 @@ impl Future for LimitedFuture { } else { IoBytes::default() }; - let mut wait_dur = this.resource_limiter.consume(dur, io_bytes); - if wait_dur == Duration::ZERO { + let mut wait_dur = this + .resource_limiter + .consume(dur, io_bytes, res.is_pending()); + if wait_dur == Duration::ZERO || res.is_ready() { return res; } if wait_dur > MAX_WAIT_DURATION { @@ -160,31 +188,24 @@ impl Future for LimitedFuture { .compat(), ) .into(); - if this.post_delay.poll(cx).is_ready() { - return res; - } - *this.res = res; + _ = this.post_delay.poll(cx); Poll::Pending } } /// `OptionalFuture` is similar to futures::OptionFuture, but provide an extra -/// `is_done` method. +/// `finished` flag to determine if the future requires poll. 
#[pin_project] struct OptionalFuture { #[pin] f: Option, - done: bool, + finished: bool, } impl OptionalFuture { fn new(f: Option) -> Self { - let done = f.is_none(); - Self { f, done } - } - - fn is_done(&self) -> bool { - self.done + let finished = f.is_none(); + Self { f, finished } } } @@ -201,7 +222,7 @@ impl Future for OptionalFuture { let this = self.project(); match this.f.as_pin_mut() { Some(x) => x.poll(cx).map(|r| { - *this.done = true; + *this.finished = true; Some(r) }), None => Poll::Ready(None), @@ -263,7 +284,13 @@ mod tests { .name_prefix("test") .build_future_pool(); - let resource_limiter = Arc::new(ResourceLimiter::new("".into(), f64::INFINITY, 1000.0, 0)); + let resource_limiter = Arc::new(ResourceLimiter::new( + "".into(), + f64::INFINITY, + 1000.0, + 0, + true, + )); fn spawn_and_wait(pool: &FuturePool, f: F, limiter: Arc) where @@ -295,15 +322,18 @@ mod tests { let delta = new_stats - stats; let dur = start.saturating_elapsed(); assert_eq!(delta.total_consumed, 150); - assert_eq!(delta.total_wait_dur_us, 150_000); - assert!(dur >= Duration::from_millis(150) && dur <= Duration::from_millis(160)); + assert!(delta.total_wait_dur_us >= 140_000 && delta.total_wait_dur_us <= 160_000); + assert!(dur >= Duration::from_millis(140) && dur <= Duration::from_millis(160)); // fetch io bytes failed, consumed value is 0. #[cfg(feature = "failpoints")] { fail::cfg("failed_to_get_thread_io_bytes_stats", "1*return").unwrap(); spawn_and_wait(&pool, empty(), resource_limiter.clone()); - assert_eq!(resource_limiter.get_limit_statistics(Io), new_stats); + assert_eq!( + resource_limiter.get_limit_statistics(Io).total_consumed, + new_stats.total_consumed + ); fail::remove("failed_to_get_thread_io_bytes_stats"); } } diff --git a/components/resource_control/src/lib.rs b/components/resource_control/src/lib.rs index 6cfd24914a1..53db62c96e1 100644 --- a/components/resource_control/src/lib.rs +++ b/components/resource_control/src/lib.rs @@ -1,6 +1,8 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. #![feature(test)] #![feature(local_key_cell_methods)] +#![feature(array_zip)] +#![feature(let_chains)] use std::sync::Arc; @@ -10,8 +12,8 @@ use serde::{Deserialize, Serialize}; mod resource_group; pub use resource_group::{ - ResourceConsumeType, ResourceController, ResourceGroupManager, TaskMetadata, - MIN_PRIORITY_UPDATE_INTERVAL, + priority_from_task_meta, ResourceConsumeType, ResourceController, ResourceGroupManager, + TaskMetadata, MIN_PRIORITY_UPDATE_INTERVAL, }; mod future; @@ -29,7 +31,9 @@ pub use channel::ResourceMetered; mod resource_limiter; pub use resource_limiter::ResourceLimiter; use tikv_util::worker::Worker; -use worker::{GroupQuotaAdjustWorker, BACKGROUND_LIMIT_ADJUST_DURATION}; +use worker::{ + GroupQuotaAdjustWorker, PriorityLimiterAdjustWorker, BACKGROUND_LIMIT_ADJUST_DURATION, +}; mod metrics; pub mod worker; @@ -66,10 +70,13 @@ pub fn start_periodic_tasks( bg_worker.spawn_async_task(async move { resource_mgr_service_clone.watch_resource_groups().await; }); - // spawn a task to auto adjust background quota limiter. + // spawn a task to auto adjust background quota limiter and priority quota + // limiter. let mut worker = GroupQuotaAdjustWorker::new(mgr.clone(), io_bandwidth); + let mut priority_worker = PriorityLimiterAdjustWorker::new(mgr.clone()); bg_worker.spawn_interval_task(BACKGROUND_LIMIT_ADJUST_DURATION, move || { worker.adjust_quota(); + priority_worker.adjust(); }); // spawn a task to periodically upload resource usage statistics to PD. 
bg_worker.spawn_async_task(async move { diff --git a/components/resource_control/src/metrics.rs b/components/resource_control/src/metrics.rs index 16338f41c6c..594c6af486a 100644 --- a/components/resource_control/src/metrics.rs +++ b/components/resource_control/src/metrics.rs @@ -7,19 +7,45 @@ lazy_static! { pub static ref BACKGROUND_QUOTA_LIMIT_VEC: IntGaugeVec = register_int_gauge_vec!( "tikv_resource_control_background_quota_limiter", "The quota limiter of background resource groups per resource type", - &["name", "type"] + &["resource_group", "type"] ) .unwrap(); pub static ref BACKGROUND_RESOURCE_CONSUMPTION: IntCounterVec = register_int_counter_vec!( "tikv_resource_control_background_resource_consumption", "Total resource consumed of background resource groups per resource type", - &["name", "type"] + &["resource_group", "type"] ) .unwrap(); pub static ref BACKGROUND_TASKS_WAIT_DURATION: IntCounterVec = register_int_counter_vec!( "tikv_resource_control_background_task_wait_duration", "Total wait duration of background tasks per resource group", - &["name"] + &["resource_group"] + ) + .unwrap(); + pub static ref PRIORITY_QUOTA_LIMIT_VEC: IntGaugeVec = register_int_gauge_vec!( + "tikv_resource_control_priority_quota_limit", + "The quota limiter for each priority in resource control", + &["priority"] + ) + .unwrap(); + pub static ref PRIORITY_CPU_TIME_VEC: IntCounterVec = register_int_counter_vec!( + "tikv_resource_control_priority_task_exec_duration", + "Total execution duration of tasks per-priority", + &["priority"] + ) + .unwrap(); + pub static ref PRIORITY_WAIT_DURATION_VEC: HistogramVec = register_histogram_vec!( + "tikv_resource_control_priority_wait_duration", + "Histogram of wait duration cause by priority quota limiter", + &["priority"], + exponential_buckets(1e-5, 2.0, 18).unwrap() // 10us ~ 2.5s + ) + .unwrap(); + + pub static ref BACKGROUND_TASK_RESOURCE_UTILIZATION_VEC: IntGaugeVec = register_int_gauge_vec!( + "tikv_resource_control_bg_resource_utilization", + "The total resource utilization percentage of background tasks", + &["type"] ) .unwrap(); } diff --git a/components/resource_control/src/resource_group.rs b/components/resource_control/src/resource_group.rs index a356d30a7ac..7e6d4279a25 100644 --- a/components/resource_control/src/resource_group.rs +++ b/components/resource_control/src/resource_group.rs @@ -22,6 +22,7 @@ use kvproto::{ resource_manager::{GroupMode, ResourceGroup as PbResourceGroup}, }; use parking_lot::{MappedRwLockReadGuard, RwLock, RwLockReadGuard}; +use strum::{EnumCount, EnumIter, IntoEnumIterator}; use tikv_util::{info, time::Instant}; use yatp::queue::priority::TaskPriorityProvider; @@ -40,7 +41,6 @@ const DEFAULT_MAX_RU_QUOTA: u64 = 10_000; /// The maximum RU quota that can be configured. const MAX_RU_QUOTA: u64 = i32::MAX as u64; -#[cfg(test)] const LOW_PRIORITY: u32 = 1; const MEDIUM_PRIORITY: u32 = 8; #[cfg(test)] @@ -56,21 +56,74 @@ pub enum ResourceConsumeType { IoBytes(u64), } +#[derive(Copy, Clone, Eq, PartialEq, EnumCount, EnumIter, Debug)] +#[repr(usize)] +pub enum TaskPriority { + High = 0, + Medium = 1, + Low = 2, +} + +impl TaskPriority { + pub fn as_str(&self) -> &'static str { + match *self { + TaskPriority::High => "high", + TaskPriority::Medium => "medium", + TaskPriority::Low => "low", + } + } +} + +impl From for TaskPriority { + fn from(value: u32) -> Self { + // map the resource group priority value (1,8,16) to (Low,Medium,High) + // 0 means the priority is not set, so map it to medium by default. 
+ if value == 0 { + Self::Medium + } else if value < 6 { + Self::Low + } else if value < 11 { + Self::Medium + } else { + Self::High + } + } +} + /// ResourceGroupManager manages the metadata of each resource group. pub struct ResourceGroupManager { pub(crate) resource_groups: DashMap, + // the count of all groups, a fast path because call `DashMap::len` is a little slower. + group_count: AtomicU64, registry: RwLock>>, // auto incremental version generator used for mark the background // resource limiter has changed. version_generator: AtomicU64, + // the shared resource limiter of each priority + priority_limiters: [Arc; TaskPriority::COUNT], } impl Default for ResourceGroupManager { fn default() -> Self { + let priority_limiters = TaskPriority::iter() + .map(|p| { + Arc::new(ResourceLimiter::new( + p.as_str().to_owned(), + f64::INFINITY, + f64::INFINITY, + 0, + false, + )) + }) + .collect::>() + .try_into() + .unwrap(); let manager = Self { resource_groups: Default::default(), + group_count: AtomicU64::new(0), registry: Default::default(), version_generator: AtomicU64::new(0), + priority_limiters, }; // init the default resource group by default. @@ -90,6 +143,11 @@ impl Default for ResourceGroupManager { } impl ResourceGroupManager { + #[inline] + pub fn get_group_count(&self) -> u64 { + self.group_count.load(Ordering::Relaxed) + } + fn get_ru_setting(rg: &PbResourceGroup, is_read: bool) -> u64 { match (rg.get_mode(), is_read) { // RU mode, read and write use the same setting. @@ -129,8 +187,13 @@ impl ResourceGroupManager { .and_then(|g| g.limiter.clone()); let limiter = self.build_resource_limiter(&rg, prev_limiter); - self.resource_groups - .insert(group_name, ResourceGroup::new(rg, limiter)); + if self + .resource_groups + .insert(group_name, ResourceGroup::new(rg, limiter)) + .is_none() + { + self.group_count.fetch_add(1, Ordering::Relaxed); + } } fn build_resource_limiter( @@ -146,6 +209,7 @@ impl ResourceGroupManager { f64::INFINITY, f64::INFINITY, version, + true, ))) }) } else { @@ -161,6 +225,7 @@ impl ResourceGroupManager { if self.resource_groups.remove(&group_name).is_some() { deregister_metrics(name); info!("remove resource group"; "name"=> name); + self.group_count.fetch_sub(1, Ordering::Relaxed); } } @@ -184,6 +249,8 @@ impl ResourceGroupManager { controller.remove_resource_group(name.as_bytes()); } }); + self.group_count + .fetch_sub(removed_names.len() as u64, Ordering::Relaxed); } } @@ -234,20 +301,92 @@ impl ResourceGroupManager { } } + // only enable priority quota limiter when there is at least 1 user-defined + // resource group. + #[inline] + fn enable_priority_limiter(&self) -> bool { + self.get_group_count() > 1 + } + + /// return the priority of target resource group. + #[inline] + pub fn get_resource_group_priority(&self, group: &str) -> u32 { + self.resource_groups + .get(group) + .map_or(LOW_PRIORITY, |g| g.group.priority) + } + + // Always return the background resource limiter if any; + // Only return the foregroup limiter when priority is enabled. pub fn get_resource_limiter( &self, rg: &str, request_source: &str, + override_priority: u64, + ) -> Option> { + let (limiter, group_priority) = + self.get_background_resource_limiter_with_priority(rg, request_source); + if limiter.is_some() { + return limiter; + } + + // if there is only 1 resource group, priority quota limiter is useless so just + // return None for better performance. 
+ if !self.enable_priority_limiter() { + return None; + } + + // request priority has higher priority, 0 means priority is not set. + let mut task_priority = override_priority as u32; + if task_priority == 0 { + task_priority = group_priority; + } + Some(self.priority_limiters[TaskPriority::from(task_priority) as usize].clone()) + } + + // return a ResourceLimiter for background tasks only. + pub fn get_background_resource_limiter( + &self, + rg: &str, + request_source: &str, ) -> Option> { + self.get_background_resource_limiter_with_priority(rg, request_source) + .0 + } + + fn get_background_resource_limiter_with_priority( + &self, + rg: &str, + request_source: &str, + ) -> (Option>, u32) { + fail_point!("only_check_source_task_name", |name| { + assert_eq!(&name.unwrap(), request_source); + (None, 8) + }); + let mut group_priority = None; if let Some(group) = self.resource_groups.get(rg) { + group_priority = Some(group.group.priority); if !group.fallback_default { - return group.get_resource_limiter(request_source); + return ( + group.get_background_resource_limiter(request_source), + group.group.priority, + ); } } - self.resource_groups + let default_group = self + .resource_groups .get(DEFAULT_RESOURCE_GROUP_NAME) - .and_then(|g| g.get_resource_limiter(request_source)) + .unwrap(); + ( + default_group.get_background_resource_limiter(request_source), + group_priority.unwrap_or(default_group.group.priority), + ) + } + + #[inline] + pub fn get_priority_resource_limiters(&self) -> [Arc; 3] { + self.priority_limiters.clone() } } @@ -282,7 +421,10 @@ impl ResourceGroup { .get_fill_rate() } - fn get_resource_limiter(&self, request_source: &str) -> Option> { + fn get_background_resource_limiter( + &self, + request_source: &str, + ) -> Option> { self.limiter.as_ref().and_then(|limiter| { // the source task name is the last part of `request_source` separated by "_" // the request_source is @@ -583,7 +725,7 @@ impl<'a> TaskMetadata<'a> { self.metadata.into_owned() } - fn override_priority(&self) -> u32 { + pub fn override_priority(&self) -> u32 { if self.metadata.is_empty() { return 0; } @@ -609,6 +751,15 @@ impl<'a> TaskMetadata<'a> { } } +// return the TaskPriority value from task metadata. +// This function is used for handling thread pool task waiting metrics. 
+pub fn priority_from_task_meta(meta: &[u8]) -> usize { + let priority = TaskMetadata::from_bytes(meta).override_priority(); + // mapping (high(15), medium(8), low(1)) -> (0, 1, 2) + debug_assert!(priority <= 16); + TaskPriority::from(priority) as usize +} + impl TaskPriorityProvider for ResourceController { fn priority_of(&self, extras: &yatp::queue::Extras) -> u64 { let metadata = TaskMetadata::from_bytes(extras.metadata()); @@ -867,6 +1018,35 @@ pub(crate) mod tests { ); } + #[test] + fn test_resource_group_crud() { + let resource_manager = ResourceGroupManager::default(); + assert_eq!(resource_manager.get_group_count(), 1); + + let group1 = new_resource_group_ru("test1".into(), 100, HIGH_PRIORITY); + resource_manager.add_resource_group(group1); + assert_eq!(resource_manager.get_group_count(), 2); + + let group2 = new_resource_group_ru("test2".into(), 200, LOW_PRIORITY); + resource_manager.add_resource_group(group2); + assert_eq!(resource_manager.get_group_count(), 3); + + let group1 = new_resource_group_ru("test1".into(), 150, HIGH_PRIORITY); + resource_manager.add_resource_group(group1.clone()); + assert_eq!(resource_manager.get_group_count(), 3); + assert_eq!( + resource_manager.get_resource_group("test1").unwrap().group, + group1 + ); + + resource_manager.remove_resource_group("test2"); + assert!(resource_manager.get_resource_group("test2").is_none()); + assert_eq!(resource_manager.get_group_count(), 2); + + resource_manager.remove_resource_group("test2"); + assert_eq!(resource_manager.get_group_count(), 2); + } + #[test] fn test_resource_group_priority() { let resource_manager = ResourceGroupManager::default(); @@ -1161,4 +1341,112 @@ pub(crate) mod tests { assert_eq!(metadata1.group_name(), group_name.as_bytes()); } } + + #[test] + fn test_get_resource_limiter() { + let mgr = ResourceGroupManager::default(); + + let default_group = new_background_resource_group_ru( + "default".into(), + 200, + MEDIUM_PRIORITY, + vec!["br".into(), "stats".into()], + ); + mgr.add_resource_group(default_group); + let default_limiter = mgr + .get_resource_group("default") + .unwrap() + .limiter + .clone() + .unwrap(); + + assert!(mgr.get_resource_limiter("default", "query", 0).is_none()); + assert!( + mgr.get_resource_limiter("default", "query", HIGH_PRIORITY as u64) + .is_none() + ); + + let group1 = new_resource_group("test1".into(), true, 100, 100, HIGH_PRIORITY); + mgr.add_resource_group(group1); + + let bg_group = new_background_resource_group_ru( + "bg".into(), + 50, + LOW_PRIORITY, + vec!["ddl".into(), "stats".into()], + ); + mgr.add_resource_group(bg_group); + let bg_limiter = mgr + .get_resource_group("bg") + .unwrap() + .limiter + .clone() + .unwrap(); + + assert!( + mgr.get_background_resource_limiter("test1", "ddl") + .is_none() + ); + assert!(Arc::ptr_eq( + &mgr.get_background_resource_limiter("test1", "stats") + .unwrap(), + &default_limiter + )); + + assert!(Arc::ptr_eq( + &mgr.get_background_resource_limiter("bg", "stats").unwrap(), + &bg_limiter + )); + assert!(mgr.get_background_resource_limiter("bg", "br").is_none()); + assert!( + mgr.get_background_resource_limiter("bg", "invalid") + .is_none() + ); + + assert!(Arc::ptr_eq( + &mgr.get_background_resource_limiter("unknown", "stats") + .unwrap(), + &default_limiter + )); + + assert!(Arc::ptr_eq( + &mgr.get_resource_limiter("test1", "stats", 0).unwrap(), + &default_limiter + )); + assert!(Arc::ptr_eq( + &mgr.get_resource_limiter("test1", "query", 0).unwrap(), + &mgr.priority_limiters[0] + )); + assert!(Arc::ptr_eq( + 
&mgr.get_resource_limiter("test1", "query", LOW_PRIORITY as u64)
+                .unwrap(),
+            &mgr.priority_limiters[2]
+        ));
+
+        assert!(Arc::ptr_eq(
+            &mgr.get_resource_limiter("default", "query", LOW_PRIORITY as u64)
+                .unwrap(),
+            &mgr.priority_limiters[2]
+        ));
+        assert!(Arc::ptr_eq(
+            &mgr.get_resource_limiter("unknown", "query", 0).unwrap(),
+            &mgr.priority_limiters[1]
+        ));
+    }
+
+    #[test]
+    fn test_task_priority() {
+        use TaskPriority::*;
+        let cases = [
+            (0, Medium),
+            (1, Low),
+            (7, Medium),
+            (8, Medium),
+            (15, High),
+            (16, High),
+        ];
+        for (value, priority) in cases {
+            assert_eq!(TaskPriority::from(value), priority);
+        }
+    }
 }
diff --git a/components/resource_control/src/resource_limiter.rs b/components/resource_control/src/resource_limiter.rs
index 8898b4eba23..4cc139152dd 100644
--- a/components/resource_control/src/resource_limiter.rs
+++ b/components/resource_control/src/resource_limiter.rs
@@ -8,10 +8,11 @@ use std::{
 
 use file_system::IoBytes;
 use futures::compat::Future01CompatExt;
+use prometheus::Histogram;
 use strum::EnumCount;
 use tikv_util::{time::Limiter, timer::GLOBAL_TIMER_HANDLE};
 
-use crate::metrics::BACKGROUND_TASKS_WAIT_DURATION;
+use crate::{metrics::PRIORITY_WAIT_DURATION_VEC, resource_group::TaskPriority};
 
 #[derive(Clone, Copy, Eq, PartialEq, EnumCount)]
 #[repr(usize)]
@@ -36,9 +37,13 @@ impl fmt::Debug for ResourceType {
 }
 
 pub struct ResourceLimiter {
-    name: String,
+    _name: String,
     version: u64,
     limiters: [QuotaLimiter; ResourceType::COUNT],
+    // whether the resource limiter is a background limiter or priority limiter.
+    is_background: bool,
+    // the wait duration histogram for the priority limiter.
+    wait_histogram: Option<Histogram>,
 }
 
 impl std::fmt::Debug for ResourceLimiter {
@@ -48,29 +53,54 @@ impl std::fmt::Debug for ResourceLimiter {
 }
 
 impl ResourceLimiter {
-    pub fn new(name: String, cpu_limit: f64, io_limit: f64, version: u64) -> Self {
+    pub fn new(
+        name: String,
+        cpu_limit: f64,
+        io_limit: f64,
+        version: u64,
+        is_background: bool,
+    ) -> Self {
         let cpu_limiter = QuotaLimiter::new(cpu_limit);
         let io_limiter = QuotaLimiter::new(io_limit);
+        // High-priority tasks never trigger a wait, so there is no need to
+        // register an empty metric for them.
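+        // (Illustrative note: the label used below is the limiter's name,
+        // which for priority limiters is the priority string such as
+        // "medium" or "low".)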
+        let wait_histogram = if !is_background && name != TaskPriority::High.as_str() {
+            Some(
+                PRIORITY_WAIT_DURATION_VEC
+                    .get_metric_with_label_values(&[&name])
+                    .unwrap(),
+            )
+        } else {
+            None
+        };
         Self {
-            name,
+            _name: name,
             version,
             limiters: [cpu_limiter, io_limiter],
+            is_background,
+            wait_histogram,
         }
     }
 
-    pub fn consume(&self, cpu_time: Duration, io_bytes: IoBytes) -> Duration {
+    pub fn is_background(&self) -> bool {
+        self.is_background
+    }
+
+    pub fn consume(&self, cpu_time: Duration, io_bytes: IoBytes, wait: bool) -> Duration {
         let cpu_dur =
-            self.limiters[ResourceType::Cpu as usize].consume(cpu_time.as_micros() as u64);
-        let io_dur = self.limiters[ResourceType::Io as usize].consume_io(io_bytes);
+            self.limiters[ResourceType::Cpu as usize].consume(cpu_time.as_micros() as u64, wait);
+        let io_dur = self.limiters[ResourceType::Io as usize].consume_io(io_bytes, wait);
         let wait_dur = cpu_dur.max(io_dur);
-        BACKGROUND_TASKS_WAIT_DURATION
-            .with_label_values(&[&self.name])
-            .inc_by(wait_dur.as_micros() as u64);
+        if !wait_dur.is_zero()
+            && let Some(h) = &self.wait_histogram
+        {
+            h.observe(wait_dur.as_secs_f64());
+        }
         wait_dur
     }
 
     pub async fn async_consume(&self, cpu_time: Duration, io_bytes: IoBytes) -> Duration {
-        let dur = self.consume(cpu_time, io_bytes);
+        let dur = self.consume(cpu_time, io_bytes, true);
         if !dur.is_zero() {
             _ = GLOBAL_TIMER_HANDLE
                 .delay(Instant::now() + dur)
@@ -86,7 +116,7 @@ impl ResourceLimiter {
     }
 
     pub(crate) fn get_limit_statistics(&self, ty: ResourceType) -> GroupStatistics {
-        let (total_consumed, total_wait_dur_us, read_consumed, write_consumed) =
+        let (total_consumed, total_wait_dur_us, read_consumed, write_consumed, request_count) =
             self.limiters[ty as usize].get_statistics();
         GroupStatistics {
             version: self.version,
@@ -94,6 +124,7 @@
             total_wait_dur_us,
             read_consumed,
             write_consumed,
+            request_count,
         }
     }
 }
@@ -104,15 +135,24 @@ pub(crate) struct QuotaLimiter {
     total_wait_dur_us: AtomicU64,
     read_bytes: AtomicU64,
     write_bytes: AtomicU64,
+    req_count: AtomicU64,
 }
 
 impl QuotaLimiter {
     fn new(limit: f64) -> Self {
         Self {
-            limiter: Limiter::new(limit),
+            // We use a 1s refill period and a 1ms min_wait duration to avoid
+            // triggering waits too frequently or waiting for too long at once.
+            // NOTE: the `refill` parameter mainly impacts the capacity of the
+            // token bucket, not the refill interval.
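+            // Rough example (illustrative numbers, not exact token-bucket
+            // math): with a limit of 4_000_000 (4 cpu-seconds per second,
+            // in us), a burst consuming ~2_000_000 us over the available
+            // budget translates into a wait of roughly 0.5s, while the 1ms
+            // min_wait avoids scheduling sub-millisecond sleeps (see the
+            // comment above).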
+ limiter: Limiter::builder(limit) + .refill(Duration::from_millis(1000)) + .min_wait(Duration::from_millis(1)) + .build(), total_wait_dur_us: AtomicU64::new(0), read_bytes: AtomicU64::new(0), write_bytes: AtomicU64::new(0), + req_count: AtomicU64::new(0), } } @@ -128,40 +168,47 @@ impl QuotaLimiter { self.limiter.set_speed_limit(limit); } - fn get_statistics(&self) -> (u64, u64, u64, u64) { + fn get_statistics(&self) -> (u64, u64, u64, u64, u64) { ( self.limiter.total_bytes_consumed() as u64, self.total_wait_dur_us.load(Ordering::Relaxed), self.read_bytes.load(Ordering::Relaxed), self.write_bytes.load(Ordering::Relaxed), + self.req_count.load(Ordering::Relaxed), ) } - fn consume(&self, value: u64) -> Duration { - if value == 0 { + fn consume(&self, value: u64, wait: bool) -> Duration { + if value == 0 && self.limiter.speed_limit().is_infinite() { return Duration::ZERO; } - let dur = self.limiter.consume_duration(value as usize); - if dur != Duration::ZERO { + let mut dur = self.limiter.consume_duration(value as usize); + if !wait { + dur = Duration::ZERO; + } else if dur != Duration::ZERO { self.total_wait_dur_us .fetch_add(dur.as_micros() as u64, Ordering::Relaxed); } + self.req_count.fetch_add(1, Ordering::Relaxed); dur } - fn consume_io(&self, value: IoBytes) -> Duration { + fn consume_io(&self, value: IoBytes, wait: bool) -> Duration { self.read_bytes.fetch_add(value.read, Ordering::Relaxed); self.write_bytes.fetch_add(value.write, Ordering::Relaxed); let value = value.read + value.write; - if value == 0 { + if value == 0 && self.limiter.speed_limit().is_infinite() { return Duration::ZERO; } - let dur = self.limiter.consume_duration(value as usize); - if dur != Duration::ZERO { + let mut dur = self.limiter.consume_duration(value as usize); + if !wait { + dur = Duration::ZERO; + } else if dur != Duration::ZERO { self.total_wait_dur_us .fetch_add(dur.as_micros() as u64, Ordering::Relaxed); } + self.req_count.fetch_add(1, Ordering::Relaxed); dur } } @@ -173,6 +220,7 @@ pub struct GroupStatistics { pub total_wait_dur_us: u64, pub read_consumed: u64, pub write_consumed: u64, + pub request_count: u64, } impl std::ops::Sub for GroupStatistics { @@ -184,6 +232,7 @@ impl std::ops::Sub for GroupStatistics { total_wait_dur_us: self.total_wait_dur_us.saturating_sub(rhs.total_wait_dur_us), read_consumed: self.read_consumed.saturating_sub(rhs.read_consumed), write_consumed: self.write_consumed.saturating_sub(rhs.write_consumed), + request_count: self.request_count.saturating_sub(rhs.request_count), } } } @@ -198,6 +247,7 @@ impl std::ops::Div for GroupStatistics { total_wait_dur_us: (self.total_wait_dur_us as f64 / rhs) as u64, read_consumed: (self.read_consumed as f64 / rhs) as u64, write_consumed: (self.write_consumed as f64 / rhs) as u64, + request_count: (self.request_count as f64 / rhs) as u64, } } } diff --git a/components/resource_control/src/service.rs b/components/resource_control/src/service.rs index 5ecac9d74c4..26652cda00e 100644 --- a/components/resource_control/src/service.rs +++ b/components/resource_control/src/service.rs @@ -565,13 +565,17 @@ pub mod tests { s_clone.report_ru_metrics().await; }); // Mock consume. - let bg_limiter = s.manager.get_resource_limiter("background", "br").unwrap(); + let bg_limiter = s + .manager + .get_background_resource_limiter("background", "br") + .unwrap(); bg_limiter.consume( Duration::from_secs(2), IoBytes { read: 1000, write: 1000, }, + true, ); // Wait for report ru metrics. 
 std::thread::sleep(Duration::from_millis(100));
@@ -584,7 +588,7 @@
         s.manager.add_resource_group(background_group);
         let new_bg_limiter = s
             .manager
-            .get_resource_limiter("background", "lightning")
+            .get_background_resource_limiter("background", "lightning")
             .unwrap();
         new_bg_limiter.consume(
             Duration::from_secs(5),
@@ -592,6 +596,7 @@
                 read: 2000,
                 write: 2000,
             },
+            true,
         );
         // Wait for report ru metrics.
         std::thread::sleep(Duration::from_millis(100));
diff --git a/components/resource_control/src/worker.rs b/components/resource_control/src/worker.rs
index deb1b2e44de..007b2e27c47 100644
--- a/components/resource_control/src/worker.rs
+++ b/components/resource_control/src/worker.rs
@@ -9,16 +9,19 @@ use std::{
 };
 
 use file_system::{fetch_io_bytes, IoBytes, IoType};
-use strum::EnumCount;
+use prometheus::Histogram;
+use strum::{EnumCount, IntoEnumIterator};
 use tikv_util::{
+    debug,
     sys::{cpu_time::ProcessStat, SysQuota},
     time::Instant,
     warn,
+    yatp_pool::metrics::YATP_POOL_SCHEDULE_WAIT_DURATION_VEC,
 };
 
 use crate::{
     metrics::*,
-    resource_group::ResourceGroupManager,
+    resource_group::{ResourceGroupManager, TaskPriority},
     resource_limiter::{GroupStatistics, ResourceLimiter, ResourceType},
 };
 
@@ -39,7 +42,7 @@ pub struct SysQuotaGetter {
     process_stat: ProcessStat,
     prev_io_stats: [IoBytes; IoType::COUNT],
     prev_io_ts: Instant,
-    io_bandwidth: u64,
+    io_bandwidth: f64,
 }
 
 impl ResourceStatsProvider for SysQuotaGetter {
@@ -55,7 +58,7 @@
             }
             ResourceType::Io => {
                 let mut stats = ResourceUsageStats {
-                    total_quota: self.io_bandwidth as f64,
+                    total_quota: self.io_bandwidth,
                     current_used: 0.0,
                 };
                 let now = Instant::now_coarse();
@@ -97,7 +100,7 @@ impl GroupQuotaAdjustWorker {
             process_stat: ProcessStat::cur_proc_stat().unwrap(),
             prev_io_stats: [IoBytes::default(); IoType::COUNT],
             prev_io_ts: Instant::now_coarse(),
-            io_bandwidth,
+            io_bandwidth: io_bandwidth as f64,
         };
         Self::with_quota_getter(resource_ctl, resource_quota_getter)
     }
@@ -203,6 +206,12 @@
             BACKGROUND_RESOURCE_CONSUMPTION
                 .with_label_values(&[&g.name, resource_type.as_str()])
                 .inc_by(stats_delta.total_consumed);
+            if resource_type == ResourceType::Cpu {
+                BACKGROUND_TASKS_WAIT_DURATION
+                    .with_label_values(&[&g.name])
+                    .inc_by(stats_delta.total_wait_dur_us);
+            }
+
             let stats_per_sec = stats_delta / dur_secs;
             background_consumed_total += stats_per_sec.total_consumed as f64;
             g.stats_per_sec = stats_per_sec;
@@ -295,6 +304,240 @@ struct GroupStats {
     expect_cost_rate: f64,
 }
 
+/// PriorityLimiterAdjustWorker automatically adjusts the quota of each
+/// priority limiter based on the statistics collected over a certain period
+/// of time. In general, callers should invoke `adjust` at a fixed interval.
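+///
+/// A minimal driving loop might look like the following (sketch only: the
+/// dedicated thread and the 10s interval are illustrative; TiKV schedules
+/// this through its background worker instead):
+///
+/// ```ignore
+/// let mut worker = PriorityLimiterAdjustWorker::new(resource_ctl.clone());
+/// std::thread::spawn(move || loop {
+///     std::thread::sleep(std::time::Duration::from_secs(10));
+///     worker.adjust();
+/// });
+/// ```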
+pub struct PriorityLimiterAdjustWorker<R> {
+    resource_ctl: Arc<ResourceGroupManager>,
+    trackers: [PriorityLimiterStatsTracker; 3],
+    resource_quota_getter: R,
+    last_adjust_time: Instant,
+    is_last_low_cpu: bool,
+    is_last_single_group: bool,
+}
+
+impl PriorityLimiterAdjustWorker<SysQuotaGetter> {
+    pub fn new(resource_ctl: Arc<ResourceGroupManager>) -> Self {
+        let resource_quota_getter = SysQuotaGetter {
+            process_stat: ProcessStat::cur_proc_stat().unwrap(),
+            prev_io_stats: [IoBytes::default(); IoType::COUNT],
+            prev_io_ts: Instant::now_coarse(),
+            io_bandwidth: f64::INFINITY,
+        };
+        Self::with_quota_getter(resource_ctl, resource_quota_getter)
+    }
+}
+
+impl<R: ResourceStatsProvider> PriorityLimiterAdjustWorker<R> {
+    fn with_quota_getter(
+        resource_ctl: Arc<ResourceGroupManager>,
+        resource_quota_getter: R,
+    ) -> Self {
+        let priorities: [_; 3] = TaskPriority::iter().collect::<Vec<_>>().try_into().unwrap();
+        let trackers = resource_ctl
+            .get_priority_resource_limiters()
+            .zip(priorities)
+            .map(|(l, p)| PriorityLimiterStatsTracker::new(l, p.as_str()));
+        Self {
+            resource_ctl,
+            trackers,
+            resource_quota_getter,
+            last_adjust_time: Instant::now_coarse(),
+            is_last_low_cpu: true,
+            is_last_single_group: true,
+        }
+    }
+
+    pub fn adjust(&mut self) {
+        let now = Instant::now_coarse();
+        let dur = now.saturating_duration_since(self.last_adjust_time);
+        if dur < Duration::from_secs(1) {
+            warn!("adjust duration too small, skip adjustment."; "dur" => ?dur);
+            return;
+        }
+        self.last_adjust_time = now;
+
+        // Fast path: only the default resource group exists, meaning resource
+        // control is not used at all.
+        let group_count = self.resource_ctl.get_group_count();
+        if group_count == 1 {
+            if self.is_last_single_group {
+                return;
+            }
+            self.is_last_single_group = true;
+            self.trackers.iter().skip(1).for_each(|t| {
+                t.limiter
+                    .get_limiter(ResourceType::Cpu)
+                    .set_rate_limit(f64::INFINITY)
+            });
+            return;
+        }
+        self.is_last_single_group = false;
+
+        let stats: [_; 3] =
+            std::array::from_fn(|i| self.trackers[i].get_and_update_last_stats(dur.as_secs_f64()));
+
+        let process_cpu_stats = match self
+            .resource_quota_getter
+            .get_current_stats(ResourceType::Cpu)
+        {
+            Ok(s) => s,
+            Err(e) => {
+                warn!("get process total cpu failed; skip adjustment."; "err" => ?e);
+                return;
+            }
+        };
+
+        if process_cpu_stats.current_used < process_cpu_stats.total_quota * 0.5 {
+            if self.is_last_low_cpu {
+                return;
+            }
+            self.is_last_low_cpu = true;
+            self.trackers.iter().skip(1).for_each(|t| {
+                t.limiter
+                    .get_limiter(ResourceType::Cpu)
+                    .set_rate_limit(f64::INFINITY);
+                // 0 represents infinity.
+                PRIORITY_QUOTA_LIMIT_VEC
+                    .get_metric_with_label_values(&[t.priority])
+                    .unwrap()
+                    .set(0);
+            });
+            return;
+        }
+        self.is_last_low_cpu = false;
+
+        let total_reqs: u64 = stats.iter().map(|s| s.req_count).sum();
+        let max_reqs = stats.iter().map(|s| s.req_count).max().unwrap();
+        // There is effectively only one active priority; do not restrict it.
+        if total_reqs * 99 / 100 <= max_reqs {
+            self.trackers
+                .iter()
+                .skip(1)
+                .for_each(|t: &PriorityLimiterStatsTracker| {
+                    t.limiter
+                        .get_limiter(ResourceType::Cpu)
+                        .set_rate_limit(f64::INFINITY)
+                });
+            return;
+        }
+
+        let real_cpu_total: f64 = stats.iter().map(|s| s.cpu_secs).sum();
+        let expect_pool_cpu_total = real_cpu_total * (process_cpu_stats.total_quota * 0.95)
+            / process_cpu_stats.current_used;
+        let mut limits = [0.0; 2];
+        let level_expected: [_; 3] =
+            std::array::from_fn(|i| stats[i].cpu_secs + stats[i].wait_secs);
+        // Subtract the cpu time used by the high priority level.
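+        // Worked example (hypothetical numbers): with expect_pool_cpu_total
+        // = 8 cpu-secs/sec and level_expected = [3.0, 4.0, 2.0], medium is
+        // limited to 8 - 3 = 5 cpu-secs and low to 5 - 4 = 1 cpu-sec, each
+        // floored at the minimal quota reserved below.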
+        let mut expect_cpu_time_total = expect_pool_cpu_total - level_expected[0];
+
+        // Still reserve a minimal cpu quota.
+        let minimal_quota = process_cpu_stats.total_quota / MICROS_PER_SEC * 0.05;
+        for i in 1..self.trackers.len() {
+            if expect_cpu_time_total < minimal_quota {
+                expect_cpu_time_total = minimal_quota;
+            }
+            let limit = expect_cpu_time_total * MICROS_PER_SEC;
+            self.trackers[i]
+                .limiter
+                .get_limiter(ResourceType::Cpu)
+                .set_rate_limit(limit);
+            PRIORITY_QUOTA_LIMIT_VEC
+                .get_metric_with_label_values(&[self.trackers[i].priority])
+                .unwrap()
+                .set(limit as i64);
+            limits[i - 1] = limit;
+            expect_cpu_time_total -= level_expected[i];
+        }
+        debug!("adjust cpu limiter by priority"; "cpu_quota" => process_cpu_stats.total_quota, "process_cpu" => process_cpu_stats.current_used, "expected_cpu" => ?level_expected,
+            "limits" => ?limits, "limit_cpu_total" => expect_pool_cpu_total, "pool_cpu_cost" => real_cpu_total);
+    }
+}
+
+#[derive(Debug)]
+struct LimiterStats {
+    // Cpu seconds consumed through the QuotaLimiter in total.
+    cpu_secs: f64,
+    // Seconds waited in the QuotaLimiter in total.
+    wait_secs: f64,
+    // The total number of tasks that are scheduled.
+    req_count: u64,
+}
+
+struct HistogramTracker {
+    metrics: Histogram,
+    last_sum: f64,
+    last_count: u64,
+}
+
+impl HistogramTracker {
+    fn new(metrics: Histogram) -> Self {
+        let last_sum = metrics.get_sample_sum();
+        let last_count = metrics.get_sample_count();
+        Self {
+            metrics,
+            last_sum,
+            last_count,
+        }
+    }
+
+    fn get_and_update_statistics(&mut self) -> (f64, u64) {
+        let cur_sum = self.metrics.get_sample_sum();
+        let cur_count = self.metrics.get_sample_count();
+        let res = (cur_sum - self.last_sum, cur_count - self.last_count);
+        self.last_sum = cur_sum;
+        self.last_count = cur_count;
+        res
+    }
+}
+
+struct PriorityLimiterStatsTracker {
+    priority: &'static str,
+    limiter: Arc<ResourceLimiter>,
+    last_stats: GroupStatistics,
+    // unified-read-pool and schedule-worker-pool wait duration metrics.
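+    // (These trackers are drained once per `adjust` round in
+    // `get_and_update_last_stats` and folded into `LimiterStats::wait_secs`
+    // together with the limiter's own wait time.)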
+    task_wait_dur_trackers: [HistogramTracker; 2],
+}
+
+impl PriorityLimiterStatsTracker {
+    fn new(limiter: Arc<ResourceLimiter>, priority: &'static str) -> Self {
+        let task_wait_dur_trackers =
+            ["unified-read-pool", "sched-worker-priority"].map(|pool_name| {
+                HistogramTracker::new(
+                    YATP_POOL_SCHEDULE_WAIT_DURATION_VEC
+                        .get_metric_with_label_values(&[pool_name, priority])
+                        .unwrap(),
+                )
+            });
+        let last_stats = limiter.get_limit_statistics(ResourceType::Cpu);
+        Self {
+            priority,
+            limiter,
+            last_stats,
+            task_wait_dur_trackers,
+        }
+    }
+
+    fn get_and_update_last_stats(&mut self, dur_secs: f64) -> LimiterStats {
+        let cur_stats = self.limiter.get_limit_statistics(ResourceType::Cpu);
+        let stats_delta = cur_stats - self.last_stats;
+        self.last_stats = cur_stats;
+        PRIORITY_CPU_TIME_VEC
+            .with_label_values(&[self.priority])
+            .inc_by(stats_delta.total_consumed);
+        let stats_per_sec = stats_delta / dur_secs;
+
+        let wait_stats: [_; 2] =
+            std::array::from_fn(|i| self.task_wait_dur_trackers[i].get_and_update_statistics());
+        let schedule_wait_dur_secs = wait_stats.iter().map(|s| s.0).sum::<f64>() / dur_secs;
+        LimiterStats {
+            cpu_secs: stats_per_sec.total_consumed as f64 / MICROS_PER_SEC,
+            wait_secs: stats_per_sec.total_wait_dur_us as f64 / MICROS_PER_SEC
+                + schedule_wait_dur_secs,
+            req_count: stats_per_sec.request_count,
+        }
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use std::time::Duration;
@@ -340,7 +583,11 @@
         let resource_ctl = Arc::new(ResourceGroupManager::default());
         let rg1 = new_resource_group_ru("test".into(), 1000, 14);
         resource_ctl.add_resource_group(rg1);
-        assert!(resource_ctl.get_resource_limiter("test", "br").is_none());
+        assert!(
+            resource_ctl
+                .get_background_resource_limiter("test", "br")
+                .is_none()
+        );
 
         let test_provider = TestResourceStatsProvider::new(8.0, 10000.0);
         let mut worker =
@@ -351,10 +598,12 @@
         resource_ctl.add_resource_group(default_bg);
         assert!(
             resource_ctl
-                .get_resource_limiter("default", "lightning")
+                .get_background_resource_limiter("default", "lightning")
                 .is_none()
         );
-        let limiter = resource_ctl.get_resource_limiter("default", "br").unwrap();
+        let limiter = resource_ctl
+            .get_background_resource_limiter("default", "br")
+            .unwrap();
         assert!(
             limiter
                 .get_limiter(ResourceType::Cpu)
@@ -391,6 +640,7 @@
             worker.last_adjust_time = now - dur;
         };
 
+        #[track_caller]
         fn check(val: f64, expected: f64) {
             assert!(
                 expected * 0.99 < val && val < expected * 1.01,
@@ -400,6 +650,7 @@
             );
         }
 
+        #[track_caller]
         fn check_limiter(limiter: &Arc<ResourceLimiter>, cpu: f64, io: IoBytes) {
             check(
                 limiter.get_limiter(ResourceType::Cpu).get_rate_limit(),
@@ -452,6 +703,7 @@
                 read: 1000,
                 write: 1000,
             },
+            true,
         );
         worker.adjust_quota();
         check_limiter(
@@ -481,6 +733,7 @@
                 read: 1000,
                 write: 1000,
             },
+            true,
        );
        worker.adjust_quota();
        check_limiter(
@@ -499,6 +752,7 @@
                 read: 5000,
                 write: 5000,
             },
+            true,
        );
        worker.adjust_quota();
        check_limiter(
@@ -513,13 +767,15 @@
         let default =
             new_background_resource_group_ru("default".into(), 2000, 8, vec!["br".into()]);
         resource_ctl.add_resource_group(default);
-        let new_limiter = resource_ctl.get_resource_limiter("default", "br").unwrap();
+        let new_limiter = resource_ctl
+            .get_background_resource_limiter("default", "br")
+            .unwrap();
         assert_eq!(&*new_limiter as *const _, &*limiter as *const _);
 
         let bg =
             new_background_resource_group_ru("background".into(), 1000, 15, vec!["br".into()]);
         resource_ctl.add_resource_group(bg);
         let bg_limiter = resource_ctl
-            .get_resource_limiter("background", "br")
+            .get_background_resource_limiter("background", "br")
             .unwrap();
 
         reset_quota(&mut worker, 5.0, 7000.0, Duration::from_secs(1));
@@ -548,6 +804,7 @@
                 read: 600,
                 write: 600,
             },
+            true,
         );
         bg_limiter.consume(
             Duration::from_millis(1800),
@@ -555,22 +812,23 @@
                 read: 900,
                 write: 900,
             },
+            true,
         );
         worker.adjust_quota();
         check_limiter(
             &limiter,
-            2.4,
+            1.2,
             IoBytes {
-                read: 1400,
-                write: 1400,
+                read: 1800,
+                write: 1800,
             },
         );
         check_limiter(
             &bg_limiter,
-            1.6,
+            2.8,
             IoBytes {
-                read: 1800,
-                write: 1800,
+                read: 1400,
+                write: 1400,
             },
         );
 
@@ -581,7 +839,7 @@
             new_background_resource_group_ru("background".into(), 1000, 15, vec!["br".into()]);
         resource_ctl.add_resource_group(new_bg);
         let new_bg_limiter = resource_ctl
-            .get_resource_limiter("background", "br")
+            .get_background_resource_limiter("background", "br")
             .unwrap();
         assert_ne!(&*bg_limiter as *const _, &*new_bg_limiter as *const _);
         assert!(
@@ -623,6 +881,7 @@
                 read: 600,
                 write: 600,
             },
+            true,
         );
         new_bg_limiter.consume(
             Duration::from_millis(1800),
@@ -630,24 +889,139 @@
                 read: 900,
                 write: 900,
             },
+            true,
         );
         worker.adjust_quota();
         check_limiter(
             &limiter,
-            2.4,
+            2.2,
             IoBytes {
-                read: 1400,
-                write: 1400,
+                read: 2133,
+                write: 2133,
             },
         );
         check_limiter(
             &new_bg_limiter,
-            1.6,
+            1.8,
             IoBytes {
-                read: 1800,
-                write: 1800,
+                read: 1066,
+                write: 1066,
             },
         );
     }
+
+    #[test]
+    fn test_adjust_priority_resource_limiter() {
+        let resource_ctl = Arc::new(ResourceGroupManager::default());
+        let priority_limiters = resource_ctl.get_priority_resource_limiters();
+        let test_provider = TestResourceStatsProvider::new(8.0, f64::INFINITY);
+        let mut worker =
+            PriorityLimiterAdjustWorker::with_quota_getter(resource_ctl.clone(), test_provider);
+
+        let reset_quota = |worker: &mut PriorityLimiterAdjustWorker<TestResourceStatsProvider>,
+                           cpu: f64| {
+            worker.resource_quota_getter.cpu_used = cpu;
+            worker.last_adjust_time = Instant::now_coarse() - Duration::from_secs(10);
+            priority_limiters[1]
+                .get_limiter(ResourceType::Cpu)
+                .set_rate_limit(f64::INFINITY);
+            priority_limiters[2]
+                .get_limiter(ResourceType::Cpu)
+                .set_rate_limit(f64::INFINITY);
+        };
+
+        fn check(val: f64, expected: f64) {
+            assert!(
+                (val.is_infinite() && expected.is_infinite())
+                    || (expected * 0.99 < val && val < expected * 1.01),
+                "actual: {}, expected: {}",
+                val,
+                expected
+            );
+        }
+
+        let check_limiter = |high: f64, medium: f64, low: f64| {
+            check(
+                priority_limiters[0]
+                    .get_limiter(ResourceType::Cpu)
+                    .get_rate_limit(),
+                high * MICROS_PER_SEC,
+            );
+            check(
+                priority_limiters[1]
+                    .get_limiter(ResourceType::Cpu)
+                    .get_rate_limit(),
+                medium * MICROS_PER_SEC,
+            );
+            check(
+                priority_limiters[2]
+                    .get_limiter(ResourceType::Cpu)
+                    .get_rate_limit(),
+                low * MICROS_PER_SEC,
+            );
+        };
+
+        // Only the default group exists, so the limiters always stay at
+        // infinity.
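+        // (Scenario sketch: consumption is recorded below, but `adjust`
+        // takes the single-group fast path and never restricts anything.)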
+ reset_quota(&mut worker, 6.4); + priority_limiters[1].consume(Duration::from_secs(50), IoBytes::default(), true); + worker.adjust(); + check_limiter(f64::INFINITY, f64::INFINITY, f64::INFINITY); + + let rg1 = new_resource_group_ru("test_high".into(), 1000, 16); + resource_ctl.add_resource_group(rg1); + let rg2 = new_resource_group_ru("test_low".into(), 2000, 1); + resource_ctl.add_resource_group(rg2); + + reset_quota(&mut worker, 6.4); + priority_limiters[1].consume(Duration::from_secs(64), IoBytes::default(), true); + worker.adjust(); + check_limiter(f64::INFINITY, f64::INFINITY, f64::INFINITY); + + reset_quota(&mut worker, 6.4); + for _i in 0..100 { + priority_limiters[0].consume(Duration::from_millis(240), IoBytes::default(), true); + priority_limiters[1].consume(Duration::from_millis(400), IoBytes::default(), true); + } + worker.adjust(); + check_limiter(f64::INFINITY, 5.2, 1.2); + + reset_quota(&mut worker, 6.4); + for _i in 0..100 { + priority_limiters[0].consume(Duration::from_millis(120), IoBytes::default(), true); + priority_limiters[1].consume(Duration::from_millis(200), IoBytes::default(), true); + } + worker.adjust(); + check_limiter(f64::INFINITY, 2.6, 0.6); + + reset_quota(&mut worker, 6.4); + for _i in 0..100 { + priority_limiters[2].consume(Duration::from_millis(200), IoBytes::default(), true); + } + worker.adjust(); + check_limiter(f64::INFINITY, f64::INFINITY, f64::INFINITY); + + reset_quota(&mut worker, 8.0); + for _i in 0..100 { + priority_limiters[0].consume(Duration::from_millis(240), IoBytes::default(), true); + priority_limiters[1].consume(Duration::from_millis(240), IoBytes::default(), true); + priority_limiters[2].consume(Duration::from_millis(320), IoBytes::default(), true); + } + worker.adjust(); + check_limiter(f64::INFINITY, 5.2, 2.8); + + reset_quota(&mut worker, 6.0); + for _i in 0..100 { + priority_limiters[0].consume(Duration::from_millis(240), IoBytes::default(), true); + priority_limiters[2].consume(Duration::from_millis(360), IoBytes::default(), true); + } + worker.adjust(); + check_limiter(f64::INFINITY, 5.2, 5.2); + + // duration too small, unchanged. + worker.resource_quota_getter.cpu_used = 6.0; + worker.last_adjust_time = Instant::now_coarse() - Duration::from_millis(500); + worker.adjust(); + check_limiter(f64::INFINITY, 5.2, 5.2); + } } diff --git a/components/resource_metering/src/lib.rs b/components/resource_metering/src/lib.rs index ba8e2174e19..2c699998d3d 100644 --- a/components/resource_metering/src/lib.rs +++ b/components/resource_metering/src/lib.rs @@ -33,6 +33,7 @@ pub use reporter::{ ConfigChangeNotifier as ReporterConfigChangeNotifier, Reporter, Task, }; use tikv_util::{ + memory::HeapSize, sys::thread, warn, worker::{Scheduler, Worker}, @@ -98,6 +99,12 @@ impl ResourceMeteringTag { } } +impl HeapSize for ResourceMeteringTag { + fn approximate_heap_size(&self) -> usize { + self.infos.approximate_mem_size() + } +} + /// An RAII implementation of a [ResourceMeteringTag]. When this structure is /// dropped (falls out of scope), the tag will be removed. You can also clean /// up other data here if necessary. 
@@ -319,6 +326,12 @@ impl TagInfos { } } +impl HeapSize for TagInfos { + fn approximate_heap_size(&self) -> usize { + self.key_ranges.approximate_heap_size() + self.extra_attachment.approximate_heap_size() + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/components/server/Cargo.toml b/components/server/Cargo.toml index 55da894c6e8..a1f08d92d4c 100644 --- a/components/server/Cargo.toml +++ b/components/server/Cargo.toml @@ -17,18 +17,10 @@ failpoints = ["tikv/failpoints"] cloud-aws = ["encryption_export/cloud-aws"] cloud-gcp = ["encryption_export/cloud-gcp"] cloud-azure = ["encryption_export/cloud-azure"] -test-engine-kv-rocksdb = [ - "tikv/test-engine-kv-rocksdb" -] -test-engine-raft-raft-engine = [ - "tikv/test-engine-raft-raft-engine" -] -test-engines-rocksdb = [ - "tikv/test-engines-rocksdb", -] -test-engines-panic = [ - "tikv/test-engines-panic", -] +test-engine-kv-rocksdb = ["tikv/test-engine-kv-rocksdb"] +test-engine-raft-raft-engine = ["tikv/test-engine-raft-raft-engine"] +test-engines-rocksdb = ["tikv/test-engines-rocksdb"] +test-engines-panic = ["tikv/test-engines-panic"] nortcheck = ["engine_rocks/nortcheck"] backup-stream-debug = ["backup-stream/backup-stream-debug"] @@ -51,7 +43,6 @@ engine_traits = { workspace = true } error_code = { workspace = true } fail = "0.5" file_system = { workspace = true } -fs2 = "0.4" futures = "0.3" grpcio = { workspace = true } grpcio-health = { workspace = true } @@ -59,7 +50,10 @@ hex = "0.4" keys = { workspace = true } kvproto = { workspace = true } libc = "0.2" -log = { version = "0.4", features = ["max_level_trace", "release_max_level_debug"] } +log = { version = "0.4", features = [ + "max_level_trace", + "release_max_level_debug", +] } log_wrappers = { workspace = true } pd_client = { workspace = true } prometheus = { version = "0.13", features = ["nightly"] } diff --git a/components/server/src/common.rs b/components/server/src/common.rs index 165a1c8509e..32dfb925366 100644 --- a/components/server/src/common.rs +++ b/components/server/src/common.rs @@ -214,8 +214,9 @@ impl TikvServerCore { } } - let disk_stats = fs2::statvfs(&self.config.storage.data_dir).unwrap(); - let mut capacity = disk_stats.total_space(); + let (disk_cap, disk_avail) = + disk::get_disk_space_stats(&self.config.storage.data_dir).unwrap(); + let mut capacity = disk_cap; if self.config.raft_store.capacity.0 > 0 { capacity = cmp::min(capacity, self.config.raft_store.capacity.0); } @@ -223,11 +224,7 @@ impl TikvServerCore { let kv_reserved_size = calculate_reserved_space(capacity, self.config.storage.reserve_space.0); disk::set_disk_reserved_space(kv_reserved_size); - reserve_physical_space( - &self.config.storage.data_dir, - disk_stats.available_space(), - kv_reserved_size, - ); + reserve_physical_space(&self.config.storage.data_dir, disk_avail, kv_reserved_size); let raft_data_dir = if self.config.raft_engine.enable { self.config.raft_engine.config().dir @@ -238,18 +235,13 @@ impl TikvServerCore { let separated_raft_mount_path = path_in_diff_mount_point(&self.config.storage.data_dir, &raft_data_dir); if separated_raft_mount_path { - let raft_disk_stats = fs2::statvfs(&raft_data_dir).unwrap(); + let (raft_disk_cap, raft_disk_avail) = + disk::get_disk_space_stats(&raft_data_dir).unwrap(); // reserve space for raft engine if raft engine is deployed separately - let raft_reserved_size = calculate_reserved_space( - raft_disk_stats.total_space(), - self.config.storage.reserve_raft_space.0, - ); + let raft_reserved_size = + calculate_reserved_space(raft_disk_cap, 
self.config.storage.reserve_raft_space.0);
             disk::set_raft_disk_reserved_space(raft_reserved_size);
-            reserve_physical_space(
-                &raft_data_dir,
-                raft_disk_stats.available_space(),
-                raft_reserved_size,
-            );
+            reserve_physical_space(&raft_data_dir, raft_disk_avail, raft_reserved_size);
         }
     }
@@ -762,7 +754,11 @@ impl ConfiguredRaftEngine for RocksEngine {
     fn register_config(&self, cfg_controller: &mut ConfigController) {
         cfg_controller.register(
             tikv::config::Module::Raftdb,
-            Box::new(DbConfigManger::new(self.clone(), DbType::Raft)),
+            Box::new(DbConfigManger::new(
+                cfg_controller.get_current().rocksdb,
+                self.clone(),
+                DbType::Raft,
+            )),
         );
     }
 }
@@ -862,3 +858,340 @@ impl EngineMetricsManager {
         }
     }
 }
+
+fn calculate_disk_usage(a: disk::DiskUsage, b: disk::DiskUsage) -> disk::DiskUsage {
+    match (a, b) {
+        (disk::DiskUsage::AlreadyFull, _) => disk::DiskUsage::AlreadyFull,
+        (_, disk::DiskUsage::AlreadyFull) => disk::DiskUsage::AlreadyFull,
+        (disk::DiskUsage::AlmostFull, _) => disk::DiskUsage::AlmostFull,
+        (_, disk::DiskUsage::AlmostFull) => disk::DiskUsage::AlmostFull,
+        (disk::DiskUsage::Normal, disk::DiskUsage::Normal) => disk::DiskUsage::Normal,
+    }
+}
+
+/// A checker to inspect the disk usage of the kv engine and the raft engine.
+/// The caller should invoke `inspect` periodically to refresh the disk usage
+/// status.
+#[derive(Clone)]
+pub struct DiskUsageChecker {
+    /// The path of kv engine.
+    kvdb_path: String,
+    /// The path of raft engine.
+    raft_path: String,
+    /// The path of the auxiliary directory of raft engine if specified.
+    raft_auxiliary_path: Option<String>,
+    /// Whether the main directory of raft engine is separated from kv engine.
+    separated_raft_mount_path: bool,
+    /// Whether the auxiliary directory of raft engine is separated from kv
+    /// engine.
+    separated_raft_auxiliary_mount_path: bool,
+    /// Whether the auxiliary directory of raft engine is separated from both
+    /// the main directory of raft engine and kv engine.
+    separated_raft_auxiliary_and_kvdb_mount_path: bool,
+    /// The threshold of disk usage of kv engine to trigger the almost full
+    /// status.
+    kvdb_almost_full_thd: u64,
+    /// The threshold of disk usage of raft engine to trigger the almost full
+    /// status.
+    raft_almost_full_thd: u64,
+    /// The specified disk capacity for the whole disk.
+    config_disk_capacity: u64,
+}
+
+impl DiskUsageChecker {
+    pub fn new(
+        kvdb_path: String,
+        raft_path: String,
+        raft_auxiliary_path: Option<String>,
+        separated_raft_mount_path: bool,
+        separated_raft_auxiliary_mount_path: bool,
+        separated_raft_auxiliary_and_kvdb_mount_path: bool,
+        kvdb_almost_full_thd: u64,
+        raft_almost_full_thd: u64,
+        config_disk_capacity: u64,
+    ) -> Self {
+        DiskUsageChecker {
+            kvdb_path,
+            raft_path,
+            raft_auxiliary_path,
+            separated_raft_mount_path,
+            separated_raft_auxiliary_mount_path,
+            separated_raft_auxiliary_and_kvdb_mount_path,
+            kvdb_almost_full_thd,
+            raft_almost_full_thd,
+            config_disk_capacity,
+        }
+    }
+
+    /// Inspect the disk usage of kv engine and raft engine.
+    /// The `kvdb_used_size` is the used size of kv engine, and the
+    /// `raft_used_size` is the used size of raft engine.
+    ///
+    /// Returns the disk usage status of the whole disk, kv engine and raft
+    /// engine, plus the whole disk capacity and available size.
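+    ///
+    /// For example (illustrative numbers): with `kvdb_almost_full_thd` of
+    /// 10 GiB, the derived already-full threshold is 5 GiB, so 7 GiB of
+    /// available space yields `AlmostFull` for the kv engine while 4 GiB
+    /// yields `AlreadyFull`.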
+ pub fn inspect( + &self, + kvdb_used_size: u64, + raft_used_size: u64, + ) -> ( + disk::DiskUsage, // whole disk status + disk::DiskUsage, // kvdb disk status + disk::DiskUsage, // raft disk status + u64, // whole capacity + u64, // whole available + ) { + // By default, the almost full threshold of kv engine is half of the + // configured value. + let kvdb_already_full_thd = self.kvdb_almost_full_thd / 2; + let raft_already_full_thd = self.raft_almost_full_thd / 2; + // Check the disk space of raft engine. + let raft_disk_status = { + if !self.separated_raft_mount_path || self.raft_almost_full_thd == 0 { + disk::DiskUsage::Normal + } else { + let (raft_disk_cap, raft_disk_avail) = match disk::get_disk_space_stats( + &self.raft_path, + ) { + Err(e) => { + error!( + "get disk stat for raft engine failed"; + "raft_engine_path" => &self.raft_path, + "err" => ?e + ); + return ( + disk::DiskUsage::Normal, + disk::DiskUsage::Normal, + disk::DiskUsage::Normal, + 0, + 0, + ); + } + Ok((cap, avail)) => { + if !self.separated_raft_auxiliary_mount_path { + // If the auxiliary directory of raft engine is not separated from + // kv engine, returns u64::MAX to indicate that the disk space of + // the raft engine should not be checked. + (std::u64::MAX, std::u64::MAX) + } else if self.separated_raft_auxiliary_and_kvdb_mount_path { + // If the auxiliary directory of raft engine is separated from kv + // engine and the main directory of + // raft engine, the disk space of + // the auxiliary directory should be + // checked. + assert!(self.raft_auxiliary_path.is_some()); + let (auxiliary_disk_cap, auxiliary_disk_avail) = + match disk::get_disk_space_stats( + self.raft_auxiliary_path.as_ref().unwrap(), + ) { + Err(e) => { + error!( + "get auxiliary disk stat for raft engine failed"; + "raft_engine_path" => self.raft_auxiliary_path.as_ref().unwrap(), + "err" => ?e + ); + (0_u64, 0_u64) + } + Ok((total, avail)) => (total, avail), + }; + (cap + auxiliary_disk_cap, avail + auxiliary_disk_avail) + } else { + (cap, avail) + } + } + }; + let raft_disk_available = cmp::min( + raft_disk_cap + .checked_sub(raft_used_size) + .unwrap_or_default(), + raft_disk_avail, + ); + if raft_disk_available <= raft_already_full_thd { + disk::DiskUsage::AlreadyFull + } else if raft_disk_available <= self.raft_almost_full_thd { + disk::DiskUsage::AlmostFull + } else { + disk::DiskUsage::Normal + } + } + }; + // Check the disk space of kv engine. 
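+        // (As with the raft check above, the available size used below is
+        // the minimum of the capacity-minus-used estimate and what the
+        // filesystem itself reports.)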
+ let (disk_cap, disk_avail) = match disk::get_disk_space_stats(&self.kvdb_path) { + Err(e) => { + error!( + "get disk stat for kv store failed"; + "kv_path" => &self.kvdb_path, + "err" => ?e + ); + return ( + disk::DiskUsage::Normal, + disk::DiskUsage::Normal, + disk::DiskUsage::Normal, + 0, + 0, + ); + } + Ok((total, avail)) => (total, avail), + }; + let capacity = if self.config_disk_capacity == 0 || disk_cap < self.config_disk_capacity { + disk_cap + } else { + self.config_disk_capacity + }; + let available = cmp::min( + capacity.checked_sub(kvdb_used_size).unwrap_or_default(), + disk_avail, + ); + let cur_kv_disk_status = if available <= kvdb_already_full_thd { + disk::DiskUsage::AlreadyFull + } else if available <= self.kvdb_almost_full_thd { + disk::DiskUsage::AlmostFull + } else { + disk::DiskUsage::Normal + }; + let cur_disk_status = calculate_disk_usage(raft_disk_status, cur_kv_disk_status); + ( + cur_disk_status, + cur_kv_disk_status, + raft_disk_status, + capacity, + available, + ) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_disk_usage_checker() { + let kvdb_path = "/tmp/tikv-kvdb".to_owned(); + let raft_path = "/tmp/tikv-raft".to_owned(); + let raft_spill_path = "/tmp/tikv-raft/spill".to_owned(); + + // Case 1: mock the kvdb and raft engine are not separated. + fail::cfg("mock_disk_space_stats", "return(10000,5000)").unwrap(); + let disk_usage_checker = DiskUsageChecker::new( + kvdb_path.clone(), + raft_path.clone(), + Some(raft_spill_path.clone()), + false, + true, + false, + 100, + 100, + 1000, + ); + let (disk_status, kvdb_status, raft_status, ..) = disk_usage_checker.inspect(4000, 1000); + assert_eq!(disk_status, disk::DiskUsage::AlreadyFull); + assert_eq!(kvdb_status, disk::DiskUsage::AlreadyFull); + assert_eq!(raft_status, disk::DiskUsage::Normal); + + let disk_usage_checker = DiskUsageChecker::new( + kvdb_path.clone(), + raft_path.clone(), + Some(raft_spill_path.clone()), + false, + true, + false, + 100, + 100, + 4100, + ); + let (disk_status, kvdb_status, raft_status, ..) = disk_usage_checker.inspect(4000, 1000); + assert_eq!(raft_status, disk::DiskUsage::Normal); + assert_eq!(kvdb_status, disk::DiskUsage::AlmostFull); + assert_eq!(disk_status, disk::DiskUsage::AlmostFull); + let (disk_status, kvdb_status, raft_status, ..) = disk_usage_checker.inspect(3999, 1000); + assert_eq!(raft_status, disk::DiskUsage::Normal); + assert_eq!(kvdb_status, disk::DiskUsage::Normal); + assert_eq!(disk_status, disk::DiskUsage::Normal); + fail::remove("mock_disk_space_stats"); + + // Case 2: mock the kvdb and raft engine are separated. + fail::cfg( + "mock_disk_space_stats", + "1*return(500,200)->1*return(5000,2000)->1*return(500,200)->1*return(5000,2000)->1*return(500,200)->1*return(5000,2000)", + ) + .unwrap(); + let disk_usage_checker = DiskUsageChecker::new( + kvdb_path.clone(), + raft_path.clone(), + Some(raft_spill_path.clone()), + true, + true, + false, + 100, + 100, + 6000, + ); + let (disk_status, kvdb_status, raft_status, ..) = disk_usage_checker.inspect(4000, 450); + assert_eq!(raft_status, disk::DiskUsage::AlreadyFull); + assert_eq!(kvdb_status, disk::DiskUsage::Normal); + assert_eq!(disk_status, disk::DiskUsage::AlreadyFull); + let (disk_status, kvdb_status, raft_status, ..) = disk_usage_checker.inspect(4000, 400); + assert_eq!(raft_status, disk::DiskUsage::AlmostFull); + assert_eq!(kvdb_status, disk::DiskUsage::Normal); + assert_eq!(disk_status, disk::DiskUsage::AlmostFull); + let (disk_status, kvdb_status, raft_status, ..) 
= disk_usage_checker.inspect(4000, 399); + assert_eq!(raft_status, disk::DiskUsage::Normal); + assert_eq!(kvdb_status, disk::DiskUsage::Normal); + assert_eq!(disk_status, disk::DiskUsage::Normal); + fail::remove("mock_disk_space_stats"); + + fail::cfg( + "mock_disk_space_stats", + "1*return(500,200)->1*return(5000,2000)->1*return(500,200)->1*return(5000,2000)->1*return(500,200)->1*return(5000,2000)", + ) + .unwrap(); + let disk_usage_checker = DiskUsageChecker::new( + kvdb_path.clone(), + raft_path.clone(), + Some(raft_spill_path.clone()), + true, + false, + false, + 100, + 100, + 6000, + ); + let (disk_status, kvdb_status, raft_status, ..) = disk_usage_checker.inspect(4000, 450); + assert_eq!(raft_status, disk::DiskUsage::Normal); + assert_eq!(kvdb_status, disk::DiskUsage::Normal); + assert_eq!(disk_status, disk::DiskUsage::Normal); + let (disk_status, kvdb_status, raft_status, ..) = disk_usage_checker.inspect(4000, 500); + assert_eq!(raft_status, disk::DiskUsage::Normal); + assert_eq!(kvdb_status, disk::DiskUsage::Normal); + assert_eq!(disk_status, disk::DiskUsage::Normal); + let (disk_status, kvdb_status, raft_status, ..) = disk_usage_checker.inspect(4900, 500); + assert_eq!(raft_status, disk::DiskUsage::Normal); + assert_eq!(kvdb_status, disk::DiskUsage::AlmostFull); + assert_eq!(disk_status, disk::DiskUsage::AlmostFull); + fail::remove("mock_disk_space_stats"); + + // Case 3: mock the kvdb and raft engine are separated and the auxiliary + // directory of raft engine is separated from the main directory of + // raft. + fail::cfg( + "mock_disk_space_stats", + "1*return(500,200)->1*return(100,20)->1*return(5000,2000)", + ) + .unwrap(); + let disk_usage_checker = DiskUsageChecker::new( + kvdb_path, + raft_path, + Some(raft_spill_path), + true, + true, + true, + 100, + 100, + 6000, + ); + let (disk_status, kvdb_status, raft_status, ..) = disk_usage_checker.inspect(4000, 450); + assert_eq!(raft_status, disk::DiskUsage::Normal); + assert_eq!(kvdb_status, disk::DiskUsage::Normal); + assert_eq!(disk_status, disk::DiskUsage::Normal); + fail::remove("mock_disk_space_stats"); + } +} diff --git a/components/server/src/memory.rs b/components/server/src/memory.rs index 303ff257a78..fadf18f7534 100644 --- a/components/server/src/memory.rs +++ b/components/server/src/memory.rs @@ -19,9 +19,24 @@ impl MemoryTraceManager { for id in ids { let sub_trace = provider.sub_trace(id); let sub_trace_name = sub_trace.name(); - MEM_TRACE_SUM_GAUGE - .with_label_values(&[&format!("{}-{}", provider_name, sub_trace_name)]) - .set(sub_trace.sum() as i64) + let leaf_ids = sub_trace.get_children_ids(); + if leaf_ids.is_empty() { + MEM_TRACE_SUM_GAUGE + .with_label_values(&[&format!("{}-{}", provider_name, sub_trace_name)]) + .set(sub_trace.sum() as i64); + } else { + for leaf_id in leaf_ids { + let leaf = sub_trace.sub_trace(leaf_id); + MEM_TRACE_SUM_GAUGE + .with_label_values(&[&format!( + "{}-{}-{}", + provider_name, + sub_trace_name, + leaf.name(), + )]) + .set(leaf.sum() as i64); + } + } } MEM_TRACE_SUM_GAUGE diff --git a/components/server/src/server.rs b/components/server/src/server.rs index 7ff51474d7d..54f159ed383 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -12,7 +12,6 @@ //! explicitly stopped. We keep these components in the `TikvServer` struct. 
use std::{ - cmp, collections::HashMap, convert::TryFrom, path::{Path, PathBuf}, @@ -28,7 +27,7 @@ use backup_stream::{ BackupStreamResolver, }; use causal_ts::CausalTsProviderImpl; -use cdc::{CdcConfigManager, MemoryQuota}; +use cdc::CdcConfigManager; use concurrency_manager::ConcurrencyManager; use engine_rocks::{from_rocks_compression_type, RocksEngine, RocksStatistics}; use engine_rocks_helper::sst_recovery::{RecoveryRunner, DEFAULT_CHECK_INTERVAL}; @@ -64,18 +63,21 @@ use raftstore::{ RaftBatchSystem, RaftRouter, StoreMeta, MULTI_FILES_SNAPSHOT_FEATURE, PENDING_MSG_CAP, }, memory::MEMTRACE_ROOT as MEMTRACE_RAFTSTORE, - AutoSplitController, CheckLeaderRunner, LocalReader, SnapManager, SnapManagerBuilder, - SplitCheckRunner, SplitConfigManager, StoreMetaDelegate, + snapshot_backup::PrepareDiskSnapObserver, + AutoSplitController, CheckLeaderRunner, DiskCheckRunner, LocalReader, SnapManager, + SnapManagerBuilder, SplitCheckRunner, SplitConfigManager, StoreMetaDelegate, }, RaftRouterCompactedEventSender, }; use resolved_ts::{LeadershipResolver, Task}; -use resource_control::ResourceGroupManager; +use resource_control::{priority_from_task_meta, ResourceGroupManager}; use security::SecurityManager; use service::{service_event::ServiceEvent, service_manager::GrpcServiceManager}; use snap_recovery::RecoveryService; use tikv::{ - config::{ConfigController, DbConfigManger, DbType, LogConfigManager, TikvConfig}, + config::{ + ConfigController, DbConfigManger, DbType, LogConfigManager, MemoryConfigManager, TikvConfig, + }, coprocessor::{self, MEMTRACE_ROOT as MEMTRACE_COPROCESSOR}, coprocessor_v2, import::{ImportSstService, SstImporter}, @@ -105,9 +107,11 @@ use tikv::{ Engine, Storage, }, }; +use tikv_alloc::{add_thread_memory_accessor, remove_thread_memory_accessor}; use tikv_util::{ check_environment_variables, config::VersionTrack, + memory::MemoryQuota, mpsc as TikvMpsc, quota_limiter::{QuotaLimitConfigManager, QuotaLimiter}, sys::{disk, path_in_diff_mount_point, register_memory_usage_high_water, SysQuota}, @@ -120,7 +124,10 @@ use tikv_util::{ use tokio::runtime::Builder; use crate::{ - common::{ConfiguredRaftEngine, EngineMetricsManager, EnginesResourceInfo, TikvServerCore}, + common::{ + ConfiguredRaftEngine, DiskUsageChecker, EngineMetricsManager, EnginesResourceInfo, + TikvServerCore, + }, memory::*, setup::*, signal_handler, @@ -252,6 +259,7 @@ struct TikvServer { br_snap_recovery_mode: bool, // use for br snapshot recovery resolved_ts_scheduler: Option>, grpc_service_mgr: GrpcServiceManager, + snap_br_rejector: Option>, } struct TikvEngines { @@ -266,7 +274,7 @@ struct Servers { node: Node, importer: Arc, cdc_scheduler: tikv_util::worker::Scheduler, - cdc_memory_quota: MemoryQuota, + cdc_memory_quota: Arc, rsmeter_pubsub_service: resource_metering::PubSubService, backup_stream_scheduler: Option>, debugger: DebuggerImpl>, LockManager, F>, @@ -289,10 +297,20 @@ where SecurityManager::new(&config.security) .unwrap_or_else(|e| fatal!("failed to create security manager: {}", e)), ); + let props = tikv_util::thread_group::current_properties(); let env = Arc::new( EnvBuilder::new() .cq_count(config.server.grpc_concurrency) .name_prefix(thd_name!(GRPC_THREAD_PREFIX)) + .after_start(move || { + tikv_util::thread_group::set_properties(props.clone()); + + // SAFETY: we will call `remove_thread_memory_accessor` at before_stop. 
+ unsafe { add_thread_memory_accessor() }; + }) + .before_stop(|| { + remove_thread_memory_accessor(); + }) .build(), ); let pd_client = TikvServerCore::connect_to_pd_cluster( @@ -357,6 +375,7 @@ where router.clone(), config.coprocessor.clone(), )); + let region_info_accessor = RegionInfoAccessor::new(coprocessor_host.as_mut().unwrap()); // Initialize concurrency manager @@ -430,6 +449,7 @@ where br_snap_recovery_mode: is_recovering_marked, resolved_ts_scheduler: None, grpc_service_mgr: GrpcServiceManager::new(tx), + snap_br_rejector: None, } } @@ -496,6 +516,7 @@ where ); cfg_controller.register(tikv::config::Module::Log, Box::new(LogConfigManager)); + cfg_controller.register(tikv::config::Module::Memory, Box::new(MemoryConfigManager)); // Create cdc. let mut cdc_worker = Box::new(LazyWorker::new("cdc")); @@ -545,6 +566,7 @@ where engines.engine.clone(), resource_ctl, CleanupMethod::Remote(self.core.background_worker.remote()), + Some(Arc::new(priority_from_task_meta)), )) } else { None @@ -750,6 +772,13 @@ where let server_config = Arc::new(VersionTrack::new(self.core.config.server.clone())); self.core.config.raft_store.optimize_for(false); + self.core + .config + .raft_store + .optimize_inspector(path_in_diff_mount_point( + engines.engines.raft.get_engine_path().to_string().as_str(), + engines.engines.kv.path(), + )); self.core .config .raft_store @@ -777,20 +806,25 @@ where .unwrap_or_else(|e| fatal!("failed to bootstrap node id: {}", e)); self.snap_mgr = Some(snap_mgr.clone()); + + // Create coprocessor endpoint. + let copr = coprocessor::Endpoint::new( + &server_config.value(), + cop_read_pool_handle, + self.concurrency_manager.clone(), + resource_tag_factory, + self.quota_limiter.clone(), + self.resource_manager.clone(), + ); + let copr_config_manager = copr.config_manager(); + // Create server let server = Server::new( node.id(), &server_config, &self.security_mgr, storage.clone(), - coprocessor::Endpoint::new( - &server_config.value(), - cop_read_pool_handle, - self.concurrency_manager.clone(), - resource_tag_factory, - self.quota_limiter.clone(), - self.resource_manager.clone(), - ), + copr, coprocessor_v2::Endpoint::new(&self.core.config.coprocessor_v2), self.resolver.clone().unwrap(), Either::Left(snap_mgr.clone()), @@ -809,9 +843,14 @@ where server.get_snap_worker_scheduler(), server_config.clone(), server.get_grpc_mem_quota().clone(), + copr_config_manager, )), ); + let rejector = Arc::new(PrepareDiskSnapObserver::default()); + rejector.register_to(self.coprocessor_host.as_mut().unwrap()); + self.snap_br_rejector = Some(rejector); + // Start backup stream let backup_stream_scheduler = if self.core.config.log_backup.enable { // Create backup stream. @@ -852,6 +891,7 @@ where pd_client::meta_storage::Source::LogBackup, ))), self.core.config.log_backup.clone(), + self.core.config.resolved_ts.clone(), backup_stream_scheduler.clone(), backup_stream_ob, self.region_info_accessor.clone(), @@ -942,6 +982,8 @@ where .registry .register_consistency_check_observer(100, observer); + let disk_check_runner = DiskCheckRunner::new(self.core.store_path.clone()); + node.start( engines.engines.clone(), server.transport(), @@ -955,6 +997,7 @@ where self.concurrency_manager.clone(), collector_reg_handle, self.causal_ts_provider.clone(), + disk_check_runner, self.grpc_service_mgr.clone(), safe_point.clone(), ) @@ -986,10 +1029,13 @@ where } // Start CDC. 
- let cdc_memory_quota = MemoryQuota::new(self.core.config.cdc.sink_memory_quota.0 as _); + let cdc_memory_quota = Arc::new(MemoryQuota::new( + self.core.config.cdc.sink_memory_quota.0 as _, + )); let cdc_endpoint = cdc::Endpoint::new( self.core.config.server.cluster_id, &self.core.config.cdc, + &self.core.config.resolved_ts, self.core.config.storage.engine == EngineType::RaftKv2, self.core.config.storage.api_version(), self.pd_client.clone(), @@ -1069,6 +1115,7 @@ where servers.importer.clone(), None, self.resource_manager.clone(), + Arc::new(self.region_info_accessor.clone()), ); let import_cfg_mgr = import_service.get_config_manager(); @@ -1154,16 +1201,6 @@ where // Backup service. let mut backup_worker = Box::new(self.core.background_worker.lazy_build("backup-endpoint")); let backup_scheduler = backup_worker.scheduler(); - let backup_service = - backup::Service::::with_router(backup_scheduler, self.router.clone()); - if servers - .server - .register_service(create_backup(backup_service)) - .is_some() - { - fatal!("failed to register backup service"); - } - let backup_endpoint = backup::Endpoint::new( servers.node.id(), engines.engine.clone(), @@ -1175,6 +1212,20 @@ where self.causal_ts_provider.clone(), self.resource_manager.clone(), ); + let env = backup::disk_snap::Env::new( + Arc::new(Mutex::new(self.router.clone())), + self.snap_br_rejector.take().unwrap(), + Some(backup_endpoint.io_pool_handle().clone()), + ); + let backup_service = backup::Service::new(backup_scheduler, env); + if servers + .server + .register_service(create_backup(backup_service)) + .is_some() + { + fatal!("failed to register backup service"); + } + self.cfg_controller.as_mut().unwrap().register( tikv::config::Module::Backup, Box::new(backup_endpoint.get_config_manager()), @@ -1288,77 +1339,53 @@ where let raft_path = engines.raft.get_engine_path().to_string(); let separated_raft_mount_path = path_in_diff_mount_point(raft_path.as_str(), engines.kv.path()); - let raft_almost_full_threshold = reserve_raft_space; - let raft_already_full_threshold = reserve_raft_space / 2; - - let almost_full_threshold = reserve_space; - let already_full_threshold = reserve_space / 2; - fn calculate_disk_usage(a: disk::DiskUsage, b: disk::DiskUsage) -> disk::DiskUsage { - match (a, b) { - (disk::DiskUsage::AlreadyFull, _) => disk::DiskUsage::AlreadyFull, - (_, disk::DiskUsage::AlreadyFull) => disk::DiskUsage::AlreadyFull, - (disk::DiskUsage::AlmostFull, _) => disk::DiskUsage::AlmostFull, - (_, disk::DiskUsage::AlmostFull) => disk::DiskUsage::AlmostFull, - (disk::DiskUsage::Normal, disk::DiskUsage::Normal) => disk::DiskUsage::Normal, - } - } + // If the auxiliary directory of raft engine is specified, it's needed to be + // checked. Otherwise, it's not needed to be checked. And as the configuration + // is static, it's safe to check it only once. 
+            let raft_auxiliary_path = if self.core.config.raft_engine.enable {
+                self.core.config.raft_engine.config().spill_dir
+            } else {
+                None
+            };
+            let (separated_raft_auxiliary_mount_path, separated_raft_auxiliary_with_kvdb) =
+                raft_auxiliary_path
+                    .as_ref()
+                    .map(|path| {
+                        let separated_with_kvdb =
+                            path_in_diff_mount_point(path.as_str(), engines.kv.path());
+                        let separated_with_raft =
+                            path_in_diff_mount_point(path.as_str(), raft_path.as_str());
+                        (
+                            separated_with_kvdb && separated_with_raft,
+                            separated_with_kvdb,
+                        )
+                    })
+                    .unwrap_or((false, false));
+            let disk_usage_checker = DiskUsageChecker::new(
+                store_path.as_path().to_str().unwrap().to_string(),
+                raft_path,
+                raft_auxiliary_path,
+                separated_raft_mount_path,
+                separated_raft_auxiliary_mount_path,
+                separated_raft_auxiliary_with_kvdb,
+                reserve_space,
+                reserve_raft_space,
+                config_disk_capacity,
+            );
         self.core.background_worker
             .spawn_interval_task(DEFAULT_STORAGE_STATS_INTERVAL, move || {
-                let disk_stats = match fs2::statvfs(&store_path) {
-                    Err(e) => {
-                        error!(
-                            "get disk stat for kv store failed";
-                            "kv_path" => store_path.to_str(),
-                            "err" => ?e
-                        );
-                        return;
-                    }
-                    Ok(stats) => stats,
-                };
-                let disk_cap = disk_stats.total_space();
                 let snap_size = snap_mgr.get_total_snap_size().unwrap();
-
                 let kv_size = engines
                     .kv
                     .get_engine_used_size()
                     .expect("get kv engine size");
-
                 let raft_size = engines
                     .raft
                     .get_engine_size()
                     .expect("get raft engine size");
-
-                let mut raft_disk_status = disk::DiskUsage::Normal;
-                if separated_raft_mount_path && reserve_raft_space != 0 {
-                    let raft_disk_stats = match fs2::statvfs(&raft_path) {
-                        Err(e) => {
-                            error!(
-                                "get disk stat for raft engine failed";
-                                "raft_engine_path" => raft_path.clone(),
-                                "err" => ?e
-                            );
-                            return;
-                        }
-                        Ok(stats) => stats,
-                    };
-                    let raft_disk_cap = raft_disk_stats.total_space();
-                    let mut raft_disk_available =
-                        raft_disk_cap.checked_sub(raft_size).unwrap_or_default();
-                    raft_disk_available = cmp::min(raft_disk_available, raft_disk_stats.available_space());
-                    raft_disk_status = if raft_disk_available <= raft_already_full_threshold
-                    {
-                        disk::DiskUsage::AlreadyFull
-                    } else if raft_disk_available <= raft_almost_full_threshold
-                    {
-                        disk::DiskUsage::AlmostFull
-                    } else {
-                        disk::DiskUsage::Normal
-                    };
-                }
                 let placeholer_file_path = PathBuf::from_str(&data_dir)
                     .unwrap()
                     .join(Path::new(file_system::SPACE_PLACEHOLDER_FILE));
-
                 let placeholder_size: u64 =
                     file_system::get_file_size(placeholer_file_path).unwrap_or(0);
- let cur_kv_disk_status = if available <= already_full_threshold { - disk::DiskUsage::AlreadyFull - } else if available <= almost_full_threshold { - disk::DiskUsage::AlmostFull - } else { - disk::DiskUsage::Normal - }; - let cur_disk_status = calculate_disk_usage(raft_disk_status, cur_kv_disk_status); if prev_disk_status != cur_disk_status { warn!( "disk usage {:?}->{:?} (raft engine usage: {:?}, kv engine usage: {:?}), seperated raft mount={}, kv available={}, snap={}, kv={}, raft={}, capacity={}", @@ -1461,8 +1473,18 @@ where } } + fn prepare_stop(&self) { + if let Some(engines) = self.engines.as_ref() { + // Disable manul compaction jobs before shutting down the engines. And it + // will stop the compaction thread in advance, so it won't block the + // cleanup thread when exiting. + let _ = engines.engines.kv.disable_manual_compaction(); + } + } + fn stop(self) { tikv_util::thread_group::mark_shutdown(); + self.prepare_stop(); let mut servers = self.servers.unwrap(); servers .server @@ -1549,7 +1571,11 @@ impl TikvServer { let cfg_controller = self.cfg_controller.as_mut().unwrap(); cfg_controller.register( tikv::config::Module::Rocksdb, - Box::new(DbConfigManger::new(kv_engine.clone(), DbType::Kv)), + Box::new(DbConfigManger::new( + cfg_controller.get_current().rocksdb, + kv_engine.clone(), + DbType::Kv, + )), ); let reg = TabletRegistry::new( Box::new(SingletonFactory::new(kv_engine)), diff --git a/components/server/src/server2.rs b/components/server/src/server2.rs index fe2b685313e..c6becff29c9 100644 --- a/components/server/src/server2.rs +++ b/components/server/src/server2.rs @@ -12,7 +12,6 @@ //! explicitly stopped. We keep these components in the `TikvServer` struct. use std::{ - cmp, collections::HashMap, marker::PhantomData, path::{Path, PathBuf}, @@ -27,12 +26,13 @@ use std::{ }; use api_version::{dispatch_api_version, KvFormat}; +use backup::disk_snap::Env; use backup_stream::{ config::BackupStreamConfigManager, metadata::store::PdStore, observer::BackupStreamObserver, BackupStreamResolver, }; use causal_ts::CausalTsProviderImpl; -use cdc::{CdcConfigManager, MemoryQuota}; +use cdc::CdcConfigManager; use concurrency_manager::ConcurrencyManager; use engine_rocks::{from_rocks_compression_type, RocksEngine, RocksStatistics}; use engine_traits::{Engines, KvEngine, MiscExt, RaftEngine, TabletRegistry, CF_DEFAULT, CF_WRITE}; @@ -63,17 +63,17 @@ use raftstore::{ RegionInfoAccessor, }; use raftstore_v2::{ - router::{PeerMsg, RaftRouter}, + router::{DiskSnapBackupHandle, PeerMsg, RaftRouter}, StateStorage, }; use resolved_ts::Task; -use resource_control::ResourceGroupManager; +use resource_control::{priority_from_task_meta, ResourceGroupManager}; use security::SecurityManager; use service::{service_event::ServiceEvent, service_manager::GrpcServiceManager}; use tikv::{ config::{ loop_registry, ConfigController, ConfigurableDb, DbConfigManger, DbType, LogConfigManager, - TikvConfig, + MemoryConfigManager, TikvConfig, }, coprocessor::{self, MEMTRACE_ROOT as MEMTRACE_COPROCESSOR}, coprocessor_v2, @@ -103,9 +103,11 @@ use tikv::{ Engine, Storage, }, }; +use tikv_alloc::{add_thread_memory_accessor, remove_thread_memory_accessor}; use tikv_util::{ check_environment_variables, config::VersionTrack, + memory::MemoryQuota, mpsc as TikvMpsc, quota_limiter::{QuotaLimitConfigManager, QuotaLimiter}, sys::{disk, path_in_diff_mount_point, register_memory_usage_high_water, SysQuota}, @@ -118,7 +120,10 @@ use tikv_util::{ use tokio::runtime::Builder; use crate::{ - common::{ConfiguredRaftEngine, 
EngineMetricsManager, EnginesResourceInfo, TikvServerCore}, + common::{ + ConfiguredRaftEngine, DiskUsageChecker, EngineMetricsManager, EnginesResourceInfo, + TikvServerCore, + }, memory::*, setup::*, signal_handler, @@ -243,7 +248,7 @@ struct TikvServer { env: Arc, cdc_worker: Option>>, cdc_scheduler: Option>, - cdc_memory_quota: Option, + cdc_memory_quota: Option>, backup_stream_scheduler: Option>, sst_worker: Option>>, quota_limiter: Arc, @@ -284,10 +289,20 @@ where SecurityManager::new(&config.security) .unwrap_or_else(|e| fatal!("failed to create security manager: {}", e)), ); + let props = tikv_util::thread_group::current_properties(); let env = Arc::new( EnvBuilder::new() .cq_count(config.server.grpc_concurrency) .name_prefix(thd_name!(GRPC_THREAD_PREFIX)) + .after_start(move || { + tikv_util::thread_group::set_properties(props.clone()); + + // SAFETY: we will call `remove_thread_memory_accessor` at before_stop. + unsafe { add_thread_memory_accessor() }; + }) + .before_stop(|| { + remove_thread_memory_accessor(); + }) .build(), ); let pd_client = TikvServerCore::connect_to_pd_cluster( @@ -432,6 +447,7 @@ where ); cfg_controller.register(tikv::config::Module::Log, Box::new(LogConfigManager)); + cfg_controller.register(tikv::config::Module::Memory, Box::new(MemoryConfigManager)); let lock_mgr = LockManager::new(&self.core.config.pessimistic_txn); cfg_controller.register( @@ -459,6 +475,7 @@ where engines.engine.clone(), resource_ctl, CleanupMethod::Remote(self.core.background_worker.remote()), + Some(Arc::new(priority_from_task_meta)), )) } else { None @@ -637,10 +654,13 @@ where Box::new(CdcConfigManager(cdc_scheduler.clone())), ); // Start cdc endpoint. - let cdc_memory_quota = MemoryQuota::new(self.core.config.cdc.sink_memory_quota.0 as _); + let cdc_memory_quota = Arc::new(MemoryQuota::new( + self.core.config.cdc.sink_memory_quota.0 as _, + )); let cdc_endpoint = cdc::Endpoint::new( self.core.config.server.cluster_id, &self.core.config.cdc, + &self.core.config.resolved_ts, self.core.config.storage.engine == EngineType::RaftKv2, self.core.config.storage.api_version(), self.pd_client.clone(), @@ -712,6 +732,7 @@ where pd_client::meta_storage::Source::LogBackup, ))), self.core.config.log_backup.clone(), + self.core.config.resolved_ts.clone(), backup_stream_scheduler.clone(), backup_stream_ob, self.region_info_accessor.as_ref().unwrap().clone(), @@ -745,6 +766,17 @@ where let node = self.node.as_ref().unwrap(); + // Create coprocessor endpoint. + let copr = coprocessor::Endpoint::new( + &server_config.value(), + cop_read_pool_handle, + self.concurrency_manager.clone(), + resource_tag_factory, + self.quota_limiter.clone(), + self.resource_manager.clone(), + ); + let copr_config_manager = copr.config_manager(); + self.snap_mgr = Some(snap_mgr.clone()); // Create server let server = Server::new( @@ -752,14 +784,7 @@ where &server_config, &self.security_mgr, storage, - coprocessor::Endpoint::new( - &server_config.value(), - cop_read_pool_handle, - self.concurrency_manager.clone(), - resource_tag_factory, - self.quota_limiter.clone(), - self.resource_manager.clone(), - ), + copr, coprocessor_v2::Endpoint::new(&self.core.config.coprocessor_v2), self.resolver.clone().unwrap(), Either::Right(snap_mgr.clone()), @@ -778,6 +803,7 @@ where server.get_snap_worker_scheduler(), server_config.clone(), server.get_grpc_mem_quota().clone(), + copr_config_manager, )), ); @@ -910,7 +936,10 @@ where // Backup service. 
let mut backup_worker = Box::new(self.core.background_worker.lazy_build("backup-endpoint")); let backup_scheduler = backup_worker.scheduler(); - let backup_service = backup::Service::::new(backup_scheduler); + let backup_service = backup::Service::new( + backup_scheduler, + Env::new(DiskSnapBackupHandle, Default::default(), None), + ); if servers .server .register_service(create_backup(backup_service)) @@ -937,6 +966,7 @@ where backup_worker.start(backup_endpoint); // Import SST service. + let region_info_accessor = self.region_info_accessor.as_ref().unwrap().clone(); let import_service = ImportSstService::new( self.core.config.import.clone(), self.core.config.raft_store.raft_entry_max_size, @@ -945,6 +975,7 @@ where servers.importer.clone(), Some(self.router.as_ref().unwrap().store_meta().clone()), self.resource_manager.clone(), + Arc::new(region_info_accessor), ); let import_cfg_mgr = import_service.get_config_manager(); @@ -1130,36 +1161,42 @@ where let raft_path = raft_engine.get_engine_path().to_string(); let separated_raft_mount_path = path_in_diff_mount_point(raft_path.as_str(), tablet_registry.tablet_root()); - let raft_almost_full_threshold = reserve_raft_space; - let raft_already_full_threshold = reserve_raft_space / 2; - - let almost_full_threshold = reserve_space; - let already_full_threshold = reserve_space / 2; - fn calculate_disk_usage(a: disk::DiskUsage, b: disk::DiskUsage) -> disk::DiskUsage { - match (a, b) { - (disk::DiskUsage::AlreadyFull, _) => disk::DiskUsage::AlreadyFull, - (_, disk::DiskUsage::AlreadyFull) => disk::DiskUsage::AlreadyFull, - (disk::DiskUsage::AlmostFull, _) => disk::DiskUsage::AlmostFull, - (_, disk::DiskUsage::AlmostFull) => disk::DiskUsage::AlmostFull, - (disk::DiskUsage::Normal, disk::DiskUsage::Normal) => disk::DiskUsage::Normal, - } - } + // If the auxiliary (spill) directory of the raft engine is specified, it + // needs to be checked as well. As this configuration is static, it's safe + // to check it only once.
+ let raft_auxiliary_path = if self.core.config.raft_engine.enable { + self.core.config.raft_engine.config().spill_dir + } else { + None + }; + let (separated_raft_auxiliary_mount_path, separated_raft_auxiliary_with_kvdb) = + raft_auxiliary_path + .as_ref() + .map(|path| { + let separated_with_kvdb = + path_in_diff_mount_point(path.as_str(), tablet_registry.tablet_root()); + let separated_with_raft = + path_in_diff_mount_point(path.as_str(), raft_path.as_str()); + ( + separated_with_kvdb && separated_with_raft, + separated_with_kvdb, + ) + }) + .unwrap_or((false, false)); + let disk_usage_checker = DiskUsageChecker::new( + store_path.as_path().to_str().unwrap().to_string(), + raft_path, + raft_auxiliary_path, + separated_raft_mount_path, + separated_raft_auxiliary_mount_path, + separated_raft_auxiliary_with_kvdb, + reserve_space, + reserve_raft_space, + config_disk_capacity, + ); self.core.background_worker .spawn_interval_task(DEFAULT_STORAGE_STATS_INTERVAL, move || { - let disk_stats = match fs2::statvfs(&store_path) { - Err(e) => { - error!( - "get disk stat for kv store failed"; - "kv_path" => store_path.to_str(), - "err" => ?e - ); - return; - } - Ok(stats) => stats, - }; - let disk_cap = disk_stats.total_space(); let snap_size = snap_mgr.total_snap_size().unwrap(); - let mut kv_size = 0; tablet_registry.for_each_opened_tablet(|_, cached| { if let Some(tablet) = cached.latest() { @@ -1167,42 +1204,12 @@ where } true }); - let raft_size = raft_engine .get_engine_size() .expect("get raft engine size"); - - let mut raft_disk_status = disk::DiskUsage::Normal; - if separated_raft_mount_path && reserve_raft_space != 0 { - let raft_disk_stats = match fs2::statvfs(&raft_path) { - Err(e) => { - error!( - "get disk stat for raft engine failed"; - "raft_engine_path" => raft_path.clone(), - "err" => ?e - ); - return; - } - Ok(stats) => stats, - }; - let raft_disk_cap = raft_disk_stats.total_space(); - let mut raft_disk_available = - raft_disk_cap.checked_sub(raft_size).unwrap_or_default(); - raft_disk_available = cmp::min(raft_disk_available, raft_disk_stats.available_space()); - raft_disk_status = if raft_disk_available <= raft_already_full_threshold - { - disk::DiskUsage::AlreadyFull - } else if raft_disk_available <= raft_almost_full_threshold - { - disk::DiskUsage::AlmostFull - } else { - disk::DiskUsage::Normal - }; - } let placeholer_file_path = PathBuf::from_str(&data_dir) .unwrap() .join(Path::new(file_system::SPACE_PLACEHOLDER_FILE)); - let placeholder_size: u64 = file_system::get_file_size(placeholer_file_path).unwrap_or(0); @@ -1211,24 +1218,9 @@ where } else { snap_size + kv_size + placeholder_size }; - let capacity = if config_disk_capacity == 0 || disk_cap < config_disk_capacity { - disk_cap - } else { - config_disk_capacity - }; - - let mut available = capacity.checked_sub(used_size).unwrap_or_default(); - available = cmp::min(available, disk_stats.available_space()); - + // Check the disk usage and update the disk usage status. + let (cur_disk_status, cur_kv_disk_status, raft_disk_status, capacity, available) = disk_usage_checker.inspect(used_size, raft_size); let prev_disk_status = disk::get_disk_status(0); //0 no need care about failpoint.
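The separation checks above rely on `path_in_diff_mount_point` from `tikv_util::sys`. On Unix, such a check can be approximated by comparing device ids; a hedged, stand-alone sketch (illustrative only, the real helper may handle more cases):

use std::{fs, io, os::unix::fs::MetadataExt, path::Path};

// Two paths live on different mount points iff the filesystems backing
// them have different device ids.
fn in_diff_mount_point(a: &Path, b: &Path) -> io::Result<bool> {
    Ok(fs::metadata(a)?.dev() != fs::metadata(b)?.dev())
}

fn main() -> io::Result<()> {
    // On most Linux systems "/proc" is a separate (procfs) mount.
    println!("{}", in_diff_mount_point(Path::new("/"), Path::new("/proc"))?);
    Ok(())
}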
- let cur_kv_disk_status = if available <= already_full_threshold { - disk::DiskUsage::AlreadyFull - } else if available <= almost_full_threshold { - disk::DiskUsage::AlmostFull - } else { - disk::DiskUsage::Normal - }; - let cur_disk_status = calculate_disk_usage(raft_disk_status, cur_kv_disk_status); if prev_disk_status != cur_disk_status { warn!( "disk usage {:?}->{:?} (raft engine usage: {:?}, kv engine usage: {:?}), seperated raft mount={}, kv available={}, snap={}, kv={}, raft={}, capacity={}", @@ -1482,7 +1474,11 @@ impl TikvServer { let cfg_controller = self.cfg_controller.as_mut().unwrap(); cfg_controller.register( tikv::config::Module::Rocksdb, - Box::new(DbConfigManger::new(registry.clone(), DbType::Kv)), + Box::new(DbConfigManger::new( + cfg_controller.get_current().rocksdb, + registry.clone(), + DbType::Kv, + )), ); self.tablet_registry = Some(registry.clone()); raft_engine.register_config(cfg_controller); diff --git a/components/server/src/setup.rs b/components/server/src/setup.rs index b758b9e39df..b11ffbc45b6 100644 --- a/components/server/src/setup.rs +++ b/components/server/src/setup.rs @@ -245,12 +245,10 @@ pub fn initial_metric(cfg: &MetricConfig) { pub fn overwrite_config_with_cmd_args(config: &mut TikvConfig, matches: &ArgMatches<'_>) { if let Some(level) = matches.value_of("log-level") { config.log.level = logger::get_level_by_string(level).unwrap().into(); - config.log_level = slog::Level::Info.into(); } if let Some(file) = matches.value_of("log-file") { config.log.file.filename = file.to_owned(); - config.log_file = "".to_owned(); } if let Some(addr) = matches.value_of("addr") { diff --git a/components/snap_recovery/Cargo.toml b/components/snap_recovery/Cargo.toml index 8b0b0ec4c3a..23cbdcfe098 100644 --- a/components/snap_recovery/Cargo.toml +++ b/components/snap_recovery/Cargo.toml @@ -5,6 +5,13 @@ edition = "2021" publish = false # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html +[features] +default = ["test-engine-kv-rocksdb", "test-engine-raft-raft-engine"] +test-engine-kv-rocksdb = ["tikv/test-engine-kv-rocksdb"] +test-engine-raft-raft-engine = ["tikv/test-engine-raft-raft-engine"] +test-engines-rocksdb = ["tikv/test-engines-rocksdb"] +test-engines-panic = ["tikv/test-engines-panic"] + [dependencies] chrono = "0.4" encryption = { workspace = true } diff --git a/components/snap_recovery/src/leader_keeper.rs b/components/snap_recovery/src/leader_keeper.rs index 417d5becca3..ca2623c82ca 100644 --- a/components/snap_recovery/src/leader_keeper.rs +++ b/components/snap_recovery/src/leader_keeper.rs @@ -9,18 +9,17 @@ use std::{ use engine_traits::KvEngine; use futures::compat::Future01CompatExt; -use itertools::Itertools; use raftstore::{ errors::{Error, Result}, store::{Callback, CasualMessage, CasualRouter, SignificantMsg, SignificantRouter}, }; use tikv_util::{future::paired_future_callback, timer::GLOBAL_TIMER_HANDLE}; -pub struct LeaderKeeper { +pub struct LeaderKeeper<'a, EK, Router: 'a> { router: Router, not_leader: HashSet, - _ek: PhantomData, + _ek: PhantomData<&'a EK>, } #[derive(Default)] @@ -51,10 +50,10 @@ impl std::fmt::Debug for StepResult { } } -impl LeaderKeeper +impl<'a, EK, Router> LeaderKeeper<'a, EK, Router> where EK: KvEngine, - Router: CasualRouter + SignificantRouter + 'static, + Router: CasualRouter + SignificantRouter + 'a, { pub fn new(router: Router, to_keep: impl IntoIterator) -> Self { Self { @@ -85,8 +84,9 @@ where const CONCURRENCY: usize = 256; let r = Mutex::new(StepResult::default()); 
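The loop below switches from `itertools`' lazy `chunks` to chunking an owned `Vec`, so the borrow of `self.not_leader` no longer crosses an await point. A minimal sketch of the same bounded-concurrency batching pattern, with an invented `check` helper standing in for `check_leader`:

use futures::future::join_all;

// Stand-in for `check_leader`: succeed on even ids, fail on odd ones.
async fn check(id: u64) -> Result<u64, u64> {
    if id % 2 == 0 { Ok(id) } else { Err(id) }
}

#[tokio::main(flavor = "current_thread")]
async fn main() {
    const CONCURRENCY: usize = 256;
    // Collect into an owned Vec so no borrow lives across the await below.
    let ids: Vec<u64> = (0..1000).collect();
    let mut ok = 0;
    for batch in ids.as_slice().chunks(CONCURRENCY) {
        // At most CONCURRENCY checks are in flight per batch.
        let results = join_all(batch.iter().map(|&id| check(id))).await;
        ok += results.into_iter().filter(|r| r.is_ok()).count();
    }
    assert_eq!(ok, 500);
}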
let success = Mutex::new(HashSet::new()); - for batch in &self.not_leader.iter().chunks(CONCURRENCY) { - let tasks = batch.map(|region_id| async { + let regions = self.not_leader.iter().copied().collect::<Vec<_>>(); + for batch in regions.as_slice().chunks(CONCURRENCY) { + let tasks = batch.iter().map(|region_id| async { match self.check_leader(*region_id).await { Ok(_) => { success.lock().unwrap().insert(*region_id); @@ -150,7 +150,7 @@ mod test { leaders: RefCell>, } - impl LeaderKeeper { + impl<'a, EK, Router> LeaderKeeper<'a, EK, Router> { fn mut_router(&mut self) -> &mut Router { &mut self.router } diff --git a/components/snap_recovery/src/services.rs b/components/snap_recovery/src/services.rs index 10f82d64917..57716ec0272 100644 --- a/components/snap_recovery/src/services.rs +++ b/components/snap_recovery/src/services.rs @@ -2,8 +2,13 @@ use std::{ error::Error as StdError, + fmt::Display, + future::Future, result, - sync::mpsc::{sync_channel, SyncSender}, + sync::{ + atomic::{AtomicBool, Ordering}, + Arc, Mutex, + }, thread::Builder, time::Instant, }; @@ -17,10 +22,12 @@ use engine_traits::{CfNamesExt, CfOptionsExt, Engines, Peekable, RaftEngine}; use futures::{ channel::mpsc, executor::{ThreadPool, ThreadPoolBuilder}, + stream::{AbortHandle, Aborted}, FutureExt, SinkExt, StreamExt, }; use grpcio::{ - ClientStreamingSink, RequestStream, RpcContext, ServerStreamingSink, UnarySink, WriteFlags, + ClientStreamingSink, RequestStream, RpcContext, RpcStatus, RpcStatusCode, ServerStreamingSink, + UnarySink, WriteFlags, }; use kvproto::{raft_serverpb::StoreIdent, recoverdatapb::*}; use raftstore::{ @@ -28,12 +35,14 @@ use raftstore::{ store::{ fsm::RaftRouter, msg::{PeerMsg, SignificantMsg}, + snapshot_backup::{SnapshotBrWaitApplyRequest, SyncReport}, transport::SignificantRouter, - SnapshotRecoveryWaitApplySyncer, + SnapshotBrWaitApplySyncer, }, }; use thiserror::Error; use tikv_util::sys::thread::{StdThreadBuildWrapper, ThreadBuildWrapper}; +use tokio::sync::oneshot::{self, Sender}; use crate::{ data_resolver::DataResolverManager, @@ -65,6 +74,44 @@ pub struct RecoveryService { engines: Engines, router: RaftRouter, threads: ThreadPool, + + /// The handle to the last call of the recover-region RPC. + /// + /// We need to make sure the leader-keeping execution exits before the next + /// `RecoverRegion` RPC comes in, or the previous call may get stuck keeping + /// leaders forever once the second caller requests the leader to be on + /// another store. + // NOTE: Perhaps it would be better to abort the procedure as soon as the client + // stream has been closed, but there doesn't seem to be a hook like + // `on_client_go` for us, and the current implementation only starts + // working AFTER the client closes their sender part(!) + last_recovery_region_rpc: Arc<Mutex<Option<RecoverRegionState>>>, +} +struct RecoverRegionState { + start_at: Instant, + finished: Arc<AtomicBool>, + abort: AbortHandle, +} +impl RecoverRegionState { + /// Create the state by wrapping an execution of recover region.
+ fn wrap_task<F: Future<Output = T>, T>( + task: F, + ) -> (Self, impl Future<Output = Result<T, Aborted>>) { + let finished = Arc::new(AtomicBool::new(false)); + let (cancelable_task, abort) = futures::future::abortable(task); + let state = Self { + start_at: Instant::now(), + finished: Arc::clone(&finished), + abort, + }; + (state, async move { + let res = cancelable_task.await; + finished.store(true, Ordering::SeqCst); + res + }) + } } impl RecoveryService { @@ -99,6 +146,7 @@ impl RecoveryService { engines, router, threads, + last_recovery_region_rpc: Arc::default(), } } @@ -140,15 +188,43 @@ impl RecoveryService { Ok(store_id) } + fn abort_last_recover_region(&self, place: impl Display) { + let mut last_state_lock = self.last_recovery_region_rpc.lock().unwrap(); + Self::abort_last_recover_region_of(place, &mut last_state_lock) + } + + fn replace_last_recover_region(&self, place: impl Display, new_state: RecoverRegionState) { + let mut last_state_lock = self.last_recovery_region_rpc.lock().unwrap(); + Self::abort_last_recover_region_of(place, &mut last_state_lock); + *last_state_lock = Some(new_state); + } + + fn abort_last_recover_region_of( + place: impl Display, + last_state_lock: &mut Option<RecoverRegionState>, + ) { + if let Some(last_state) = last_state_lock.take() { + info!("Another task entered, checking the last task."; + "finished" => ?last_state.finished, + "start_before" => ?last_state.start_at.elapsed(), + "abort_by" => %place, + ); + if !last_state.finished.load(Ordering::SeqCst) { + last_state.abort.abort(); + warn!("Last task not finished, aborting it."); + } + } + } + // a new wait apply syncer share with all regions, // when all region reached the target index, share reference decreased to 0, // trigger closure to send finish info back. - pub fn wait_apply_last(router: RaftRouter, sender: SyncSender) { - let wait_apply = SnapshotRecoveryWaitApplySyncer::new(0, sender); + pub fn wait_apply_last(router: RaftRouter, sender: Sender<SyncReport>) { + let wait_apply = SnapshotBrWaitApplySyncer::new(0, sender); router.broadcast_normal(|| { - PeerMsg::SignificantMsg(SignificantMsg::SnapshotRecoveryWaitApply( - wait_apply.clone(), - )) + PeerMsg::SignificantMsg(Box::new(SignificantMsg::SnapshotBrWaitApply( + SnapshotBrWaitApplyRequest::relaxed(wait_apply.clone()), + ))) }); } } @@ -190,7 +266,7 @@ impl RecoverData for RecoveryService { // 1. br start to ready region meta fn read_region_meta( &mut self, - _ctx: RpcContext<'_>, + ctx: RpcContext<'_>, _req: ReadRegionMetaRequest, mut sink: ServerStreamingSink, ) { @@ -215,6 +291,11 @@ impl RecoverData for RecoveryService { } }); + // Hack: sometimes the client may omit the RPC call to `recover_region` if + // no leader needs to be registered to some (unfortunate) store. So we abort + // the last recover-region task here too; for now, this RPC implies a + // subsequent `recover_region` anyway.
+ self.abort_last_recover_region(format_args!("read_region_meta by {}", ctx.peer())); self.threads.spawn_ok(send_task); } @@ -222,11 +303,11 @@ impl RecoverData for RecoveryService { // assign region leader and wait leader apply to last log fn recover_region( &mut self, - _ctx: RpcContext<'_>, + ctx: RpcContext<'_>, mut stream: RequestStream, sink: ClientStreamingSink, ) { - let raft_router = self.router.clone(); + let mut raft_router = Mutex::new(self.router.clone()); let store_id = self.get_store_id(); info!("start to recover the region"); let task = async move { @@ -241,28 +322,28 @@ impl RecoverData for RecoveryService { } } - let mut lk = LeaderKeeper::new(raft_router.clone(), leaders.clone()); + let mut lk = LeaderKeeper::new(&raft_router, leaders.clone()); // We must use the tokio runtime here because there isn't a `block_in_place` // like thing in the futures executor. It simply panics when block // on the block_on context. // It is also impossible to directly `await` here, because that will make // borrowing to the raft router crosses the await point. - tokio::runtime::Builder::new_current_thread() - .build() - .expect("failed to build temporary tokio runtime.") - .block_on(lk.elect_and_wait_all_ready()); + lk.elect_and_wait_all_ready().await; info!("all region leader assigned done"; "count" => %leaders.len()); + drop(lk); let now = Instant::now(); // wait apply to the last log let mut rx_apply = Vec::with_capacity(leaders.len()); for ®ion_id in &leaders { - let (tx, rx) = sync_channel(1); + let (tx, rx) = oneshot::channel(); REGION_EVENT_COUNTER.start_wait_leader_apply.inc(); - let wait_apply = SnapshotRecoveryWaitApplySyncer::new(region_id, tx.clone()); - if let Err(e) = raft_router.significant_send( + let wait_apply = SnapshotBrWaitApplySyncer::new(region_id, tx); + if let Err(e) = raft_router.get_mut().unwrap().significant_send( region_id, - SignificantMsg::SnapshotRecoveryWaitApply(wait_apply.clone()), + SignificantMsg::SnapshotBrWaitApply(SnapshotBrWaitApplyRequest::relaxed( + wait_apply.clone(), + )), ) { error!( "failed to send wait apply"; @@ -270,23 +351,21 @@ impl RecoverData for RecoveryService { "err" => ?e, ); } - rx_apply.push(Some(rx)); + rx_apply.push(rx); } // leader apply to last log for (rid, rx) in leaders.iter().zip(rx_apply) { - if let Some(rx) = rx { - CURRENT_WAIT_APPLY_LEADER.set(*rid as _); - match rx.recv() { - Ok(region_id) => { - debug!("leader apply to last log"; "region_id" => region_id); - } - Err(e) => { - error!("leader failed to apply to last log"; "error" => ?e); - } + CURRENT_WAIT_APPLY_LEADER.set(*rid as _); + match rx.await { + Ok(_) => { + debug!("leader apply to last log"; "region_id" => rid); + } + Err(e) => { + error!("leader failed to apply to last log"; "error" => ?e); } - REGION_EVENT_COUNTER.finish_wait_leader_apply.inc(); } + REGION_EVENT_COUNTER.finish_wait_leader_apply.inc(); } CURRENT_WAIT_APPLY_LEADER.set(0); @@ -301,10 +380,20 @@ impl RecoverData for RecoveryService { Err(e) => error!("failed to get store id"; "error" => ?e), }; - let _ = sink.success(resp).await; + resp }; - self.threads.spawn_ok(task); + let (state, task) = RecoverRegionState::wrap_task(task); + self.replace_last_recover_region(format!("recover_region by {}", ctx.peer()), state); + self.threads.spawn_ok(async move { + let res = match task.await { + Ok(resp) => sink.success(resp), + Err(Aborted) => sink.fail(RpcStatus::new(RpcStatusCode::ABORTED)), + }; + if let Err(err) = res.await { + warn!("failed to response recover region rpc"; "err" => %err); + } + }); } 
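The pattern used above (wrap the task with `futures::future::abortable`, remember the `AbortHandle` plus a `finished` flag, and abort the previous task when a new one replaces it) can be exercised in isolation; a self-contained sketch assuming only the `futures` and `tokio` crates:

use std::sync::{
    atomic::{AtomicBool, Ordering},
    Arc,
};

use futures::future::{abortable, AbortHandle, Aborted};

struct TaskState {
    finished: Arc<AtomicBool>,
    abort: AbortHandle,
}

// Wrap a future so the caller keeps an abort handle and can tell whether
// the task already ran to completion.
fn wrap<F: std::future::Future>(
    fut: F,
) -> (TaskState, impl std::future::Future<Output = Result<F::Output, Aborted>>) {
    let (task, abort) = abortable(fut);
    let finished = Arc::new(AtomicBool::new(false));
    let flag = finished.clone();
    (TaskState { finished, abort }, async move {
        let res = task.await;
        flag.store(true, Ordering::SeqCst);
        res
    })
}

#[tokio::main(flavor = "current_thread")]
async fn main() {
    // The first task never completes on its own; a newcomer aborts it.
    let (state, task) = wrap(futures::future::pending::<()>());
    let hnd = tokio::spawn(task);
    if !state.finished.load(Ordering::SeqCst) {
        state.abort.abort();
    }
    assert!(matches!(hnd.await.unwrap(), Err(Aborted)));
}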
// 3. ensure all region peer/follower apply to last @@ -318,14 +407,11 @@ impl RecoverData for RecoveryService { info!("wait_apply start"); let task = async move { let now = Instant::now(); - // FIXME: this function will exit once the first region finished apply. - // BUT for the flashback resolve KV implementation, that is fine because the - // raft log stats is consistent. - let (tx, rx) = sync_channel(1); - RecoveryService::wait_apply_last(router, tx.clone()); - match rx.recv() { + let (tx, rx) = oneshot::channel(); + RecoveryService::wait_apply_last(router, tx); + match rx.await { Ok(id) => { - info!("follower apply to last log"; "error" => id); + info!("follower apply to last log"; "report" => ?id); } Err(e) => { error!("follower failed to apply to last log"; "error" => ?e); @@ -381,3 +467,32 @@ impl RecoverData for RecoveryService { self.threads.spawn_ok(send_task); } } + +#[cfg(test)] +mod test { + use std::{sync::atomic::Ordering, time::Duration}; + + use futures::never::Never; + + use super::RecoverRegionState; + + #[test] + fn test_state() { + let rt = tokio::runtime::Builder::new_current_thread() + .enable_time() + .build() + .unwrap(); + let (state, task) = RecoverRegionState::wrap_task(futures::future::pending::()); + let hnd = rt.spawn(task); + state.abort.abort(); + rt.block_on(async { tokio::time::timeout(Duration::from_secs(10), hnd).await }) + .unwrap() + .unwrap() + .unwrap_err(); + + let (state, task) = RecoverRegionState::wrap_task(futures::future::ready(42)); + assert_eq!(state.finished.load(Ordering::SeqCst), false); + assert_eq!(rt.block_on(task), Ok(42)); + assert_eq!(state.finished.load(Ordering::SeqCst), true); + } +} diff --git a/components/sst_importer/src/OWNERS b/components/sst_importer/src/OWNERS new file mode 100644 index 00000000000..70df3de925d --- /dev/null +++ b/components/sst_importer/src/OWNERS @@ -0,0 +1,7 @@ +# See the OWNERS docs at https://go.k8s.io/owners +options: + no_parent_owners: true +filters: + "(OWNERS|config\\.rs)$": + approvers: + - sig-critical-approvers-config-components diff --git a/components/sst_importer/src/errors.rs b/components/sst_importer/src/errors.rs index 7ff940fff12..c79bd2db9f8 100644 --- a/components/sst_importer/src/errors.rs +++ b/components/sst_importer/src/errors.rs @@ -2,6 +2,7 @@ use std::{ error::Error as StdError, io::Error as IoError, num::ParseIntError, path::PathBuf, result, + time::Duration, }; use encryption::Error as EncryptionError; @@ -31,6 +32,7 @@ pub fn error_inc(type_: &str, err: &Error) { Error::BadFormat(..) => "bad_format", Error::Encryption(..) => "encryption", Error::CodecError(..) => "codec", + Error::Suspended { .. 
} => "suspended", _ => return, }; IMPORTER_ERROR_VEC.with_label_values(&[type_, label]).inc(); } @@ -116,6 +118,12 @@ pub enum Error { #[error("Importing a SST file with imcompatible api version")] IncompatibleApiVersion, + #[error("{0}, please retry write later")] + RequestTooNew(String), + + #[error("{0}, please rescan region later")] + RequestTooOld(String), + #[error("Key mode mismatched with the request mode, writer: {:?}, storage: {:?}, key: {}", .writer, .storage_api_version, .key)] InvalidKeyMode { writer: SstWriterType, @@ -125,6 +133,12 @@ pub enum Error { #[error("resource is not enough {0}")] ResourceNotEnough(String), + + #[error("imports are suspended for {time_to_lease_expire:?}")] + Suspended { time_to_lease_expire: Duration }, + + #[error("TiKV disk space is not enough.")] + DiskSpaceNotEnough, } impl Error { @@ -160,6 +174,17 @@ impl From for import_sstpb::Error { err.set_store_error(import_err); err.set_message(format!("{}", e)); } + Error::Suspended { + time_to_lease_expire, + } => { + let mut store_err = errorpb::Error::default(); + let mut server_is_busy = errorpb::ServerIsBusy::default(); + server_is_busy.set_backoff_ms(time_to_lease_expire.as_millis() as _); + store_err.set_server_is_busy(server_is_busy); + store_err.set_message(format!("{}", e)); + err.set_store_error(store_err); + err.set_message(format!("{}", e)); + } _ => { err.set_message(format!("{}", e)); } @@ -197,6 +222,10 @@ impl ErrorCodeExt for Error { Error::IncompatibleApiVersion => error_code::sst_importer::INCOMPATIBLE_API_VERSION, Error::InvalidKeyMode { .. } => error_code::sst_importer::INVALID_KEY_MODE, Error::ResourceNotEnough(_) => error_code::sst_importer::RESOURCE_NOT_ENOUTH, + Error::Suspended { .. } => error_code::sst_importer::SUSPENDED, + Error::RequestTooNew(_) => error_code::sst_importer::REQUEST_TOO_NEW, + Error::RequestTooOld(_) => error_code::sst_importer::REQUEST_TOO_OLD, + Error::DiskSpaceNotEnough => error_code::sst_importer::DISK_SPACE_NOT_ENOUGH, } } } diff --git a/components/sst_importer/src/import_file.rs b/components/sst_importer/src/import_file.rs index b270d26a411..850df867da8 100644 --- a/components/sst_importer/src/import_file.rs +++ b/components/sst_importer/src/import_file.rs @@ -6,14 +6,14 @@ use std::{ io::{self, Write}, path::{Path, PathBuf}, sync::Arc, + time::SystemTime, }; use api_version::api_v2::TIDB_RANGES_COMPLEMENT; use encryption::{DataKeyManager, EncrypterWriter}; use engine_rocks::{get_env, RocksSstReader}; use engine_traits::{ - iter_option, EncryptionKeyManager, IterOptions, Iterator, KvEngine, RefIterable, SstExt, - SstMetaInfo, SstReader, + iter_option, EncryptionKeyManager, Iterator, KvEngine, RefIterable, SstMetaInfo, SstReader, }; use file_system::{get_io_rate_limiter, sync_dir, File, OpenOptions}; use keys::data_key; @@ -260,17 +260,36 @@ impl ImportDir { }) } - pub fn join(&self, meta: &SstMeta) -> Result<ImportPath> { + pub fn join_for_write(&self, meta: &SstMeta) -> Result<ImportPath> { let file_name = sst_meta_to_path(meta)?; self.get_import_path(file_name.to_str().unwrap()) } + /// Unlike join_for_write, join_for_read also handles the api version 1 + /// filenames, which can be generated by old versions of TiKV.
+ pub fn join_for_read(&self, meta: &SstMeta) -> Result<ImportPath> { + let file_name = sst_meta_to_path(meta)?; + let files_result = self.get_import_path(file_name.to_str().unwrap()); + // if the file does not exist, the SstMeta was generated by an old version of + // TiKV, so we try sst_meta_to_path_v1 + match files_result { + Ok(path) => { + if path.save.exists() { + return Ok(path); + } + let file_name = sst_meta_to_path_v1(meta)?; + self.get_import_path(file_name.to_str().unwrap()) + } + Err(e) => Err(e), + } + } + pub fn create( &self, meta: &SstMeta, key_manager: Option>, ) -> Result { - let path = self.join(meta)?; + let path = self.join_for_write(meta)?; if path.save.exists() { return Err(Error::FileExists(path.save, "create SST upload cache")); } @@ -289,7 +308,7 @@ impl ImportDir { } pub fn delete(&self, meta: &SstMeta, manager: Option<&DataKeyManager>) -> Result { - let path = self.join(meta)?; + let path = self.join_for_read(meta)?; self.delete_file(&path.save, manager)?; self.delete_file(&path.temp, manager)?; self.delete_file(&path.clone, manager)?; @@ -297,7 +316,7 @@ impl ImportDir { } pub fn exist(&self, meta: &SstMeta) -> Result { - let path = self.join(meta)?; + let path = self.join_for_read(meta)?; Ok(path.save.exists()) } @@ -306,7 +325,7 @@ impl ImportDir { meta: &SstMeta, key_manager: Option>, ) -> Result { - let path = self.join(meta)?; + let path = self.join_for_read(meta)?; let path_str = path.save.to_str().unwrap(); let env = get_env(key_manager, get_io_rate_limiter())?; let sst_reader = RocksSstReader::open_with_env(path_str, Some(env))?; @@ -333,7 +352,7 @@ impl ImportDir { // otherwise we are upgrade/downgrade between V1 and V2 // this can be done if all keys are written by TiDB _ => { - let path = self.join(meta)?; + let path = self.join_for_read(meta)?; let path_str = path.save.to_str().unwrap(); let env = get_env(key_manager.clone(), get_io_rate_limiter())?; let sst_reader = RocksSstReader::open_with_env(path_str, Some(env))?; @@ -381,7 +400,7 @@ impl ImportDir { let mut paths = HashMap::new(); let mut ingest_bytes = 0; for info in metas { - let path = self.join(&info.meta)?; + let path = self.join_for_read(&info.meta)?; let cf = info.meta.get_cf_name(); super::prepare_sst_for_ingestion(&path.save, &path.clone, key_manager.as_deref())?; ingest_bytes += info.total_bytes; @@ -406,7 +425,7 @@ impl ImportDir { key_manager: Option>, ) -> Result<()> { for meta in metas { - let path = self.join(meta)?; + let path = self.join_for_read(meta)?; let path_str = path.save.to_str().unwrap(); let env = get_env(key_manager.clone(), get_io_rate_limiter())?; let sst_reader = RocksSstReader::open_with_env(path_str, Some(env))?; @@ -415,32 +434,7 @@ impl ImportDir { Ok(()) } - pub fn load_start_key_by_meta( - &self, - meta: &SstMeta, - km: Option>, - ) -> Result>> { - let path = self.join(meta)?; - let r = match km { - Some(km) => E::SstReader::open_encrypted(&path.save.to_string_lossy(), km)?, - None => E::SstReader::open(&path.save.to_string_lossy())?, - }; - let opts = IterOptions::new(None, None, false); - let mut i = r.iter(opts)?; - if !i.seek_to_first()? || !i.valid()? { - return Ok(None); - } - // Should we warn if the key doesn't start with the prefix key? (Is that - // possible?) - // Also note this brings implicit coupling between this and - // RocksEngine. Perhaps it is better to make the engine to provide - // decode functions. Anyway we have directly used the RocksSstReader - // somewhere... This won't make things worse.
- let real_key = i.key().strip_prefix(keys::DATA_PREFIX_KEY); - Ok(real_key.map(ToOwned::to_owned)) - } - - pub fn list_ssts(&self) -> Result<Vec<SstMeta>> { + pub fn list_ssts(&self) -> Result<Vec<(SstMeta, i32, SystemTime)>> { let mut ssts = Vec::new(); for e in file_system::read_dir(&self.root_dir)? { let e = e?; @@ -449,7 +443,10 @@ impl ImportDir { } let path = e.path(); match parse_meta_from_path(&path) { - Ok(sst) => ssts.push(sst), + Ok(sst) => { + let last_modify = e.metadata()?.modified()?; + ssts.push((sst.0, sst.1, last_modify)) + } Err(e) => error!(%e; "path_to_sst_meta failed"; "path" => %path.display(),), } } @@ -458,8 +455,28 @@ impl ImportDir { } const SST_SUFFIX: &str = ".sst"; - +// Version 2: unlike version 1 (the previous default), the write API checks +// the epoch of the request against the local region. +pub const API_VERSION_2: i32 = 2; + +/// sst_meta_to_path encodes the file path with the default api version +/// (currently 2). When the SstMeta was created by an old version of TiKV, the +/// encoded path will not correspond to the real file, so the deletion logic +/// cannot remove such files by this path alone. pub fn sst_meta_to_path(meta: &SstMeta) -> Result<PathBuf> { + Ok(PathBuf::from(format!( + "{}_{}_{}_{}_{}_{}{}", + UuidBuilder::from_slice(meta.get_uuid())?.build(), + meta.get_region_id(), + meta.get_region_epoch().get_conf_ver(), + meta.get_region_epoch().get_version(), + meta.get_cf_name(), + API_VERSION_2, + SST_SUFFIX, + ))) +} + +pub fn sst_meta_to_path_v1(meta: &SstMeta) -> Result<PathBuf> { Ok(PathBuf::from(format!( "{}_{}_{}_{}_{}{}", UuidBuilder::from_slice(meta.get_uuid())?.build(), @@ -471,7 +488,7 @@ impl ImportDir { ))) } -pub fn parse_meta_from_path<P: AsRef<Path>>(path: P) -> Result<SstMeta> { +pub fn parse_meta_from_path<P: AsRef<Path>>(path: P) -> Result<(SstMeta, i32)> { let path = path.as_ref(); let file_name = match path.file_name().and_then(|n| n.to_str()) { Some(name) => name, @@ -500,11 +517,17 @@ pub fn parse_meta_from_path>(path: P) -> Result { // cf_name to path.
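To make the naming scheme concrete: v2 filenames append the api version as a sixth `_`-separated field before `.sst`, while v1 filenames keep five fields. A stand-alone sketch of telling them apart, with the field layout assumed from `sst_meta_to_path` and the tests below:

const SST_SUFFIX: &str = ".sst";

// v1: uuid_region_confver_ver_cf.sst (five fields, implicit version 1);
// v2: uuid_region_confver_ver_cf_apiversion.sst (six fields).
fn parse_api_version(file_name: &str) -> Option<i32> {
    let stem = file_name.strip_suffix(SST_SUFFIX)?;
    let elems: Vec<&str> = stem.split('_').collect();
    match elems.len() {
        5 => Some(1),
        6 => elems[5].parse().ok(),
        _ => None,
    }
}

fn main() {
    assert_eq!(parse_api_version("u_1_2_3_default.sst"), Some(1));
    assert_eq!(parse_api_version("u_1_2_3_default_2.sst"), Some(2));
}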
meta.set_cf_name(elems[4].to_owned()); } - Ok(meta) + let mut api_version = 1; + if elems.len() > 5 { + api_version = elems[5].parse()?; + } + Ok((meta, api_version)) } #[cfg(test)] mod test { + use std::fs; + use engine_traits::CF_DEFAULT; use super::*; @@ -520,11 +543,12 @@ mod test { meta.mut_region_epoch().set_version(3); let path = sst_meta_to_path(&meta).unwrap(); - let expected_path = format!("{}_1_2_3_default.sst", uuid); + let expected_path = format!("{}_1_2_3_default_2.sst", uuid); assert_eq!(path.to_str().unwrap(), &expected_path); - let new_meta = parse_meta_from_path(path).unwrap(); - assert_eq!(meta, new_meta); + let meta_with_ver = parse_meta_from_path(path).unwrap(); + assert_eq!(meta, meta_with_ver.0); + assert_eq!(2, meta_with_ver.1); } #[test] @@ -543,8 +567,38 @@ mod test { meta.get_region_epoch().get_version(), SST_SUFFIX, )); - let new_meta = parse_meta_from_path(path).unwrap(); - assert_eq!(meta, new_meta); + let meta_with_ver = parse_meta_from_path(path).unwrap(); + assert_eq!(meta, meta_with_ver.0); + assert_eq!(1, meta_with_ver.1); + } + + #[test] + fn test_join_for_rw() { + use tempfile::TempDir; + use uuid::Uuid; + + let tmp = TempDir::new().unwrap(); + let dir = ImportDir::new(tmp.path()).unwrap(); + let mut meta = SstMeta::default(); + meta.set_uuid(Uuid::new_v4().as_bytes().to_vec()); + let filename_v1 = sst_meta_to_path_v1(&meta).unwrap(); + let path_v1 = tmp.path().join(filename_v1); + + let got = dir + .join_for_read(&meta) + .expect("fallback to version 1 because version 2 file does not exist"); + assert_eq!(got.save, path_v1); + + let filename_v2 = sst_meta_to_path(&meta).unwrap(); + let path_v2 = tmp.path().join(filename_v2); + fs::File::create(&path_v2).expect("create empty file"); + let got = dir.join_for_read(&meta).expect("read should succeed"); + assert_eq!(got.save, path_v2); + fs::remove_file(path_v2).expect("delete file"); + + fs::File::create(&path_v1).expect("create empty file"); + let got = dir.join_for_read(&meta).expect("read should succeed"); + assert_eq!(got.save, path_v1); } #[cfg(feature = "test-engines-rocksdb")] @@ -595,15 +649,6 @@ mod test { .unwrap(); w.finish().unwrap(); dp.save(arcmgr.as_deref()).unwrap(); - let mut ssts = dir.list_ssts().unwrap(); - ssts.iter_mut().for_each(|meta| { - let start = dir - .load_start_key_by_meta::(meta, arcmgr.clone()) - .unwrap() - .unwrap(); - meta.mut_range().set_start(start) - }); - assert_eq!(ssts, vec![meta]); } #[test] diff --git a/components/sst_importer/src/lib.rs b/components/sst_importer/src/lib.rs index 0cfc3bab774..ff137005b09 100644 --- a/components/sst_importer/src/lib.rs +++ b/components/sst_importer/src/lib.rs @@ -27,7 +27,7 @@ pub mod sst_importer; pub use self::{ config::{Config, ConfigManager}, errors::{error_inc, Error, Result}, - import_file::sst_meta_to_path, + import_file::{sst_meta_to_path, API_VERSION_2}, import_mode2::range_overlaps, sst_importer::SstImporter, sst_writer::{RawSstWriter, TxnSstWriter}, diff --git a/components/sst_importer/src/sst_importer.rs b/components/sst_importer/src/sst_importer.rs index 33f3c691a26..23ff6f26d91 100644 --- a/components/sst_importer/src/sst_importer.rs +++ b/components/sst_importer/src/sst_importer.rs @@ -4,14 +4,14 @@ use std::{ borrow::Cow, collections::HashMap, fs::File, - io::{self, BufReader, Read}, + io::{self, BufReader, ErrorKind, Read}, ops::Bound, path::{Path, PathBuf}, sync::{ atomic::{AtomicU64, Ordering}, Arc, }, - time::Duration, + time::{Duration, SystemTime}, }; use collections::HashSet; @@ -289,10 +289,27 @@ impl 
SstImporter { } pub fn get_path(&self, meta: &SstMeta) -> PathBuf { - let path = self.dir.join(meta).unwrap(); + let path = self.dir.join_for_read(meta).unwrap(); path.save } + pub fn get_total_size(&self) -> Result { + let mut total_size = 0; + for entry in file_system::read_dir(self.dir.get_root_dir())? { + match entry.and_then(|e| e.metadata().map(|m| (e, m))) { + Ok((_, m)) => { + if !m.is_file() { + continue; + } + total_size += m.len(); + } + Err(e) if e.kind() == ErrorKind::NotFound => continue, + Err(e) => return Err(Error::from(e)), + }; + } + Ok(total_size) + } + pub fn create(&self, meta: &SstMeta) -> Result { match self.dir.create(meta, self.key_manager.clone()) { Ok(f) => { @@ -642,7 +659,6 @@ impl SstImporter { async fn exec_download( &self, meta: &KvMeta, - rewrite_rule: &RewriteRule, ext_storage: Arc, speed_limiter: &Limiter, ) -> Result { @@ -690,9 +706,8 @@ impl SstImporter { .with_label_values(&["exec_download"]) .observe(start.saturating_elapsed().as_secs_f64()); - let rewrite_buff = self.rewrite_kv_file(buff, rewrite_rule)?; Ok(LoadedFile { - content: Arc::from(rewrite_buff.into_boxed_slice()), + content: Arc::from(buff.into_boxed_slice()), permit, }) } @@ -700,7 +715,6 @@ impl SstImporter { pub async fn do_read_kv_file( &self, meta: &KvMeta, - rewrite_rule: &RewriteRule, ext_storage: Arc, speed_limiter: &Limiter, ) -> Result { @@ -741,7 +755,7 @@ impl SstImporter { } cache - .get_or_try_init(|| self.exec_download(meta, rewrite_rule, ext_storage, speed_limiter)) + .get_or_try_init(|| self.exec_download(meta, ext_storage, speed_limiter)) .await?; Ok(CacheKvFile::Mem(cache)) } @@ -814,7 +828,6 @@ impl SstImporter { pub async fn read_from_kv_file( &self, meta: &KvMeta, - rewrite_rule: &RewriteRule, ext_storage: Arc, backend: &StorageBackend, speed_limiter: &Limiter, @@ -823,7 +836,7 @@ impl SstImporter { self.do_download_kv_file(meta, backend, speed_limiter) .await? } else { - self.do_read_kv_file(meta, rewrite_rule, ext_storage, speed_limiter) + self.do_read_kv_file(meta, ext_storage, speed_limiter) .await? }; match c { @@ -841,8 +854,7 @@ impl SstImporter { let mut buffer = Vec::new(); reader.read_to_end(&mut buffer)?; - let rewrite_buff = self.rewrite_kv_file(buffer, rewrite_rule)?; - Ok(Arc::from(rewrite_buff.into_boxed_slice())) + Ok(Arc::from(buffer.into_boxed_slice())) } } } @@ -940,7 +952,11 @@ impl SstImporter { // perform iteration and key rewrite. 
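The rewriting delegated to `EventIterator::with_rewriting` below amounts to replacing one key prefix with another; a hedged sketch of that operation (the real iterator decodes log-backup events and applies the rule per event):

// Keys carrying `old_prefix` are re-emitted under `new_prefix`; keys outside
// the old prefix are rejected (the importer raises WrongKeyPrefix for them).
fn rewrite_key(key: &[u8], old_prefix: &[u8], new_prefix: &[u8]) -> Option<Vec<u8>> {
    let suffix = key.strip_prefix(old_prefix)?;
    let mut out = Vec::with_capacity(new_prefix.len() + suffix.len());
    out.extend_from_slice(new_prefix);
    out.extend_from_slice(suffix);
    Some(out)
}

fn main() {
    assert_eq!(
        rewrite_key(b"t1_r01", b"t1_", b"t5_").as_deref(),
        Some(&b"t5_r01"[..])
    );
    assert!(rewrite_key(b"t9_r01", b"t1_", b"t5_").is_none());
}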
let mut new_buff = Vec::with_capacity(file_buff.len()); - let mut event_iter = EventIterator::new(file_buff.as_slice()); + let mut event_iter = EventIterator::with_rewriting( + file_buff.as_slice(), + rewrite_rule.get_old_key_prefix(), + rewrite_rule.get_new_key_prefix(), + ); let mut key = new_prefix.to_vec(); let new_prefix_data_key_len = key.len(); @@ -983,9 +999,14 @@ impl SstImporter { start_ts: u64, restore_ts: u64, file_buff: Arc<[u8]>, + rewrite_rule: &RewriteRule, mut build_fn: impl FnMut(Vec, Vec), ) -> Result> { - let mut event_iter = EventIterator::new(file_buff.as_ref()); + let mut event_iter = EventIterator::with_rewriting( + file_buff.as_ref(), + rewrite_rule.get_old_key_prefix(), + rewrite_rule.get_new_key_prefix(), + ); let mut smallest_key = None; let mut largest_key = None; let mut total_key = 0; @@ -1001,6 +1022,16 @@ impl SstImporter { event_iter.next()?; INPORTER_APPLY_COUNT.with_label_values(&["key_meet"]).inc(); + if !event_iter + .key() + .starts_with(rewrite_rule.get_new_key_prefix()) + { + return Err(Error::WrongKeyPrefix { + what: "do_apply_kv_file", + key: event_iter.key().to_vec(), + prefix: rewrite_rule.get_old_key_prefix().to_vec(), + }); + } let key = event_iter.key().to_vec(); let value = event_iter.value().to_vec(); let ts = Key::decode_ts_from(&key)?; @@ -1028,7 +1059,7 @@ impl SstImporter { largest_key = largest_key .map_or_else(|| Some(key.clone()), |v: Vec| Some(v.max(key.clone()))); } - if total_key != not_in_range { + if not_in_range != 0 || ts_not_expected != 0 { info!("build download request file done"; "total_keys" => %total_key, "ts_filtered_keys" => %ts_not_expected, @@ -1085,7 +1116,7 @@ impl SstImporter { engine: E, ext: DownloadExt<'_>, ) -> Result> { - let path = self.dir.join(meta)?; + let path = self.dir.join_for_write(meta)?; let file_crypter = crypter.map(|c| FileEncryptionInfo { method: to_engine_encryption_method(c.cipher_type), @@ -1349,31 +1380,24 @@ impl SstImporter { Ok(Some(final_range)) } else { // nothing is written: prevents finishing the SST at all. + // also delete the empty sst file that is created when creating sst_writer + drop(sst_writer); + let _ = file_system::remove_file(&path.save); Ok(None) } } /// List the basic information of the current SST files. - /// The information contains UUID, region ID, region Epoch. - /// Other fields may be left blank. - pub fn list_ssts(&self) -> Result> { + /// The information contains UUID, region ID, region Epoch, api version, + /// last modified time. Other fields may be left blank. + pub fn list_ssts(&self) -> Result> { self.dir.list_ssts() } - /// Load the start key by a metadata. - /// This will open the internal SST and try to load the first user key. - /// (For RocksEngine, that is the key without the 'z' prefix.) - /// When the SST is empty or the first key cannot be parsed as user key, - /// return None. 
- pub fn load_start_key_by_meta(&self, meta: &SstMeta) -> Result>> { - self.dir - .load_start_key_by_meta::(meta, self.key_manager.clone()) - } - pub fn new_txn_writer(&self, db: &E, meta: SstMeta) -> Result> { let mut default_meta = meta.clone(); default_meta.set_cf_name(CF_DEFAULT.to_owned()); - let default_path = self.dir.join(&default_meta)?; + let default_path = self.dir.join_for_write(&default_meta)?; let default = E::SstWriterBuilder::new() .set_db(db) .set_cf(CF_DEFAULT) @@ -1383,7 +1407,7 @@ impl SstImporter { let mut write_meta = meta; write_meta.set_cf_name(CF_WRITE.to_owned()); - let write_path = self.dir.join(&write_meta)?; + let write_path = self.dir.join_for_write(&write_meta)?; let write = E::SstWriterBuilder::new() .set_db(db) .set_cf(CF_WRITE) @@ -1409,7 +1433,7 @@ impl SstImporter { mut meta: SstMeta, ) -> Result> { meta.set_cf_name(CF_DEFAULT.to_owned()); - let default_path = self.dir.join(&meta)?; + let default_path = self.dir.join_for_write(&meta)?; let default = E::SstWriterBuilder::new() .set_db(db) .set_cf(CF_DEFAULT) @@ -1490,7 +1514,7 @@ mod tests { let mut meta = SstMeta::default(); meta.set_uuid(Uuid::new_v4().as_bytes().to_vec()); - let path = dir.join(&meta).unwrap(); + let path = dir.join_for_write(&meta).unwrap(); // Test ImportDir::create() { @@ -1556,9 +1580,9 @@ mod tests { for sst in &ssts { ingested .iter() - .find(|s| s.get_uuid() == sst.get_uuid()) + .find(|s| s.get_uuid() == sst.0.get_uuid()) .unwrap(); - dir.delete(sst, key_manager.as_deref()).unwrap(); + dir.delete(&sst.0, key_manager.as_deref()).unwrap(); } assert!(dir.list_ssts().unwrap().is_empty()); } @@ -2050,10 +2074,8 @@ mod tests { }; // test do_read_kv_file() - let rewrite_rule = &new_rewrite_rule(b"", b"", 12345); let output = block_on_external_io(importer.do_read_kv_file( &kv_meta, - rewrite_rule, ext_storage, &Limiter::new(f64::INFINITY), )) @@ -2163,7 +2185,6 @@ mod tests { }; let importer = SstImporter::new(&cfg, import_dir, Some(key_manager), ApiVersion::V1, false).unwrap(); - let rewrite_rule = &new_rewrite_rule(b"", b"", 12345); let ext_storage = { importer.wrap_kms( importer.external_storage_or_cache(&backend, "").unwrap(), @@ -2181,7 +2202,6 @@ mod tests { assert!(importer.import_support_download()); let output = block_on_external_io(importer.read_from_kv_file( &kv_meta, - rewrite_rule, ext_storage, &backend, &Limiter::new(f64::INFINITY), @@ -2308,7 +2328,7 @@ mod tests { assert_eq!(range.get_end(), b"t123_r13"); // verifies that the file is saved to the correct place. - let sst_file_path = importer.dir.join(&meta).unwrap().save; + let sst_file_path = importer.dir.join_for_read(&meta).unwrap().save; let sst_file_metadata = sst_file_path.metadata().unwrap(); assert!(sst_file_metadata.is_file()); assert_eq!(sst_file_metadata.len(), meta.get_length()); @@ -2368,7 +2388,7 @@ mod tests { assert_eq!(range.get_end(), b"t123_r13"); // verifies that the file is saved to the correct place. - let sst_file_path = importer.dir.join(&meta).unwrap().save; + let sst_file_path = importer.dir.join_for_read(&meta).unwrap().save; let sst_file_metadata = sst_file_path.metadata().unwrap(); assert!(sst_file_metadata.is_file()); assert_eq!(sst_file_metadata.len(), meta.get_length()); @@ -2418,7 +2438,7 @@ mod tests { // verifies that the file is saved to the correct place. 
// (the file size may be changed, so not going to check the file size) - let sst_file_path = importer.dir.join(&meta).unwrap().save; + let sst_file_path = importer.dir.join_for_read(&meta).unwrap().save; assert!(sst_file_path.is_file()); // verifies the SST content is correct. @@ -2463,7 +2483,7 @@ mod tests { // verifies that the file is saved to the correct place. // (the file size may be changed, so not going to check the file size) - let sst_file_path = importer.dir.join(&meta).unwrap().save; + let sst_file_path = importer.dir.join_for_read(&meta).unwrap().save; assert!(sst_file_path.is_file()); // verifies the SST content is correct. @@ -2507,7 +2527,7 @@ mod tests { // verifies that the file is saved to the correct place. // (the file size may be changed, so not going to check the file size) - let sst_file_path = importer.dir.join(&meta).unwrap().save; + let sst_file_path = importer.dir.join_for_read(&meta).unwrap().save; assert!(sst_file_path.is_file()); // verifies the SST content is correct. @@ -2649,7 +2669,7 @@ mod tests { // verifies that the file is saved to the correct place. // (the file size is changed, so not going to check the file size) - let sst_file_path = importer.dir.join(&meta).unwrap().save; + let sst_file_path = importer.dir.join_for_read(&meta).unwrap().save; assert!(sst_file_path.is_file()); // verifies the SST content is correct. @@ -2693,7 +2713,7 @@ mod tests { assert_eq!(range.get_end(), b"t5_r07"); // verifies that the file is saved to the correct place. - let sst_file_path = importer.dir.join(&meta).unwrap().save; + let sst_file_path = importer.dir.join_for_read(&meta).unwrap().save; assert!(sst_file_path.is_file()); // verifies the SST content is correct. @@ -2758,6 +2778,9 @@ mod tests { db, ); + let path = importer.dir.join_for_write(&meta).unwrap(); + assert!(!file_system::file_exists(path.save)); + match result { Ok(None) => {} _ => panic!("unexpected download result: {:?}", result), @@ -2826,7 +2849,7 @@ mod tests { assert_eq!(range.get_end(), b"d"); // verifies that the file is saved to the correct place. - let sst_file_path = importer.dir.join(&meta).unwrap().save; + let sst_file_path = importer.dir.join_for_read(&meta).unwrap().save; let sst_file_metadata = sst_file_path.metadata().unwrap(); assert!(sst_file_metadata.is_file()); assert_eq!(sst_file_metadata.len(), meta.get_length()); @@ -2885,7 +2908,7 @@ mod tests { assert_eq!(range.get_end(), b"c\x00"); // verifies that the file is saved to the correct place. - let sst_file_path = importer.dir.join(&meta).unwrap().save; + let sst_file_path = importer.dir.join_for_read(&meta).unwrap().save; let sst_file_metadata = sst_file_path.metadata().unwrap(); assert!(sst_file_metadata.is_file()); @@ -2940,7 +2963,7 @@ mod tests { assert_eq!(range.get_end(), b"c"); // verifies that the file is saved to the correct place. - let sst_file_path = importer.dir.join(&meta).unwrap().save; + let sst_file_path = importer.dir.join_for_read(&meta).unwrap().save; let sst_file_metadata = sst_file_path.metadata().unwrap(); assert!(sst_file_metadata.is_file()); @@ -2986,7 +3009,7 @@ mod tests { .unwrap(); // verifies the SST is compressed using Snappy. - let sst_file_path = importer.dir.join(&meta).unwrap().save; + let sst_file_path = importer.dir.join_for_read(&meta).unwrap().save; assert!(sst_file_path.is_file()); let sst_reader = new_sst_reader(sst_file_path.to_str().unwrap(), None); @@ -3033,7 +3056,7 @@ mod tests { // verifies SST compression algorithm... 
for meta in metas { - let sst_file_path = importer.dir.join(&meta).unwrap().save; + let sst_file_path = importer.dir.join_for_read(&meta).unwrap().save; assert!(sst_file_path.is_file()); let sst_reader = new_sst_reader(sst_file_path.to_str().unwrap(), None); diff --git a/components/test_backup/Cargo.toml b/components/test_backup/Cargo.toml index 59300f993e3..9d773b5244b 100644 --- a/components/test_backup/Cargo.toml +++ b/components/test_backup/Cargo.toml @@ -16,6 +16,7 @@ backup = { workspace = true } collections = { workspace = true } concurrency_manager = { workspace = true } crc64fast = "0.1" +engine_rocks = { workspace = true } engine_traits = { workspace = true } external_storage_export = { workspace = true } file_system = { workspace = true } @@ -25,6 +26,7 @@ futures-util = { version = "0.3", default-features = false, features = ["io"] } grpcio = { workspace = true } kvproto = { workspace = true } protobuf = "2" +raftstore = { workspace = true } rand = "0.8" tempfile = "3.0" test_raftstore = { workspace = true } diff --git a/components/test_backup/src/disk_snap.rs b/components/test_backup/src/disk_snap.rs new file mode 100644 index 00000000000..5f6b4cd9236 --- /dev/null +++ b/components/test_backup/src/disk_snap.rs @@ -0,0 +1,246 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. + +use std::{ + collections::{HashMap, HashSet}, + sync::{Arc, Mutex}, + time::Duration, +}; + +use backup::disk_snap::Env as BEnv; +use futures_executor::block_on; +use futures_util::{ + sink::SinkExt, + stream::{Fuse, StreamExt}, +}; +use grpcio::{ + ChannelBuilder, ClientDuplexReceiver, Environment, Server, ServerBuilder, StreamingCallSink, + WriteFlags, +}; +use kvproto::{ + brpb::{ + self, PrepareSnapshotBackupEventType, PrepareSnapshotBackupRequest, + PrepareSnapshotBackupRequestType, PrepareSnapshotBackupResponse, + }, + metapb::Region, + raft_cmdpb::RaftCmdResponse, +}; +use raftstore::store::{snapshot_backup::PrepareDiskSnapObserver, Callback, WriteResponse}; +use test_raftstore::*; +use tikv_util::{ + future::{block_on_timeout, paired_future_callback}, + worker::dummy_scheduler, + HandyRwLock, +}; + +pub struct Node { + service: Option, + pub rejector: Arc, + pub backup_client: Option, +} + +pub struct Suite { + pub cluster: Cluster, + pub nodes: HashMap, + grpc_env: Arc, +} + +impl Suite { + fn crate_node(&mut self, id: u64) { + let rej = Arc::new(PrepareDiskSnapObserver::default()); + let rej2 = rej.clone(); + let mut w = self.cluster.sim.wl(); + w.coprocessor_hooks + .entry(id) + .or_default() + .push(Box::new(move |host| { + rej2.register_to(host); + })); + self.nodes.insert( + id, + Node { + service: None, + rejector: rej, + backup_client: None, + }, + ); + } + + fn start_backup(&mut self, id: u64) { + let (sched, _) = dummy_scheduler(); + let w = self.cluster.sim.wl(); + let router = Arc::new(Mutex::new(w.get_router(id).unwrap())); + let env = BEnv::new(router, self.nodes[&id].rejector.clone(), None); + let service = backup::Service::new(sched, env); + let builder = ServerBuilder::new(Arc::clone(&self.grpc_env)) + .register_service(brpb::create_backup(service)); + let mut server = builder.bind("127.0.0.1", 0).build().unwrap(); + server.start(); + let (_, port) = server.bind_addrs().next().unwrap(); + let addr = format!("127.0.0.1:{}", port); + let channel = ChannelBuilder::new(self.grpc_env.clone()).connect(&addr); + println!("connecting channel to {} for store {}", addr, id); + let client = brpb::BackupClient::new(channel); + let node = self.nodes.get_mut(&id).unwrap(); 
+ node.service = Some(server); + node.backup_client = Some(client); + } + + pub fn try_split(&mut self, split_key: &[u8]) -> WriteResponse { + let region = self.cluster.get_region(split_key); + let (tx, rx) = paired_future_callback(); + self.cluster + .split_region(®ion, split_key, Callback::write(tx)); + block_on(rx).unwrap() + } + + pub fn split(&mut self, split_key: &[u8]) { + let region = self.cluster.get_region(split_key); + self.try_split(split_key); + self.cluster.wait_region_split(®ion); + } + + fn backup(&self, id: u64) -> &brpb::BackupClient { + self.nodes[&id].backup_client.as_ref().unwrap() + } + + pub fn prepare_backup(&self, node: u64) -> PrepareBackup { + let cli = self.backup(node); + let (tx, rx) = cli.prepare_snapshot_backup().unwrap(); + PrepareBackup { + store_id: node, + tx, + rx: rx.fuse(), + } + } + + pub fn new(node_count: u64) -> Self { + Self::new_with_cfg(node_count, |_| {}) + } + + pub fn new_with_cfg(node_count: u64, cfg: impl FnOnce(&mut Config)) -> Self { + let cluster = new_server_cluster(42, node_count as usize); + let grpc_env = Arc::new(Environment::new(1)); + let mut suite = Suite { + cluster, + nodes: HashMap::default(), + grpc_env, + }; + for id in 1..=node_count { + suite.crate_node(id); + } + cfg(&mut suite.cluster.cfg); + suite.cluster.run(); + for id in 1..=node_count { + suite.start_backup(id); + } + suite + } +} + +pub struct PrepareBackup { + tx: StreamingCallSink, + rx: Fuse>, + + pub store_id: u64, +} + +impl PrepareBackup { + pub fn prepare(&mut self, lease_sec: u64) { + let mut req = PrepareSnapshotBackupRequest::new(); + req.set_ty(PrepareSnapshotBackupRequestType::UpdateLease); + req.set_lease_in_seconds(lease_sec); + block_on(async { + self.tx.send((req, WriteFlags::default())).await.unwrap(); + self.rx.next().await.unwrap().unwrap(); + }); + } + + pub fn wait_apply(&mut self, r: impl IntoIterator) { + let mut req = PrepareSnapshotBackupRequest::new(); + req.set_ty(PrepareSnapshotBackupRequestType::WaitApply); + req.set_regions(r.into_iter().collect()); + let mut regions = req + .get_regions() + .iter() + .map(|x| x.id) + .collect::>(); + block_on(async { + self.tx.send((req, WriteFlags::default())).await.unwrap(); + while !regions.is_empty() { + let resp = self.rx.next().await.unwrap().unwrap(); + assert_eq!(resp.ty, PrepareSnapshotBackupEventType::WaitApplyDone); + assert!(!resp.has_error(), "{resp:?}"); + assert!(regions.remove(&resp.get_region().id), "{regions:?}"); + } + }); + } + + pub fn send_wait_apply(&mut self, r: impl IntoIterator) { + let mut req = PrepareSnapshotBackupRequest::new(); + req.set_ty(PrepareSnapshotBackupRequestType::WaitApply); + req.set_regions(r.into_iter().collect()); + block_on(async { + self.tx.send((req, WriteFlags::default())).await.unwrap(); + }) + } + + pub fn send_finalize(mut self) -> bool { + block_on(self.tx.send({ + let mut req = PrepareSnapshotBackupRequest::new(); + req.set_ty(PrepareSnapshotBackupRequestType::Finish); + (req, WriteFlags::default()) + })) + .unwrap(); + block_on_timeout( + async { + while let Some(item) = self.rx.next().await { + let item = item.unwrap(); + if item.ty == PrepareSnapshotBackupEventType::UpdateLeaseResult { + return item.last_lease_is_valid; + } + } + false + }, + Duration::from_secs(2), + ) + .expect("take too long to finalize the stream") + } + + pub fn next(&mut self) -> PrepareSnapshotBackupResponse { + self.try_next().unwrap() + } + + pub fn try_next(&mut self) -> grpcio::Result { + block_on(self.rx.next()).unwrap() + } +} + +#[track_caller] +pub fn 
must_wait_apply_success(res: &PrepareSnapshotBackupResponse) -> u64 { + assert!(!res.has_error(), "{res:?}"); + assert_eq!(res.ty, PrepareSnapshotBackupEventType::WaitApplyDone); + res.get_region().id +} + +#[track_caller] +pub fn assert_success(resp: &RaftCmdResponse) { + assert!(!resp.get_header().has_error(), "{:?}", resp); +} + +#[track_caller] +pub fn assert_failure(resp: &RaftCmdResponse) { + assert!(resp.get_header().has_error(), "{:?}", resp); +} + +#[track_caller] +pub fn assert_failure_because(resp: &RaftCmdResponse, reason_contains: &str) { + assert!(resp.get_header().has_error(), "{:?}", resp); + assert!( + resp.get_header() + .get_error() + .get_message() + .contains(reason_contains), + "{:?}", + resp + ); +} diff --git a/components/test_backup/src/lib.rs b/components/test_backup/src/lib.rs index 3a5800e989b..b952d71f5a6 100644 --- a/components/test_backup/src/lib.rs +++ b/components/test_backup/src/lib.rs @@ -38,6 +38,8 @@ use tikv_util::{ }; use txn_types::TimeStamp; +pub mod disk_snap; + pub struct TestSuite { pub cluster: Cluster, pub endpoints: HashMap>, diff --git a/components/test_coprocessor/src/store.rs b/components/test_coprocessor/src/store.rs index 96f405d8f39..876e3a6d1b9 100644 --- a/components/test_coprocessor/src/store.rs +++ b/components/test_coprocessor/src/store.rs @@ -299,6 +299,26 @@ impl Store { .collect(); FixtureStore::new(data) } + + pub fn insert_all_null_row( + &mut self, + tbl: &Table, + ctx: Context, + with_checksum: bool, + extra_checksum: Option, + ) { + self.begin(); + let inserts = self + .insert_into(tbl) + .set(&tbl["id"], Datum::Null) + .set(&tbl["name"], Datum::Null) + .set(&tbl["count"], Datum::Null) + .set_v2(&tbl["id"], ScalarValue::Int(None)) + .set_v2(&tbl["name"], ScalarValue::Bytes(None)) + .set_v2(&tbl["count"], ScalarValue::Int(None)); + inserts.execute_with_v2_checksum(ctx, with_checksum, extra_checksum); + self.commit(); + } } /// A trait for a general implementation to convert to a Txn store. diff --git a/components/test_pd_client/src/pd.rs b/components/test_pd_client/src/pd.rs index c81230f6a16..3ad5dbdcb8c 100644 --- a/components/test_pd_client/src/pd.rs +++ b/components/test_pd_client/src/pd.rs @@ -547,7 +547,9 @@ impl PdCluster { fn get_store(&self, store_id: u64) -> Result { match self.stores.get(&store_id) { Some(s) if s.store.get_id() != 0 => Ok(s.store.clone()), - _ => Err(box_err!("store {} not found", store_id)), + // Matches PD error message. 
+            // See https://github.com/tikv/pd/blob/v7.3.0/server/grpc_service.go#L777-L780
+            _ => Err(box_err!("invalid store ID {}, not found", store_id)),
        }
    }
 
@@ -1435,15 +1437,34 @@ impl TestPdClient {
        cluster.replication_status = Some(status);
    }
 
-    pub fn switch_replication_mode(&self, state: DrAutoSyncState, available_stores: Vec<u64>) {
+    pub fn switch_replication_mode(
+        &self,
+        state: Option<DrAutoSyncState>,
+        available_stores: Vec<u64>,
+    ) {
        let mut cluster = self.cluster.wl();
        let status = cluster.replication_status.as_mut().unwrap();
+        if state.is_none() {
+            status.set_mode(ReplicationMode::Majority);
+            let mut dr = status.mut_dr_auto_sync();
+            dr.state_id += 1;
+            return;
+        }
+
+        status.set_mode(ReplicationMode::DrAutoSync);
        let mut dr = status.mut_dr_auto_sync();
        dr.state_id += 1;
-        dr.set_state(state);
+        dr.set_state(state.unwrap());
        dr.available_stores = available_stores;
    }
 
+    pub fn switch_to_drautosync_mode(&self) {
+        let mut cluster = self.cluster.wl();
+        let status = cluster.replication_status.as_mut().unwrap();
+        status.set_mode(ReplicationMode::DrAutoSync);
+        let mut dr = status.mut_dr_auto_sync();
+        dr.state_id += 1;
+    }
+
    pub fn region_replication_status(&self, region_id: u64) -> RegionReplicationStatus {
        self.cluster
            .rl()
diff --git a/components/test_raftstore-v2/src/cluster.rs b/components/test_raftstore-v2/src/cluster.rs
index 08de4cc3aa1..d7e369e3a4f 100644
--- a/components/test_raftstore-v2/src/cluster.rs
+++ b/components/test_raftstore-v2/src/cluster.rs
@@ -37,7 +37,7 @@ use pd_client::PdClient;
 use raftstore::{
     store::{
         cmd_resp, initial_region, region_meta::RegionMeta, util::check_key_in_region, Bucket,
-        BucketRange, Callback, RegionSnapshot, TabletSnapManager, WriteResponse,
+        BucketRange, Callback, RaftCmdExtraOpts, RegionSnapshot, TabletSnapManager, WriteResponse,
         INIT_EPOCH_CONF_VER, INIT_EPOCH_VER,
     },
     Error, Result,
@@ -51,12 +51,12 @@ use tempfile::TempDir;
 use test_pd_client::TestPdClient;
 use test_raftstore::{
     check_raft_cmd_request, is_error_response, new_admin_request, new_delete_cmd,
-    new_delete_range_cmd, new_get_cf_cmd, new_peer, new_prepare_merge, new_put_cf_cmd,
+    new_delete_range_cmd, new_get_cf_cmd, new_peer, new_prepare_merge, new_put_cf_cmd, new_put_cmd,
     new_region_detail_cmd, new_region_leader_cmd, new_request, new_status_request, new_store,
     new_tikv_config_with_api_ver, new_transfer_leader_cmd, sleep_ms, Config, Filter,
     FilterFactory, PartitionFilterFactory, RawEngine,
 };
-use tikv::{server::Result as ServerResult, storage::config::EngineType};
+use tikv::{config::TikvConfig, server::Result as ServerResult, storage::config::EngineType};
 use tikv_util::{
     box_err, box_try, debug, error, future::block_on_timeout,
@@ -69,6 +69,9 @@
 };
 use txn_types::WriteBatchFlags;
 
+// Max number of 10ms waits for store metas to be released (about 10s in total).
+const MAX_WAIT_RELEASE_INTERVAL: u32 = 1000;
+
 // We simulate 3 or 5 nodes, each has a store.
 // Sometimes, we use fixed id to test, which means the id
 // isn't allocated by pd, and node id, store id are same.
@@ -283,9 +286,18 @@ pub trait Simulator<EK: KvEngine> {
     }
 
     fn async_command_on_node(
+        &mut self,
+        node_id: u64,
+        request: RaftCmdRequest,
+    ) -> BoxFuture<'static, RaftCmdResponse> {
+        self.async_command_on_node_with_opts(node_id, request, RaftCmdExtraOpts::default())
+    }
+
+    fn async_command_on_node_with_opts(
         &mut self,
         node_id: u64,
         mut request: RaftCmdRequest,
+        opts: RaftCmdExtraOpts,
     ) -> BoxFuture<'static, RaftCmdResponse> {
         let region_id = request.get_header().get_region_id();
@@ -316,7 +328,11 @@ pub trait Simulator<EK: KvEngine> {
                 _ => unreachable!(),
             }
         }
-            PeerMsg::simple_write(Box::new(request.take_header()), write_encoder.encode())
+            PeerMsg::simple_write_with_opt(
+                Box::new(request.take_header()),
+                write_encoder.encode(),
+                opts,
+            )
         };
 
         self.async_peer_msg_on_node(node_id, region_id, msg)
@@ -389,10 +405,7 @@ impl<T: Simulator<EK>, EK: KvEngine> Cluster<T, EK> {
         let mut tikv_cfg = new_tikv_config_with_api_ver(id, api_version);
         tikv_cfg.storage.engine = EngineType::RaftKv2;
         Cluster {
-            cfg: Config {
-                tikv: tikv_cfg,
-                prefer_mem: true,
-            },
+            cfg: Config::new(tikv_cfg, true),
             count,
             tablet_registries: HashMap::default(),
             key_managers_map: HashMap::default(),
@@ -415,6 +428,11 @@
         }
     }
 
+    pub fn set_cfg(&mut self, mut cfg: TikvConfig) {
+        cfg.cfg_path = self.cfg.tikv.cfg_path.clone();
+        self.cfg.tikv = cfg;
+    }
+
     pub fn id(&self) -> u64 {
         self.cfg.server.cluster_id
     }
@@ -1263,6 +1281,43 @@
         panic!("find no region for {}", log_wrappers::hex_encode_upper(key));
     }
 
+    pub fn async_request(
+        &mut self,
+        mut req: RaftCmdRequest,
+    ) -> BoxFuture<'static, RaftCmdResponse> {
+        let region_id = req.get_header().get_region_id();
+        let leader = self.leader_of_region(region_id).unwrap();
+        req.mut_header().set_peer(leader.clone());
+        self.sim
+            .wl()
+            .async_command_on_node(leader.get_store_id(), req)
+    }
+
+    pub fn async_request_with_opts(
+        &mut self,
+        mut req: RaftCmdRequest,
+        opts: RaftCmdExtraOpts,
+    ) -> Result<BoxFuture<'static, RaftCmdResponse>> {
+        let region_id = req.get_header().get_region_id();
+        let leader = self.leader_of_region(region_id).unwrap();
+        req.mut_header().set_peer(leader.clone());
+        Ok(self
+            .sim
+            .wl()
+            .async_command_on_node_with_opts(leader.get_store_id(), req, opts))
+    }
+
+    pub fn async_put(
+        &mut self,
+        key: &[u8],
+        value: &[u8],
+    ) -> Result<BoxFuture<'static, RaftCmdResponse>> {
+        let mut region = self.get_region(key);
+        let reqs = vec![new_put_cmd(key, value)];
+        let put = new_request(region.get_id(), region.take_region_epoch(), reqs, false);
+        Ok(self.async_request(put))
+    }
+
     pub fn must_put(&mut self, key: &[u8], value: &[u8]) {
         self.must_put_cf(CF_DEFAULT, key, value);
     }
@@ -1666,6 +1721,50 @@
         }
     }
 
+    pub fn must_empty_region_removed_records(&mut self, region_id: u64) {
+        let timer = Instant::now();
+        loop {
+            thread::sleep(Duration::from_millis(100));
+
+            let leader = match self.leader_of_region(region_id) {
+                None => continue,
+                Some(l) => l,
+            };
+            let region_state = self.region_local_state(region_id, leader.get_store_id());
+            if region_state.get_removed_records().is_empty() {
+                return;
+            }
+            if timer.saturating_elapsed() > Duration::from_secs(5) {
+                panic!(
+                    "removed records must be empty, {:?}",
+                    region_state
+                );
+            }
+        }
+    }
+
+    pub fn must_empty_region_merged_records(&mut self, region_id: u64) {
+        let timer = Instant::now();
+        loop {
+            thread::sleep(Duration::from_millis(100));
+
+            let leader = match self.leader_of_region(region_id) {
+                None => continue,
+                Some(l) => l,
+            };
+            let region_state = self.region_local_state(region_id,
leader.get_store_id()); + if region_state.get_merged_records().is_empty() { + return; + } + if timer.saturating_elapsed() > Duration::from_secs(5) { + panic!( + "merged records and removed records must be empty, {:?}", + region_state + ); + } + } + } + pub fn get_snap_dir(&self, node_id: u64) -> String { self.sim.rl().get_snap_dir(node_id) } @@ -1780,15 +1879,17 @@ impl, EK: KvEngine> Cluster { } self.leaders.clear(); for store_meta in self.store_metas.values() { - while Arc::strong_count(store_meta) != 1 { + // Limits the loop count of checking. + let mut idx = 0; + while Arc::strong_count(store_meta) != 1 && idx < MAX_WAIT_RELEASE_INTERVAL { std::thread::sleep(Duration::from_millis(10)); + idx += 1; } } self.store_metas.clear(); for sst_worker in self.sst_workers.drain(..) { sst_worker.stop_worker(); } - debug!("all nodes are shut down."); } diff --git a/components/test_raftstore-v2/src/server.rs b/components/test_raftstore-v2/src/server.rs index 7b5d501a59f..5073304e17a 100644 --- a/components/test_raftstore-v2/src/server.rs +++ b/components/test_raftstore-v2/src/server.rs @@ -222,6 +222,11 @@ impl RaftExtension for TestExtension { self.extension.report_store_unreachable(store_id) } + #[inline] + fn report_store_maybe_tombstone(&self, store_id: u64) { + self.extension.report_store_maybe_tombstone(store_id) + } + #[inline] fn report_snapshot_status( &self, @@ -556,6 +561,7 @@ impl ServerCluster { Arc::clone(&importer), Some(store_meta), resource_manager.clone(), + Arc::new(region_info_accessor.clone()), ); // Create deadlock service. diff --git a/components/test_raftstore-v2/src/util.rs b/components/test_raftstore-v2/src/util.rs index 805394b1ea0..0efad0505e8 100644 --- a/components/test_raftstore-v2/src/util.rs +++ b/components/test_raftstore-v2/src/util.rs @@ -1,6 +1,11 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
-use std::{fmt::Write, path::Path, sync::Arc, thread, time::Duration};
+use std::{
+    fmt::Write,
+    sync::Arc,
+    thread,
+    time::{Duration, Instant},
+};
 
 use encryption_export::{data_key_manager_from_config, DataKeyManager};
 use engine_rocks::{RocksEngine, RocksStatistics};
@@ -8,17 +13,20 @@ use engine_test::raft::RaftTestEngine;
 use engine_traits::{CfName, KvEngine, TabletRegistry, CF_DEFAULT};
 use file_system::IoRateLimiter;
 use futures::future::BoxFuture;
+use grpcio::{ChannelBuilder, Environment};
 use kvproto::{
     encryptionpb::EncryptionMethod,
-    kvrpcpb::Context,
+    kvrpcpb::{Context, DiskFullOpt, GetResponse, Mutation, PrewriteResponse},
     metapb,
     raft_cmdpb::{CmdType, RaftCmdRequest, RaftCmdResponse},
+    tikvpb::TikvClient,
 };
 use raftstore::{store::ReadResponse, Result};
 use rand::{prelude::SliceRandom, RngCore};
 use server::common::ConfiguredRaftEngine;
 use tempfile::TempDir;
-use test_raftstore::{new_get_cmd, new_put_cf_cmd, new_request, new_snap_cmd, Config};
+use test_pd_client::TestPdClient;
+use test_raftstore::{new_get_cmd, new_put_cf_cmd, new_request, new_snap_cmd, sleep_ms, Config};
 use tikv::{
     server::KvEngineFactoryBuilder,
     storage::{
@@ -27,7 +35,8 @@
     },
 };
 use tikv_util::{
-    config::ReadableDuration, escape, future::block_on_timeout, worker::LazyWorker, HandyRwLock,
+    config::ReadableDuration, escape, future::block_on_timeout, time::InstantExt,
+    worker::LazyWorker, HandyRwLock,
 };
 use txn_types::Key;
 
@@ -131,12 +140,12 @@
 }
 
 pub fn configure_for_encryption(config: &mut Config) {
-    let manifest_dir = Path::new(env!("CARGO_MANIFEST_DIR"));
+    let master_key = test_util::new_test_file_master_key(config.cfg_dir.as_ref().unwrap().path());
 
     let cfg = &mut config.security.encryption;
     cfg.data_encryption_method = EncryptionMethod::Aes128Ctr;
     cfg.data_key_rotation_period = ReadableDuration(Duration::from_millis(100));
-    cfg.master_key = test_util::new_test_file_master_key(manifest_dir);
+    cfg.master_key = master_key;
 }
 
 pub fn configure_for_snapshot(config: &mut Config) {
@@ -447,3 +456,136 @@ pub fn wait_down_peers<T: Simulator<EK>, EK: KvEngine>(
         peers, count, peer
     );
 }
+
+pub fn wait_region_epoch_change<T: Simulator<EK>, EK: KvEngine>(
+    cluster: &Cluster<T, EK>,
+    waited_region: &metapb::Region,
+    timeout: Duration,
+) {
+    let timer = Instant::now();
+    loop {
+        if waited_region.get_region_epoch().get_version()
+            == cluster
+                .get_region_epoch(waited_region.get_id())
+                .get_version()
+        {
+            if timer.saturating_elapsed() > timeout {
+                panic!(
+                    "region {:?}, region epoch still has not changed.",
+                    waited_region
+                );
+            }
+        } else {
+            break;
+        }
+        sleep_ms(10);
+    }
+}
+
+pub struct PeerClient {
+    pub cli: TikvClient,
+    pub ctx: Context,
+}
+
+impl PeerClient {
+    pub fn new<EK: KvEngine>(
+        cluster: &Cluster<ServerCluster<EK>, EK>,
+        region_id: u64,
+        peer: metapb::Peer,
+    ) -> PeerClient {
+        let cli = {
+            let env = Arc::new(Environment::new(1));
+            let channel =
+                ChannelBuilder::new(env).connect(&cluster.sim.rl().get_addr(peer.get_store_id()));
+            TikvClient::new(channel)
+        };
+        let ctx = {
+            let epoch = cluster.get_region_epoch(region_id);
+            let mut ctx = Context::default();
+            ctx.set_region_id(region_id);
+            ctx.set_peer(peer);
+            ctx.set_region_epoch(epoch);
+            ctx
+        };
+        PeerClient { cli, ctx }
+    }
+
+    pub fn kv_read(&self, key: Vec<u8>, ts: u64) -> GetResponse {
+        test_raftstore::kv_read(&self.cli, self.ctx.clone(), key, ts)
+    }
+
+    pub fn must_kv_read_equal(&self, key: Vec<u8>, val: Vec<u8>, ts: u64) {
+        test_raftstore::must_kv_read_equal(&self.cli, self.ctx.clone(), key, val, ts)
+    }
+
+    pub fn must_kv_write(
+        &self,
+        pd_client: &TestPdClient,
+        kvs: Vec<Mutation>,
+        pk: Vec<u8>,
+    ) -> u64 {
+        test_raftstore::must_kv_write(pd_client, &self.cli, self.ctx.clone(), kvs, pk)
+    }
+
+    pub fn must_kv_prewrite(&self, muts: Vec<Mutation>, pk: Vec<u8>, ts: u64) {
+        test_raftstore::must_kv_prewrite(&self.cli, self.ctx.clone(), muts, pk, ts)
+    }
+
+    pub fn try_kv_prewrite(
+        &self,
+        muts: Vec<Mutation>,
+        pk: Vec<u8>,
+        ts: u64,
+        opt: DiskFullOpt,
+    ) -> PrewriteResponse {
+        let mut ctx = self.ctx.clone();
+        ctx.disk_full_opt = opt;
+        test_raftstore::try_kv_prewrite(&self.cli, ctx, muts, pk, ts)
+    }
+
+    pub fn must_kv_prewrite_async_commit(&self, muts: Vec<Mutation>, pk: Vec<u8>, ts: u64) {
+        test_raftstore::must_kv_prewrite_with(
+            &self.cli,
+            self.ctx.clone(),
+            muts,
+            vec![],
+            pk,
+            ts,
+            0,
+            true,
+            false,
+        )
+    }
+
+    pub fn must_kv_prewrite_one_pc(&self, muts: Vec<Mutation>, pk: Vec<u8>, ts: u64) {
+        test_raftstore::must_kv_prewrite_with(
+            &self.cli,
+            self.ctx.clone(),
+            muts,
+            vec![],
+            pk,
+            ts,
+            0,
+            false,
+            true,
+        )
+    }
+
+    pub fn must_kv_commit(&self, keys: Vec<Vec<u8>>, start_ts: u64, commit_ts: u64) {
+        test_raftstore::must_kv_commit(
+            &self.cli,
+            self.ctx.clone(),
+            keys,
+            start_ts,
+            commit_ts,
+            commit_ts,
+        )
+    }
+
+    pub fn must_kv_rollback(&self, keys: Vec<Vec<u8>>, start_ts: u64) {
+        test_raftstore::must_kv_rollback(&self.cli, self.ctx.clone(), keys, start_ts)
+    }
+
+    pub fn must_kv_pessimistic_lock(&self, key: Vec<u8>, ts: u64) {
+        test_raftstore::must_kv_pessimistic_lock(&self.cli, self.ctx.clone(), key, ts)
+    }
+
+    pub fn must_kv_pessimistic_rollback(&self, key: Vec<u8>, ts: u64) {
+        test_raftstore::must_kv_pessimistic_rollback(&self.cli, self.ctx.clone(), key, ts, ts)
+    }
+}
diff --git a/components/test_raftstore/src/cluster.rs b/components/test_raftstore/src/cluster.rs
index 23edf0efab1..912b61e041a 100644
--- a/components/test_raftstore/src/cluster.rs
+++ b/components/test_raftstore/src/cluster.rs
@@ -4,7 +4,10 @@ use std::{
     collections::hash_map::Entry as MapEntry,
     error::Error as StdError,
     result,
-    sync::{mpsc, Arc, Mutex, RwLock},
+    sync::{
+        mpsc::{self},
+        Arc, Mutex, RwLock,
+    },
     thread,
     time::Duration,
 };
@@ -19,7 +22,7 @@ use engine_traits::{
     WriteBatch, WriteBatchExt, CF_DEFAULT, CF_RAFT,
 };
 use file_system::IoRateLimiter;
-use futures::{self, channel::oneshot, executor::block_on, future::BoxFuture};
+use futures::{self, channel::oneshot, executor::block_on, future::BoxFuture, StreamExt};
 use kvproto::{
     errorpb::Error as PbError,
     kvrpcpb::{ApiVersion, Context, DiskFullOpt},
@@ -49,9 +52,8 @@ use raftstore::{
 use resource_control::ResourceGroupManager;
 use tempfile::TempDir;
 use test_pd_client::TestPdClient;
-use tikv::server::Result as ServerResult;
+use tikv::{config::TikvConfig, server::Result as ServerResult};
 use tikv_util::{
-    mpsc::future,
     thread_group::GroupProperties,
     time::{Instant, ThreadReadId},
     worker::LazyWorker,
@@ -192,10 +194,7 @@ impl<T: Simulator> Cluster<T> {
         // TODO: In the future, maybe it's better to test both case where
         // `use_delete_range` is true and false
         Cluster {
-            cfg: Config {
-                tikv: new_tikv_config_with_api_ver(id, api_version),
-                prefer_mem: true,
-            },
+            cfg: Config::new(new_tikv_config_with_api_ver(id, api_version), true),
             leaders: HashMap::default(),
             count,
             paths: vec![],
@@ -217,6 +216,11 @@
         }
     }
 
+    pub fn set_cfg(&mut self, mut cfg: TikvConfig) {
+        cfg.cfg_path = self.cfg.tikv.cfg_path.clone();
+        self.cfg.tikv = cfg;
+    }
+
     // To destroy temp dir later.
    pub fn take_path(&mut self) -> Vec<TempDir> {
        std::mem::take(&mut self.paths)
@@ -969,7 +973,7 @@ impl<T: Simulator> Cluster<T> {
    pub fn async_request(
        &mut self,
        req: RaftCmdRequest,
-    ) -> Result<future::Receiver<RaftCmdResponse>> {
+    ) -> Result<BoxFuture<'static, RaftCmdResponse>> {
        self.async_request_with_opts(req, Default::default())
    }
 
@@ -977,21 +981,24 @@
        &mut self,
        mut req: RaftCmdRequest,
        opts: RaftCmdExtraOpts,
-    ) -> Result<future::Receiver<RaftCmdResponse>> {
+    ) -> Result<BoxFuture<'static, RaftCmdResponse>> {
        let region_id = req.get_header().get_region_id();
        let leader = self.leader_of_region(region_id).unwrap();
        req.mut_header().set_peer(leader.clone());
-        let (cb, rx) = make_cb(&req);
+        let (cb, mut rx) = make_cb(&req);
        self.sim
            .rl()
            .async_command_on_node_with_opts(leader.get_store_id(), req, cb, opts)?;
-        Ok(rx)
+        Ok(Box::pin(async move {
+            let fut = rx.next();
+            fut.await.unwrap()
+        }))
    }
 
    pub fn async_exit_joint(
        &mut self,
        region_id: u64,
-    ) -> Result<future::Receiver<RaftCmdResponse>> {
+    ) -> Result<BoxFuture<'static, RaftCmdResponse>> {
        let region = block_on(self.pd_client.get_region_by_id(region_id))
            .unwrap()
            .unwrap();
@@ -1007,7 +1014,7 @@
        &mut self,
        key: &[u8],
        value: &[u8],
-    ) -> Result<future::Receiver<RaftCmdResponse>> {
+    ) -> Result<BoxFuture<'static, RaftCmdResponse>> {
        let mut region = self.get_region(key);
        let reqs = vec![new_put_cmd(key, value)];
        let put = new_request(region.get_id(), region.take_region_epoch(), reqs, false);
@@ -1018,7 +1025,7 @@
        &mut self,
        region_id: u64,
        peer: metapb::Peer,
-    ) -> Result<future::Receiver<RaftCmdResponse>> {
+    ) -> Result<BoxFuture<'static, RaftCmdResponse>> {
        let region = block_on(self.pd_client.get_region_by_id(region_id))
            .unwrap()
            .unwrap();
@@ -1031,7 +1038,7 @@
        &mut self,
        region_id: u64,
        peer: metapb::Peer,
-    ) -> Result<future::Receiver<RaftCmdResponse>> {
+    ) -> Result<BoxFuture<'static, RaftCmdResponse>> {
        let region = block_on(self.pd_client.get_region_by_id(region_id))
            .unwrap()
            .unwrap();
@@ -1296,6 +1303,34 @@
        );
    }
 
+    pub fn wait_peer_role(&self, region_id: u64, store_id: u64, peer_id: u64, role: PeerRole) {
+        for _ in 0..100 {
+            if let Some(state) = self
+                .get_engine(store_id)
+                .get_msg_cf::<RegionLocalState>(
+                    engine_traits::CF_RAFT,
+                    &keys::region_state_key(region_id),
+                )
+                .unwrap()
+            {
+                let peer = state
+                    .get_region()
+                    .get_peers()
+                    .iter()
+                    .find(|p| p.get_id() == peer_id)
+                    .unwrap();
+                if peer.role == role {
+                    return;
+                }
+            }
+            sleep_ms(10);
+        }
+        panic!(
+            "[region {}] peer role still has not reached {:?}",
+            region_id, role
+        );
+    }
+
    pub fn wait_last_index(
        &mut self,
        region_id: u64,
@@ -1461,6 +1496,7 @@
                    split_keys: vec![split_key],
                    callback: cb,
                    source: "test".into(),
+                    share_source_region_size: false,
                },
            )
            .unwrap();
@@ -1932,7 +1968,7 @@
                start_key: None,
                end_key: None,
                policy: CheckPolicy::Scan,
-                source: "test",
+                source: "bucket",
                cb,
            },
        )
diff --git a/components/test_raftstore/src/config.rs b/components/test_raftstore/src/config.rs
index a86b8eb1bf0..001e304ece8 100644
--- a/components/test_raftstore/src/config.rs
+++ b/components/test_raftstore/src/config.rs
@@ -4,12 +4,37 @@ use std::ops::{Deref, DerefMut};
 
 use tikv::config::TikvConfig;
 
-#[derive(Clone)]
 pub struct Config {
+    // Temp dir storing the persisted configuration.
+    // We use a temp dir to ensure that the original `common-test.toml` won't
+    // be changed by online config.
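+    // Holding the `TempDir` keeps the directory alive for this config's
+    // lifetime; the manual `Clone` impl below drops it on purpose, so clones
+    // keep using the path that was already persisted by the original.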
+    pub cfg_dir: Option<TempDir>,
    pub tikv: TikvConfig,
    pub prefer_mem: bool,
 }
 
+impl Config {
+    pub fn new(mut tikv: TikvConfig, prefer_mem: bool) -> Self {
+        let cfg_dir = test_util::temp_dir("test-cfg", prefer_mem);
+        tikv.cfg_path = cfg_dir.path().join("tikv.toml").display().to_string();
+        Self {
+            cfg_dir: Some(cfg_dir),
+            tikv,
+            prefer_mem,
+        }
+    }
+}
+
+impl Clone for Config {
+    fn clone(&self) -> Self {
+        Self {
+            cfg_dir: None,
+            tikv: self.tikv.clone(),
+            prefer_mem: self.prefer_mem,
+        }
+    }
+}
+
 impl Deref for Config {
    type Target = TikvConfig;
    #[inline]
diff --git a/components/test_raftstore/src/node.rs b/components/test_raftstore/src/node.rs
index f429f27ff8b..6b743a62e1b 100644
--- a/components/test_raftstore/src/node.rs
+++ b/components/test_raftstore/src/node.rs
@@ -334,6 +334,7 @@ impl Simulator for NodeCluster {
            cm,
            CollectorRegHandle::new_for_test(),
            None,
+            DiskCheckRunner::dummy(),
            GrpcServiceManager::dummy(),
            Arc::new(AtomicU64::new(0)),
        )?;
diff --git a/components/test_raftstore/src/router.rs b/components/test_raftstore/src/router.rs
index 3b6b1e962c3..d6a135c9f9a 100644
--- a/components/test_raftstore/src/router.rs
+++ b/components/test_raftstore/src/router.rs
@@ -60,7 +60,7 @@ impl CasualRouter<RocksEngine> for MockRaftStoreRouter {
    fn send(&self, region_id: u64, msg: CasualMessage<RocksEngine>) -> RaftStoreResult<()> {
        let mut senders = self.senders.lock().unwrap();
        if let Some(tx) = senders.get_mut(&region_id) {
-            tx.try_send(PeerMsg::CasualMessage(msg))
+            tx.try_send(PeerMsg::CasualMessage(Box::new(msg)))
                .map_err(|e| handle_send_error(region_id, e))
        } else {
            Err(RaftStoreError::RegionNotFound(region_id))
@@ -76,7 +76,8 @@ impl SignificantRouter<RocksEngine> for MockRaftStoreRouter {
    ) -> RaftStoreResult<()> {
        let mut senders = self.senders.lock().unwrap();
        if let Some(tx) = senders.get_mut(&region_id) {
-            tx.force_send(PeerMsg::SignificantMsg(msg)).unwrap();
+            tx.force_send(PeerMsg::SignificantMsg(Box::new(msg)))
+                .unwrap();
            Ok(())
        } else {
            error!("failed to send significant msg"; "msg" => ?msg);
diff --git a/components/test_raftstore/src/server.rs b/components/test_raftstore/src/server.rs
index 8d26bae968d..a6673ebb66e 100644
--- a/components/test_raftstore/src/server.rs
+++ b/components/test_raftstore/src/server.rs
@@ -37,8 +37,9 @@ use raftstore::{
    store::{
        fsm::{store::StoreMeta, ApplyRouter, RaftBatchSystem, RaftRouter},
        msg::RaftCmdExtraOpts,
-        AutoSplitController, Callback, CheckLeaderRunner, LocalReader, RegionSnapshot, SnapManager,
-        SnapManagerBuilder, SplitCheckRunner, SplitConfigManager, StoreMetaDelegate,
+        AutoSplitController, Callback, CheckLeaderRunner, DiskCheckRunner, LocalReader,
+        RegionSnapshot, SnapManager, SnapManagerBuilder, SplitCheckRunner, SplitConfigManager,
+        StoreMetaDelegate,
    },
    Result,
 };
@@ -113,8 +114,8 @@ impl StoreAddrResolver for AddressMap {
    fn resolve(
        &self,
        store_id: u64,
-        cb: Box<dyn FnOnce(ServerResult<String>) + Send>,
-    ) -> ServerResult<()> {
+        cb: Box<dyn FnOnce(resolve::Result<String>) + Send>,
+    ) -> resolve::Result<()> {
        let addr = self.get(store_id);
        match addr {
            Some(addr) => cb(Ok(addr)),
@@ -451,6 +452,7 @@ impl ServerCluster {
            Arc::clone(&importer),
            None,
            resource_manager.clone(),
+            Arc::new(region_info_accessor.clone()),
        );
 
        // Create deadlock service.
@@ -616,6 +618,7 @@ impl ServerCluster {
            concurrency_manager.clone(),
            collector_reg_handle,
            causal_ts_provider,
+            DiskCheckRunner::dummy(),
            GrpcServiceManager::dummy(),
            Arc::new(AtomicU64::new(0)),
        )?;
diff --git a/components/test_raftstore/src/transport_simulate.rs b/components/test_raftstore/src/transport_simulate.rs
index ef569e3987a..66568b0e2ba 100644
--- a/components/test_raftstore/src/transport_simulate.rs
+++ b/components/test_raftstore/src/transport_simulate.rs
@@ -268,6 +268,12 @@ pub trait FilterFactory {
    fn generate(&self, node_id: u64) -> Vec<Box<dyn Filter>>;
 }
 
+impl<F: Fn(u64) -> Fl, Fl: Filter + 'static> FilterFactory for F {
+    fn generate(&self, node_id: u64) -> Vec<Box<dyn Filter>> {
+        vec![Box::new(self(node_id)) as _]
+    }
+}
+
 #[derive(Default)]
 pub struct DefaultFilterFactory<F: Filter + Default>(PhantomData<F>);
diff --git a/components/test_raftstore/src/util.rs b/components/test_raftstore/src/util.rs
index 02a74136bb6..21d90097b22 100644
--- a/components/test_raftstore/src/util.rs
+++ b/components/test_raftstore/src/util.rs
@@ -17,7 +17,7 @@ use engine_rocks::{config::BlobRunMode, RocksEngine, RocksSnapshot, RocksStatistics};
 use engine_test::raft::RaftTestEngine;
 use engine_traits::{
    CfName, CfNamesExt, Engines, Iterable, KvEngine, Peekable, RaftEngineDebug, RaftEngineReadOnly,
-    CF_DEFAULT, CF_RAFT,
+    CF_DEFAULT, CF_RAFT, CF_WRITE,
 };
 use file_system::IoRateLimiter;
 use futures::{executor::block_on, future::BoxFuture, StreamExt};
@@ -56,7 +56,12 @@ use tikv::{
 };
 pub use tikv_util::store::{find_peer, new_learner_peer, new_peer};
 use tikv_util::{
-    config::*, escape, mpsc::future, time::ThreadReadId, worker::LazyWorker, HandyRwLock,
+    config::*,
+    escape,
+    mpsc::future,
+    time::{Instant, ThreadReadId},
+    worker::LazyWorker,
+    HandyRwLock,
 };
 use txn_types::Key;
 
@@ -81,15 +86,14 @@ pub fn must_get(
    }
    debug!("last try to get {}", log_wrappers::hex_encode_upper(key));
    let res = engine.get_value_cf(cf, &keys::data_key(key)).unwrap();
-    if value.is_none() && res.is_none()
-        || value.is_some() && res.is_some() && value.unwrap() == &*res.unwrap()
-    {
+    if value == res.as_ref().map(|r| r.as_ref()) {
        return;
    }
    panic!(
-        "can't get value {:?} for key {}",
-        value.map(escape),
-        log_wrappers::hex_encode_upper(key)
+        "can't get value {:?} for key {}, actual={:?}",
+        value.map(escape),
+        log_wrappers::hex_encode_upper(key),
+        res
    )
 }
 
@@ -610,6 +614,7 @@
    }
 }
 
+#[track_caller]
 pub fn must_contains_error(resp: &RaftCmdResponse, msg: &str) {
    let header = resp.get_header();
    assert!(header.has_error());
@@ -669,11 +674,11 @@ pub fn create_test_engine(
    )
 }
 
-pub fn configure_for_request_snapshot<T: Simulator>(cluster: &mut Cluster<T>) {
+pub fn configure_for_request_snapshot(config: &mut Config) {
    // We don't want to generate snapshots due to compact log.
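+    // Raising the log-GC thresholds below leaves enough raft log around that
+    // followers catch up through log replication rather than snapshots.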
- cluster.cfg.raft_store.raft_log_gc_threshold = 1000; - cluster.cfg.raft_store.raft_log_gc_count_limit = Some(1000); - cluster.cfg.raft_store.raft_log_gc_size_limit = Some(ReadableSize::mb(20)); + config.raft_store.raft_log_gc_threshold = 1000; + config.raft_store.raft_log_gc_count_limit = Some(1000); + config.raft_store.raft_log_gc_size_limit = Some(ReadableSize::mb(20)); } pub fn configure_for_hibernate(config: &mut Config) { @@ -784,6 +789,14 @@ pub fn put_till_size( put_cf_till_size(cluster, CF_DEFAULT, limit, range) } +pub fn put_till_count( + cluster: &mut Cluster, + limit: u64, + range: &mut dyn Iterator, +) -> Vec { + put_cf_till_count(cluster, CF_WRITE, limit, range) +} + pub fn put_cf_till_size( cluster: &mut Cluster, cf: &'static str, @@ -816,6 +829,36 @@ pub fn put_cf_till_size( key.into_bytes() } +pub fn put_cf_till_count( + cluster: &mut Cluster, + cf: &'static str, + limit: u64, + range: &mut dyn Iterator, +) -> Vec { + assert!(limit > 0); + let mut len = 0; + let mut rng = rand::thread_rng(); + let mut key = String::new(); + let mut value = vec![0; 64]; + while len < limit { + let batch_size = std::cmp::min(5, limit - len); + let mut reqs = vec![]; + for _ in 0..batch_size { + key.clear(); + let key_id = range.next().unwrap(); + write!(key, "{:09}", key_id).unwrap(); + rng.fill_bytes(&mut value); + reqs.push(new_put_cf_cmd(cf, key.as_bytes(), &value)); + } + len += batch_size; + cluster.batch_put(key.as_bytes(), reqs).unwrap(); + // Approximate size of memtable is inaccurate for small data, + // we flush it to SST so we can use the size properties instead. + cluster.must_flush_cf(cf, true); + } + key.into_bytes() +} + pub fn new_mutation(op: Op, k: &[u8], v: &[u8]) -> Mutation { let mut mutation = Mutation::default(); mutation.set_op(op); @@ -959,6 +1002,7 @@ pub fn must_kv_prewrite_with( client: &TikvClient, ctx: Context, muts: Vec, + pessimistic_actions: Vec, pk: Vec, ts: u64, for_update_ts: u64, @@ -968,7 +1012,7 @@ pub fn must_kv_prewrite_with( let mut prewrite_req = PrewriteRequest::default(); prewrite_req.set_context(ctx); if for_update_ts != 0 { - prewrite_req.pessimistic_actions = vec![DoPessimisticCheck; muts.len()]; + prewrite_req.pessimistic_actions = pessimistic_actions; } prewrite_req.set_mutations(muts.into_iter().collect()); prewrite_req.primary_lock = pk; @@ -995,6 +1039,7 @@ pub fn try_kv_prewrite_with( client: &TikvClient, ctx: Context, muts: Vec, + pessimistic_actions: Vec, pk: Vec, ts: u64, for_update_ts: u64, @@ -1005,6 +1050,7 @@ pub fn try_kv_prewrite_with( client, ctx, muts, + pessimistic_actions, pk, ts, for_update_ts, @@ -1018,6 +1064,7 @@ pub fn try_kv_prewrite_with_impl( client: &TikvClient, ctx: Context, muts: Vec, + pessimistic_actions: Vec, pk: Vec, ts: u64, for_update_ts: u64, @@ -1027,7 +1074,7 @@ pub fn try_kv_prewrite_with_impl( let mut prewrite_req = PrewriteRequest::default(); prewrite_req.set_context(ctx); if for_update_ts != 0 { - prewrite_req.pessimistic_actions = vec![DoPessimisticCheck; muts.len()]; + prewrite_req.pessimistic_actions = pessimistic_actions; } prewrite_req.set_mutations(muts.into_iter().collect()); prewrite_req.primary_lock = pk; @@ -1047,7 +1094,7 @@ pub fn try_kv_prewrite( pk: Vec, ts: u64, ) -> PrewriteResponse { - try_kv_prewrite_with(client, ctx, muts, pk, ts, 0, false, false) + try_kv_prewrite_with(client, ctx, muts, vec![], pk, ts, 0, false, false) } pub fn try_kv_prewrite_pessimistic( @@ -1057,7 +1104,18 @@ pub fn try_kv_prewrite_pessimistic( pk: Vec, ts: u64, ) -> PrewriteResponse { - 
try_kv_prewrite_with(client, ctx, muts, pk, ts, ts, false, false) + let len = muts.len(); + try_kv_prewrite_with( + client, + ctx, + muts, + vec![DoPessimisticCheck; len], + pk, + ts, + ts, + false, + false, + ) } pub fn must_kv_prewrite( @@ -1067,7 +1125,7 @@ pub fn must_kv_prewrite( pk: Vec, ts: u64, ) { - must_kv_prewrite_with(client, ctx, muts, pk, ts, 0, false, false) + must_kv_prewrite_with(client, ctx, muts, vec![], pk, ts, 0, false, false) } pub fn must_kv_prewrite_pessimistic( @@ -1077,7 +1135,18 @@ pub fn must_kv_prewrite_pessimistic( pk: Vec, ts: u64, ) { - must_kv_prewrite_with(client, ctx, muts, pk, ts, ts, false, false) + let len = muts.len(); + must_kv_prewrite_with( + client, + ctx, + muts, + vec![DoPessimisticCheck; len], + pk, + ts, + ts, + false, + false, + ) } pub fn must_kv_commit( @@ -1233,6 +1302,50 @@ pub fn must_check_txn_status( resp } +pub fn must_kv_have_locks( + client: &TikvClient, + ctx: Context, + ts: u64, + start_key: &[u8], + end_key: &[u8], + expected_locks: &[( + // key + &[u8], + Op, + // start_ts + u64, + // for_update_ts + u64, + )], +) { + let mut req = ScanLockRequest::default(); + req.set_context(ctx); + req.set_limit(100); + req.set_start_key(start_key.to_vec()); + req.set_end_key(end_key.to_vec()); + req.set_max_version(ts); + let resp = client.kv_scan_lock(&req).unwrap(); + assert!(!resp.has_region_error(), "{:?}", resp.get_region_error()); + assert!(resp.error.is_none(), "{:?}", resp.get_error()); + + assert_eq!( + resp.locks.len(), + expected_locks.len(), + "lock count not match, expected: {:?}; got: {:?}", + expected_locks, + resp.locks + ); + + for (lock_info, (expected_key, expected_op, expected_start_ts, expected_for_update_ts)) in + resp.locks.into_iter().zip(expected_locks.iter()) + { + assert_eq!(lock_info.get_key(), *expected_key); + assert_eq!(lock_info.get_lock_type(), *expected_op); + assert_eq!(lock_info.get_lock_version(), *expected_start_ts); + assert_eq!(lock_info.get_lock_for_update_ts(), *expected_for_update_ts); + } +} + pub fn get_tso(pd_client: &TestPdClient) -> u64 { block_on(pd_client.get_tso()).unwrap().into_inner() } @@ -1310,17 +1423,33 @@ pub fn must_raw_put(client: &TikvClient, ctx: Context, key: Vec, value: Vec< put_req.set_context(ctx); put_req.key = key; put_req.value = value; - let put_resp = client.raw_put(&put_req).unwrap(); - assert!( - !put_resp.has_region_error(), - "{:?}", - put_resp.get_region_error() - ); - assert!( - put_resp.get_error().is_empty(), - "{:?}", - put_resp.get_error() - ); + + let retryable = |err: &kvproto::errorpb::Error| -> bool { err.has_max_timestamp_not_synced() }; + let start = Instant::now_coarse(); + loop { + let put_resp = client.raw_put(&put_req).unwrap(); + if put_resp.has_region_error() { + let err = put_resp.get_region_error(); + if retryable(err) && start.saturating_elapsed() < Duration::from_secs(5) { + debug!("must_raw_put meet region error"; "err" => ?err); + sleep_ms(100); + continue; + } + panic!( + "must_raw_put meet region error: {:?}, ctx: {:?}, key: {}, value {}", + err, + put_req.get_context(), + tikv_util::escape(&put_req.key), + tikv_util::escape(&put_req.value), + ); + } + assert!( + put_resp.get_error().is_empty(), + "must_raw_put meet error: {:?}", + put_resp.get_error() + ); + return; + } } pub fn must_raw_get(client: &TikvClient, ctx: Context, key: Vec) -> Option> { @@ -1441,11 +1570,31 @@ impl PeerClient { } pub fn must_kv_prewrite_async_commit(&self, muts: Vec, pk: Vec, ts: u64) { - must_kv_prewrite_with(&self.cli, self.ctx.clone(), muts, pk, ts, 0, true, 
false) + must_kv_prewrite_with( + &self.cli, + self.ctx.clone(), + muts, + vec![], + pk, + ts, + 0, + true, + false, + ) } pub fn must_kv_prewrite_one_pc(&self, muts: Vec, pk: Vec, ts: u64) { - must_kv_prewrite_with(&self.cli, self.ctx.clone(), muts, pk, ts, 0, false, true) + must_kv_prewrite_with( + &self.cli, + self.ctx.clone(), + muts, + vec![], + pk, + ts, + 0, + false, + true, + ) } pub fn must_kv_commit(&self, keys: Vec>, start_ts: u64, commit_ts: u64) { diff --git a/components/test_util/data/.gitignore b/components/test_util/data/.gitignore new file mode 100644 index 00000000000..0773e460402 --- /dev/null +++ b/components/test_util/data/.gitignore @@ -0,0 +1,2 @@ +ca.key +server.csr diff --git a/components/test_util/data/ca.pem b/components/test_util/data/ca.pem index e130a8eece9..05015192501 100644 --- a/components/test_util/data/ca.pem +++ b/components/test_util/data/ca.pem @@ -1,22 +1,19 @@ -----BEGIN CERTIFICATE----- -MIIDojCCAoqgAwIBAgIUdZFW8VQoZZzek8cA+5GGu6ZInjowDQYJKoZIhvcNAQEL -BQAwVzELMAkGA1UEBhMCQ04xEDAOBgNVBAgTB0JlaWppbmcxEDAOBgNVBAcTB0Jl -aWppbmcxEDAOBgNVBAoTB1BpbmdDQVAxEjAQBgNVBAMTCU15IG93biBDQTAeFw0x -OTA5MDIwNjEyMDBaFw0yNDA4MzEwNjEyMDBaMFcxCzAJBgNVBAYTAkNOMRAwDgYD -VQQIEwdCZWlqaW5nMRAwDgYDVQQHEwdCZWlqaW5nMRAwDgYDVQQKEwdQaW5nQ0FQ -MRIwEAYDVQQDEwlNeSBvd24gQ0EwggEiMA0GCSqGSIb3DQEBAQUAA4IBDwAwggEK -AoIBAQDcDtQ7UX+xlVY0vpklp1uUmPoFsN0U6fqRzHU+LvYS5AM5RPJMVLiKBiSi -zGsB+XPmXZ8H7rZZ+osZsEmDIF3HdyiSNpPNzRJKxsz4KVRzfoKZXL9D41TpuE27 -+7tN6qGytYrnAy8cHMA0S1TnQ0biOFTcXZrwh5lvlIcx7ceUamGuEl94tblxSSJl -2SkpHkKIDv0kcgoGmmh4y8SzAtmnwcCjkCSoITvvwKklp5830pFKOnpN9uZJzkXa -tuUSpSji/JG79nQfH91LtL7xMprORVtg9YAa3aJm0Uf33WFvaCTSrt//7CVK8nqK -xayS3u7dNH3GV9b81OGtlR76leFlAgMBAAGjZjBkMA4GA1UdDwEB/wQEAwIBBjAS -BgNVHRMBAf8ECDAGAQH/AgECMB0GA1UdDgQWBBS3hxTaN9B7eF8xr0DKLZ3b5vFn -rDAfBgNVHSMEGDAWgBS3hxTaN9B7eF8xr0DKLZ3b5vFnrDANBgkqhkiG9w0BAQsF -AAOCAQEAi9WiEvTQQjmb7ekXHf1tKwdLNu5akQXIwTKeZSWRSeMgqVQcoyTZMPBX -ythl6K3175RUIMtCwO4uZTOpRU1mTl0pIjoEcJGHYX91zyA5BjWahXZttvt7/hyX -UwJN9clBXLfZTCp1ysLCtarLcip4WxWNsxEwXFUisE2gbu3F9ELHAbRSVUe/CwC6 -8BkY+G+fovazjGoTV4NadJVFRzTR/zsWkBNllBOBTrop8FH23ePVh3hXafzJlcip -bDbRxNqSzNtLr88mwswklgiIHXF6PY2TkyscsXVkHPAswZnrv4lLov7M3VjL8ITA -uYm4Me5Tmj+6pb+Foky15+ehmicQbA== +MIIDITCCAgmgAwIBAgIUVe4Q3uw8yW0seqG9yQMfXrSXLHswDQYJKoZIhvcNAQEL +BQAwFzEVMBMGA1UEAwwMdGlrdl90ZXN0X2NhMCAXDTI0MDMyNzAwMDAwMFoYDzIx +MjQwMzI3MDAwMDAwWjAXMRUwEwYDVQQDDAx0aWt2X3Rlc3RfY2EwggEiMA0GCSqG +SIb3DQEBAQUAA4IBDwAwggEKAoIBAQC+cR9jZ0LtX4ztcupCEyrR8CNmw1TkIsOQ +rPhP43FkdggQN2vqkM9ZtKxlcODuNul748saEFoK1AGv4MgxgKcbt6sTucdz4oC4 +O1rM31eicU630PblPNU8Bstvlta2jCZAbERBhiAm1C3zQncodyVr9Oa2Ff9SRXcW +7icpv4CTsesPi19nF+EhBAuCifeI3Vj1Uvd5wvsK/m0D0gpp3Vp7CNYwHLv9gfPu +Jui0Q8NM5ENBcIfUBK8zOvr8a5glqV36KUA4m7yDXyYHIR2SrD/y3XDc6cbRgxKS +qbQMbc67H2XJHWjHgp7gv9rdU9HGxfv49j+TnxwYNPb3aflBgk1JAgMBAAGjYzBh +MB0GA1UdDgQWBBS+Qw4MGLTrjFTO2xlGwlj+yy1o7zAfBgNVHSMEGDAWgBS+Qw4M +GLTrjFTO2xlGwlj+yy1o7zASBgNVHRMBAf8ECDAGAQH/AgEAMAsGA1UdDwQEAwIB +BjANBgkqhkiG9w0BAQsFAAOCAQEAezNpYWbkxvIv/MfWxpbF+TBA1ssWT1xyuwUu +P9EdXj72XKnpkmZXxysyCSZR1ZH9XwuqHgQkegxQMKMeiv5UQLouTFEa5LUJxlQw +A3O1Ky1r6dv6p/JkOAbMxh+VoWAFCW5Ioo81rwZLGu3DS3+gsauDBpevIqLlL29H +FAQ+JO33pzuAP+PBHnO0Zi3ddvgDNULpHQhC8BUR9fI/NsxKuS4QwPGK1fnd/Qvg +w5aP0PZ8CNheIvVy4qkeUsh2kS7vghMpwa/KkCGurmwg2C6sgCTDKJVgkTOEutx4 +5LfVfuwAmWWnrqWwLtAyJI5SAUhXLtSu2DeyC5ppP5DFRBP3qw== -----END CERTIFICATE----- diff --git a/components/test_util/data/generate_certs.sh b/components/test_util/data/generate_certs.sh new file mode 100755 index 00000000000..08156aa5e68 --- /dev/null +++ 
b/components/test_util/data/generate_certs.sh @@ -0,0 +1,40 @@ +#!/bin/bash +# +# Copyright 2024 TiKV Project Authors. Licensed under Apache-2.0. + +set -euo pipefail + +SCRIPT_PATH="$(realpath "$0")" +CERT_DIR="$(dirname "$SCRIPT_PATH")" +CA_KEY="$CERT_DIR/ca.key" +CA_CERT="$CERT_DIR/ca.pem" +SERVER_KEY="$CERT_DIR/key.pem" +SERVER_CSR="$CERT_DIR/server.csr" +SERVER_CERT="$CERT_DIR/server.pem" +VALID_DAYS=3650 +RSA_KEY_SIZE=2048 + +# CA certs. +openssl genrsa -out "$CA_KEY" "$RSA_KEY_SIZE" +openssl req -new -x509 -days "$VALID_DAYS" -key "$CA_KEY" -out "$CA_CERT" \ + -subj "/CN=tikv_test_ca" \ + -addext "basicConstraints = critical,CA:TRUE,pathlen:0" \ + -addext "keyUsage = cRLSign, keyCertSign" +echo "CA certificate:" +openssl x509 -text -in "$CA_CERT" -noout + +# Server certs. +openssl genrsa -out "$SERVER_KEY" "$RSA_KEY_SIZE" +openssl req -new -key "$SERVER_KEY" -out "$SERVER_CSR" \ + -extensions v3_ca \ + -subj "/CN=tikv-server" \ + -addext "basicConstraints = critical, CA:FALSE" \ + -addext "keyUsage = critical, digitalSignature, keyEncipherment" \ + -addext "extendedKeyUsage = serverAuth, clientAuth" \ + -addext "subjectAltName = IP.1:172.16.5.40, IP.2:127.0.0.1" +openssl x509 -req -days "$VALID_DAYS" \ + -CA "$CA_CERT" -CAkey "$CA_KEY" -CAcreateserial \ + -copy_extensions copyall \ + -in "$SERVER_CSR" -out "$SERVER_CERT" +echo "Server certificate:" +openssl x509 -text -in "$SERVER_CERT" -noout diff --git a/components/test_util/data/key.pem b/components/test_util/data/key.pem index c7f9fa8c340..61ab0c3f029 100644 --- a/components/test_util/data/key.pem +++ b/components/test_util/data/key.pem @@ -1,27 +1,28 @@ ------BEGIN RSA PRIVATE KEY----- -MIIEogIBAAKCAQEAsRpq/E/VC82YxsC5LlKFvI9HJuchMtKskn53anW4rNE3sfN0 -WDS6qCyxNumUVBqO98J18xxbz/XkV7aP6TcXZrNgEqw07PZWTDoyZVi+n9HXyWwl -BeiE2WWrCESqsar+cXV5UE3oE7Y4CT56tMN+awKqnf1zLyRl9DlqSg1/GabheVzz -fGhdqddqdpAZcaOHH8UMEWdnZ4qTFaaGNRlrRy3W0VjzgIocQorpvvtZkQM5iCxx -z9wuF9/6gGdopTA0J2SvZKa+oI/867NLpN5Hx+cn/ThHhCTh1N34Ulloa0aiou72 -mGgyMIdQxYAsRnG62EHn+9aPtegIjQd13Be9/wIDAQABAoIBAHJ8v3iIKxNMP10M -rSlS032HqdluRLnUExdIhe3eWBnvze9NkIKM47Vf3te+u9J6sL1dil40kO2o6YoC -TJnYsVoEzzCC/lvJCxSP8pAthF1QjAx7yps9KtRWsu/PZAEipwW1iUzub/5+J09i -gnRkhE6tFJq5g0KQZxAwJZPlkaqEcZIOObfh9zD9hutvCPmXBtB600EbQU4XzyjP -KaU08LtNZVm4mhKMuhXuFt8LBkjjfuw6zNcjsvgMkyflFTLc/SgWWIpq1ALHQCsq -OiFfTPyuLy+8tGTbawvRIqiHHRd23XttPcfkdfWbNVTSBfodTOhXGFaVYbJ6EVA4 -OzVzftECgYEAz/D99wpWbjU8vye5cjKjZCY/+QnY0t76YsUmfD9+fQNBDSQnKCKj -6nO6oYFQ9RI/vPMfrNX0sqo5hKfufNBCr/MILDXR6vtcEuaqd84DgaPVPRjHef6v -paYUi0Enb3gF3LXYggTN1mz9leEW8BablTN/DLP5AAvMfM/XSkVzlIsCgYEA2gjc -mcUDL1smAvriFVmpD4IrPzaZ9kINOfFNqkp/+y7S0BZGeS5ESSodrs0CIojttp3o -9GL7QLhZ9DehJWfh2qfA5mvzKGzUeM2oapR2Ts/m3voS4ErPTm+cTBOjRe3gGSSN -4sAJ5LA071RfNjEZBSktow//WX/oWrhIyovnxt0CgYBxyge/4xlO77URSdSySEGf -MUs6pYfQRRKxb/9SaJB4KoqzfUAsN2CJkNDlRlWd9mGIrWZ89wwTpREapabdCD4l -+JFVWBJKS0ikUzOfoc3LaHLtHx0xhgxqUkrVtU62MfDLSXt0Etrs5vGRzf32Xfi/ -mdGBiw7MVqiM+FNwojbQZwKBgDly5E1P78rmhVl7qV5exYDkl2iMhnywYrPFtOUN -xDL2320csWz0l+F/S1rngYx/78KSUPMzsWgYKvuCPN+SQ5xNXzJXdzZLlqBN7/ZF -L/cMKJTP53FZxM2x8sjI09h1GPsG+quoVfL/yrLU1FF/FkyZ0QCKEooOfbaJoARe -YK+xAoGAfT0P200WsLKRl73XYJZNYQl5+h5s7Sk9J8QuPwFWqm/mGwYKTLI042jg -lsAym4krAR0c1CHTW3aHRimYpYbi7/kztZU1zUQgcGL+79afer3ZuFF7mGzR+I/r -yOQ2dEfmVASfl/fMh1qyExpcCaMuejaODWyILlxOwvnywHWMSCU= ------END RSA PRIVATE KEY----- +-----BEGIN PRIVATE KEY----- +MIIEvgIBADANBgkqhkiG9w0BAQEFAASCBKgwggSkAgEAAoIBAQCzMRjAbG2MVTMs +x7Rr+eeIi4jNyhvaQ8LcTB08BdgY8618blS037dB/85GxKTfZMdJZkCygbSsyiVJ 
+owqyucsT6IKYnZ/kDxfXXYEBIQcOYLnAEU1NNnQLPYSTth7xjnSKvg78gH+wY2LP +4z6QD64XboKa/fmVuByO/QlnZntkr4kiH6O5rJyt1Hm+OzVRYs5RumGU0Mpn3Bjd +xmGqB7+Wldbu2ECfOFFDaC/uizWVr1OD5lsdVau20ZEwZN37udRXsBc0qTp+CaA0 +sTFNCfqq9/Z3SVOGmWldvVOfF33+n9N+n1yyeZ8TlZ3JB2daKoDHDxLY0KKZtu6D +M9RlcMdJAgMBAAECggEAExOZ//pLG1KCtTS5h3k+ZfH86XOnzW/DUfKkdhxlKhrf +EE0qpWrftp4GFtCegM3YzX6eSqK2WmLW32UFIYvYC9GH/bliKURWFG526mjauHQh +sknlQeAXMI8nCxaraqnwzEh5jfYcIeoiveECaxHQRdxU+S0qof7Mw4g6tRBa+Jft +TUW1aa/m6p8s19xTKebtpCj+p1zwUdU8t4fw8eq5qFn4t1jlWaaCLgJv4OAE52IB +81LEJjG0vevVIvifsm+2t58IOqYJhEo0qMw2X0AMQDEOJZPeW1puFg4cdvh1bar6 +RDxVhX0IkDKz7w62VoGb0ZptZkO3vrG8OXN2TI+L0QKBgQC30ASVENK2l7Jo+n5m +Qxf7s4ZoJ9jjWjO8N+Mkd5/Cu+xePlv5EmcBZCDONSmAdtaZXDs9DVO8E4yFXecJ +fidQnvRhhRxrG/LEEwHNzR8lMlm5tc4wx7g2y844Qjan71O8gawUd6eZyRmVDnmk +st6DLUwyWTkwaa7VkDaGFFqjVQKBgQD5kIS0fiq1hhOzrPt6U2FCHybrNnWCyFN5 +ISYJpl1vn7YqFV2wgXwn+A9bcDi7DMK8hx1b49Xx2lwo9n7Cb5Pd0ykhdjo12hUQ +WBqiFEjInsQ84RvivyTzlrvBduVMRtWA8lxp4gFjXFf/avHzoRkM21IfU46Q9QNn +Y8rKTFJ8JQKBgGRgv6/b0QYPj0lOyen0Z8t3PUXxxLpbTvdRGcSXYvJIB4tryHQa +/Y8/675QP8q8mvKC8IKlnX2Ou2U1Y27GqpeXRmNe+qbvS0KSEqEdjA2XEnKc+u2e +k1WxNHt6hThuNK8zrRI8SZVswYCpt/oeB+9gtESmftmWTPipWW0c/mZFAoGBAIbK +pLJr9ptmmFuSUZ1nTX+OHdIq7ts9YcBpxAawQYUZqSUftyOvPCTGWV0Uxr9YjBGR +lKzd6N8hBmflgt93rlDATVXSamxNptTWEUR7WjhpcCpFl28nuEiMoEpE8mH5XDWy +MXHK7N8CsFC3LYld+I62Iqvi0HzAqR79ijkrcd21AoGBAIc+y9eKyMxFfHkz/pZH +cupJiF21Y3+cw7h2TRNvqSLvHTp3enCDo7WG0R0K74BHIvwTDcZxsxThm6KwB+y9 +WIuKQC064e5ASjdF1dfwFFlNpwphL2kebWuzIkpEVtCcGJPUuJ704R7tD3y8q4BN +aSrpjjRGIVr6mLcxXGgHJa5R +-----END PRIVATE KEY----- diff --git a/components/test_util/data/server.pem b/components/test_util/data/server.pem index 09200bd82f6..42deadba2db 100644 --- a/components/test_util/data/server.pem +++ b/components/test_util/data/server.pem @@ -1,22 +1,20 @@ -----BEGIN CERTIFICATE----- -MIIDlTCCAn2gAwIBAgIUGKdjy/Uqp64ZiwqMwpTMGP5tKT0wDQYJKoZIhvcNAQEL -BQAwVzELMAkGA1UEBhMCQ04xEDAOBgNVBAgTB0JlaWppbmcxEDAOBgNVBAcTB0Jl -aWppbmcxEDAOBgNVBAoTB1BpbmdDQVAxEjAQBgNVBAMTCU15IG93biBDQTAgFw0x -OTA5MDIwNjEzMDBaGA8yMTE5MDgwOTA2MTMwMFowFjEUMBIGA1UEAxMLdGlrdi1z -ZXJ2ZXIwggEiMA0GCSqGSIb3DQEBAQUAA4IBDwAwggEKAoIBAQCxGmr8T9ULzZjG -wLkuUoW8j0cm5yEy0qySfndqdbis0Tex83RYNLqoLLE26ZRUGo73wnXzHFvP9eRX -to/pNxdms2ASrDTs9lZMOjJlWL6f0dfJbCUF6ITZZasIRKqxqv5xdXlQTegTtjgJ -Pnq0w35rAqqd/XMvJGX0OWpKDX8ZpuF5XPN8aF2p12p2kBlxo4cfxQwRZ2dnipMV -poY1GWtHLdbRWPOAihxCium++1mRAzmILHHP3C4X3/qAZ2ilMDQnZK9kpr6gj/zr -s0uk3kfH5yf9OEeEJOHU3fhSWWhrRqKi7vaYaDIwh1DFgCxGcbrYQef71o+16AiN -B3XcF73/AgMBAAGjgZcwgZQwDgYDVR0PAQH/BAQDAgWgMB0GA1UdJQQWMBQGCCsG -AQUFBwMBBggrBgEFBQcDAjAMBgNVHRMBAf8EAjAAMB0GA1UdDgQWBBTw7yUYqbAv -BJw3zZctLUfUi0vyqzAfBgNVHSMEGDAWgBS3hxTaN9B7eF8xr0DKLZ3b5vFnrDAV -BgNVHREEDjAMhwSsEAUohwR/AAABMA0GCSqGSIb3DQEBCwUAA4IBAQCBljfge2fC -5X+tt1v7AkWoH5xpymEVvuIWWJmT/6FNTn1rdnIaxWCQzJbBCXjZS/75lKnwfrTB -ZK7iMv1GQaBevT/qm+7GcApsr5nFrI/MvzrvY+XRqvU8gsRhUjHYI+JPLGWxhzZD -pQdJTAGvsDLHu1VVdHR2KsE4M8ceGq58f7zPSq/suf+8SYEOFP8zfuXX1HfUrFVe -69ZQw8PZh4EYL0PYtE5BYfe9iJyFNNtZiejiribMQz/NtNkKM3M+Hm40ULGuwHXq -bKDjDq1PvmpVb/kKO/xADTIAbqproXETZ4W2keI3hwm6NxysvEbYV9+puQBXQqwT -KOt9Lo4ofSAF +MIIDVTCCAj2gAwIBAgIUUCvVn7LZjm7FD+xeAd5g1oKFSrIwDQYJKoZIhvcNAQEL +BQAwFzEVMBMGA1UEAwwMdGlrdl90ZXN0X2NhMCAXDTI0MDMyNzAwMDAwMFoYDzIx +MjQwMzI3MDAwMDAwWjAWMRQwEgYDVQQDDAt0aWt2LXNlcnZlcjCCASIwDQYJKoZI +hvcNAQEBBQADggEPADCCAQoCggEBALMxGMBsbYxVMyzHtGv554iLiM3KG9pDwtxM +HTwF2BjzrXxuVLTft0H/zkbEpN9kx0lmQLKBtKzKJUmjCrK5yxPogpidn+QPF9dd +gQEhBw5gucARTU02dAs9hJO2HvGOdIq+DvyAf7BjYs/jPpAPrhdugpr9+ZW4HI79 
+CWdme2SviSIfo7msnK3Ueb47NVFizlG6YZTQymfcGN3GYaoHv5aV1u7YQJ84UUNo +L+6LNZWvU4PmWx1Vq7bRkTBk3fu51FewFzSpOn4JoDSxMU0J+qr39ndJU4aZaV29 +U58Xff6f036fXLJ5nxOVnckHZ1oqgMcPEtjQopm27oMz1GVwx0kCAwEAAaOBlzCB +lDAMBgNVHRMBAf8EAjAAMA4GA1UdDwEB/wQEAwIFoDAdBgNVHSUEFjAUBggrBgEF +BQcDAQYIKwYBBQUHAwIwFQYDVR0RBA4wDIcErBAFKIcEfwAAATAdBgNVHQ4EFgQU +Z4ACSrFMAFHP3iQAlZihuxlTk64wHwYDVR0jBBgwFoAUvkMODBi064xUztsZRsJY +/sstaO8wDQYJKoZIhvcNAQELBQADggEBALxhZMiXDQvUJCtSGPaCJhvs51O7Sb+8 +xrByuQrtXhcNVsOcq+0OjT/roUzD0x5mf75cTcJm6XZuzg2BPgN7wQU5GPuhOcJv +XFx4uoRDNBzN5FlxZu+ln4Qqw/M/4zsRwD3qkp/J50RpbCOmf1x/b1M6+s1uQcT5 +6sMErUXnOzvY5ey4vCJFiveYu5Z7GIHPB8xlhJtiu3T8FN1o3Us75evFk7hHfJjf +zU1Efd6W9RU/bEPAPvqnLBkSHdx7Urw0hNHlW2IDjaX1zIV5Ibeiw61olyQAiXjy +N3VJrMbuSTRv5BZxp4sKwnan4dAtfXvSSle36pYhN5UTjD72NNlLe8A= -----END CERTIFICATE----- diff --git a/components/test_util/src/lib.rs b/components/test_util/src/lib.rs index 453ed7fb7f1..d4de9fdc58e 100644 --- a/components/test_util/src/lib.rs +++ b/components/test_util/src/lib.rs @@ -18,6 +18,7 @@ use std::{ fmt::Debug, sync::atomic::{AtomicU16, Ordering}, thread, + time::Duration, }; use rand::Rng; @@ -154,3 +155,21 @@ pub fn assert_eq_debug(lhs: &C, rhs: &C) { lhs_diff, rhs_diff ); } + +#[track_caller] +pub fn eventually(tick: Duration, total: Duration, mut check: impl FnMut() -> bool) { + let start = std::time::Instant::now(); + loop { + if check() { + return; + } + if start.elapsed() < total { + std::thread::sleep(tick); + continue; + } + panic!( + "failed to pass the check after {:?} elapsed", + start.elapsed() + ); + } +} diff --git a/components/tidb_query_datatype/src/codec/convert.rs b/components/tidb_query_datatype/src/codec/convert.rs index 418841547ca..e4e2a3d2e8f 100644 --- a/components/tidb_query_datatype/src/codec/convert.rs +++ b/components/tidb_query_datatype/src/codec/convert.rs @@ -2354,6 +2354,13 @@ mod tests { ft.set_flen(flen); ft.set_decimal(decimal); let nd = produce_dec_with_specified_tp(&mut ctx, dec, &ft).unwrap(); + assert_eq!( + nd.frac_cnt(), + nd.result_frac_cnt(), + "frac_cnt {} is not equal to result_frac_cnt {}", + nd.frac_cnt(), + nd.result_frac_cnt() + ); assert_eq!(nd, want, "{}, {}, {}, {}, {}", dec, nd, want, flen, decimal); } } @@ -2765,6 +2772,13 @@ mod tests { match &expect { Ok(d) => { assert!(r.is_ok(), "{}", log); + assert_eq!( + d.frac_cnt(), + d.result_frac_cnt(), + "frac_cnt {} is not equal to result_frac_cnt {}", + d.frac_cnt(), + d.result_frac_cnt() + ); assert_eq!(&r.unwrap(), d, "{}", log); } Err(Error::Eval(..)) => { diff --git a/components/tidb_query_datatype/src/codec/mysql/decimal.rs b/components/tidb_query_datatype/src/codec/mysql/decimal.rs index 143ec6c7760..077c5f50365 100644 --- a/components/tidb_query_datatype/src/codec/mysql/decimal.rs +++ b/components/tidb_query_datatype/src/codec/mysql/decimal.rs @@ -590,17 +590,24 @@ fn do_div_mod_impl( rhs: &Decimal, mut frac_incr: u8, do_mod: bool, + result_frac_cnt: Option, ) -> Option> { let r_frac_cnt = word_cnt!(rhs.frac_cnt) * DIGITS_PER_WORD; let (r_idx, r_prec) = rhs.remove_leading_zeroes(rhs.int_cnt + r_frac_cnt); if r_prec == 0 { + // short-circuit everything: rhs == 0 return None; } let l_frac_cnt = word_cnt!(lhs.frac_cnt) * DIGITS_PER_WORD; let (l_idx, l_prec) = lhs.remove_leading_zeroes(lhs.int_cnt + l_frac_cnt); if l_prec == 0 { - return Some(Res::Ok(Decimal::zero())); + // short-circuit everything: lhs == 0 + if let Some(result_frac) = result_frac_cnt { + return Some(Res::Ok(Decimal::new(0, result_frac, false))); + } else { + return Some(Res::Ok(Decimal::zero())); + } 
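+        // Keeping the caller-requested scale means a zero result carries the
+        // same fraction count as a non-zero one (e.g. "0" / "0.5" renders as
+        // "0.0000" rather than "0"; see the div/rem test cases below).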
} frac_incr = frac_incr.saturating_sub(l_frac_cnt - lhs.frac_cnt + r_frac_cnt - rhs.frac_cnt); @@ -784,8 +791,9 @@ fn do_div_mod_impl( Some(res) } +#[allow(dead_code)] fn do_div_mod(lhs: &Decimal, rhs: &Decimal, frac_incr: u8, do_mod: bool) -> Option> { - do_div_mod_impl(lhs, rhs, frac_incr, do_mod) + do_div_mod_impl(lhs, rhs, frac_incr, do_mod, None) } /// `do_mul` multiplies two decimals. @@ -798,6 +806,9 @@ fn do_mul(lhs: &Decimal, rhs: &Decimal) -> Res { i32::from(word_cnt!(rhs.int_cnt)), i32::from(word_cnt!(rhs.frac_cnt)), ); + + let old_r_int_word_cnt = r_int_word_cnt; + let (int_word_to, frac_word_to) = ( word_cnt!(lhs.int_cnt + rhs.int_cnt) as usize, l_frac_word_cnt + r_frac_word_cnt, @@ -823,7 +834,7 @@ fn do_mul(lhs: &Decimal, rhs: &Decimal) -> Res { l_frac_word_cnt = 0; r_frac_word_cnt = 0; } else { - old_frac_word_to -= int_word_to as i32; + old_frac_word_to -= frac_word_to as i32; old_int_word_to = old_frac_word_to / 2; if l_frac_word_cnt <= r_frac_word_cnt { l_frac_word_cnt -= old_int_word_to; @@ -835,41 +846,43 @@ fn do_mul(lhs: &Decimal, rhs: &Decimal) -> Res { } } - let mut start_to = int_word_to + frac_word_to; - let (offset_min, offset_max) = (0, i32::from(WORD_BUF_LEN)); - let r_start = num::clamp(r_int_word_cnt + r_frac_word_cnt, offset_min, offset_max) as usize; - let left_stop = num::clamp(l_int_word_cnt + l_frac_word_cnt, offset_min, offset_max) as usize; - for l_idx in (0..left_stop).rev() { - if start_to < r_start { - break; - } + let mut start_to = (int_word_to + frac_word_to - 1) as isize; + let r_start = old_r_int_word_cnt + r_frac_word_cnt - 1; + let r_stop = old_r_int_word_cnt - r_int_word_cnt; + let mut l_idx = l_int_word_cnt + l_frac_word_cnt - 1; + + while l_idx >= 0 { let (mut carry, mut idx_to) = (0, start_to); - start_to -= 1; - for r_idx in (0..r_start).rev() { - idx_to -= 1; - let p = u64::from(lhs.word_buf[l_idx]) * u64::from(rhs.word_buf[r_idx]); + let mut r_idx = r_start; + while r_idx >= r_stop { + let p = + u64::from(lhs.word_buf[l_idx as usize]) * u64::from(rhs.word_buf[r_idx as usize]); let hi = p / u64::from(WORD_BASE); let lo = p - hi * u64::from(WORD_BASE); add( - dec.word_buf[idx_to], + dec.word_buf[idx_to as usize], lo as u32, &mut carry, - &mut dec.word_buf[idx_to], + &mut dec.word_buf[idx_to as usize], ); carry += hi as u32; + r_idx -= 1; + idx_to -= 1; } while carry > 0 { - if idx_to == 0 { + if idx_to < 0 { return Res::Overflow(dec); } - idx_to -= 1; add( - dec.word_buf[idx_to], + dec.word_buf[idx_to as usize], 0, &mut carry, - &mut dec.word_buf[idx_to], + &mut dec.word_buf[idx_to as usize], ); + idx_to -= 1; } + l_idx -= 1; + start_to -= 1; } // Now we have to check for -0.000 case @@ -966,7 +979,7 @@ impl Decimal { Decimal { int_cnt, frac_cnt, - result_frac_cnt: 0, + result_frac_cnt: frac_cnt, negative, word_buf: [0; 9], } @@ -1183,10 +1196,12 @@ impl Decimal { res.word_buf[idx as usize] = 0; } res.frac_cnt = frac as u8; + res.result_frac_cnt = res.frac_cnt; return res; } if frac >= res.frac_cnt as i8 { res.frac_cnt = frac as u8; + res.result_frac_cnt = res.frac_cnt; return res; } @@ -1329,6 +1344,7 @@ impl Decimal { dec.int_cnt = 1; dec.negative = false; dec.frac_cnt = cmp::max(0, frac) as u8; + dec.result_frac_cnt = dec.frac_cnt; for i in 0..idx { dec.word_buf[i as usize] = 0; } @@ -1342,6 +1358,7 @@ impl Decimal { dec.int_cnt += 1; } dec.frac_cnt = cmp::max(0, frac) as u8; + dec.result_frac_cnt = dec.frac_cnt; dec } @@ -1655,35 +1672,39 @@ impl Decimal { if inner_idx != 0 { d.word_buf[word_idx] = word * TEN_POW[DIGITS_PER_WORD as 
usize - inner_idx]; } - if end_idx < bs.len() && (bs[end_idx] == b'e' || bs[end_idx] == b'E') { - let exp = convert::bytes_to_int_without_context(&bs[end_idx + 1..])?; - if exp > i64::from(i32::MAX) / 2 { - return Ok(Res::Overflow(max_or_min_dec( - d.negative, - WORD_BUF_LEN * DIGITS_PER_WORD, - 0, - ))); - } - if exp < i64::from(i32::MIN) / 2 && !d.is_overflow() { - return Ok(Res::Truncated(Self::zero())); - } - if !d.is_overflow() { - let is_truncated = d.is_truncated(); - d = match d.unwrap().shift(exp as isize) { - Res::Overflow(v) => Res::Overflow(max_or_min_dec( - v.negative, + if end_idx < bs.len() { + if bs[end_idx] == b'e' || bs[end_idx] == b'E' { + let exp = convert::bytes_to_int_without_context(&bs[end_idx + 1..])?; + if exp > i64::from(i32::MAX) / 2 { + d = Res::Overflow(max_or_min_dec( + d.negative, WORD_BUF_LEN * DIGITS_PER_WORD, 0, - )), - Res::Ok(v) => { - if is_truncated { - Res::Truncated(v) - } else { - Res::Ok(v) + )); + } + if exp < i64::from(i32::MIN) / 2 && !d.is_overflow() { + d = Res::Truncated(Self::zero()); + } + if !d.is_overflow() { + let is_truncated = d.is_truncated(); + d = match d.unwrap().shift(exp as isize) { + Res::Overflow(v) => Res::Overflow(max_or_min_dec( + v.negative, + WORD_BUF_LEN * DIGITS_PER_WORD, + 0, + )), + Res::Ok(v) => { + if is_truncated { + Res::Truncated(v) + } else { + Res::Ok(v) + } } - } - res => res, - }; + res => res, + }; + } + } else if bs[end_idx..].iter().any(|c| !c.is_ascii_whitespace()) { + d = Res::Truncated(d.unwrap()); } } if d.word_buf.iter().all(|c| *c == 0) { @@ -1704,7 +1725,7 @@ impl Decimal { fn div(&self, rhs: &Decimal, frac_incr: u8) -> Option> { let result_frac_cnt = cmp::min(self.result_frac_cnt.saturating_add(frac_incr), MAX_FRACTION); - let mut res = do_div_mod(self, rhs, frac_incr, false); + let mut res = do_div_mod_impl(self, rhs, frac_incr, false, Some(result_frac_cnt)); if let Some(ref mut dec) = res { dec.result_frac_cnt = result_frac_cnt; } @@ -1715,6 +1736,16 @@ impl Decimal { let len = word_cnt!(self.int_cnt) + word_cnt!(self.frac_cnt); self.word_buf[0..len as usize].iter().all(|&x| x == 0) } + + #[cfg(test)] + pub fn result_frac_cnt(&self) -> u8 { + self.result_frac_cnt + } + + #[cfg(test)] + pub fn frac_cnt(&self) -> u8 { + self.frac_cnt + } } macro_rules! 
enable_conv_for_int { @@ -2362,7 +2393,7 @@ impl<'a, 'b> Rem<&'a Decimal> for &'b Decimal { type Output = Option>; fn rem(self, rhs: &'a Decimal) -> Self::Output { let result_frac_cnt = cmp::max(self.result_frac_cnt, rhs.result_frac_cnt); - let mut res = do_div_mod_impl(self, rhs, 0, true); + let mut res = do_div_mod_impl(self, rhs, 0, true, Some(result_frac_cnt)); if let Some(ref mut dec) = res { dec.result_frac_cnt = result_frac_cnt; } @@ -2413,7 +2444,7 @@ mod tests { use super::{DEFAULT_DIV_FRAC_INCR, WORD_BUF_LEN, *}; use crate::{ - codec::error::ERR_DATA_OUT_OF_RANGE, + codec::error::*, expr::{EvalConfig, Flag}, }; @@ -2949,11 +2980,17 @@ mod tests { for (dec_str, scale, half_exp, trunc_exp, ceil_exp) in cases { let dec = dec_str.parse::().unwrap(); - let res = dec.round(scale, RoundMode::HalfEven).map(|d| d.to_string()); + let round_dec = dec.round(scale, RoundMode::HalfEven); + assert_eq!(round_dec.frac_cnt, round_dec.result_frac_cnt); + let res = round_dec.map(|d| d.to_string()); assert_eq!(res, half_exp.map(|s| s.to_owned())); - let res = dec.round(scale, RoundMode::Truncate).map(|d| d.to_string()); + let round_dec = dec.round(scale, RoundMode::Truncate); + assert_eq!(round_dec.frac_cnt, round_dec.result_frac_cnt); + let res = round_dec.map(|d| d.to_string()); assert_eq!(res, trunc_exp.map(|s| s.to_owned())); - let res = dec.round(scale, RoundMode::Ceiling).map(|d| d.to_string()); + let round_dec = dec.round(scale, RoundMode::Ceiling); + assert_eq!(round_dec.frac_cnt, round_dec.result_frac_cnt); + let res = round_dec.map(|d| d.to_string()); assert_eq!(res, ceil_exp.map(|s| s.to_owned())); } } @@ -2964,8 +3001,8 @@ mod tests { let cases = vec![ (WORD_BUF_LEN, b"12345" as &'static [u8], Res::Ok("12345")), (WORD_BUF_LEN, b"12345.", Res::Ok("12345")), - (WORD_BUF_LEN, b"123.45.", Res::Ok("123.45")), - (WORD_BUF_LEN, b"-123.45.", Res::Ok("-123.45")), + (WORD_BUF_LEN, b"123.45.", Res::Truncated("123.45")), + (WORD_BUF_LEN, b"-123.45.", Res::Truncated("-123.45")), ( WORD_BUF_LEN, b".00012345000098765", @@ -3013,8 +3050,11 @@ mod tests { (WORD_BUF_LEN, b"2.2E-1", Res::Ok("0.22")), (WORD_BUF_LEN, b"2.23E2", Res::Ok("223")), (WORD_BUF_LEN, b"2.23E2abc", Res::Ok("223")), - (WORD_BUF_LEN, b"2.23a2", Res::Ok("2.23")), - (WORD_BUF_LEN, b"223\xE0\x80\x80", Res::Ok("223")), + (WORD_BUF_LEN, b"2.23a2", Res::Truncated("2.23")), + (WORD_BUF_LEN, b"223\xE0\x80\x80", Res::Truncated("223")), + (WORD_BUF_LEN, b"223 ", Res::Ok("223")), + (WORD_BUF_LEN, b"223.2 ", Res::Ok("223.2")), + (WORD_BUF_LEN, b"223.2 .", Res::Truncated("223.2")), (WORD_BUF_LEN, b"1e -1", Res::Ok("0.1")), (WORD_BUF_LEN, b"1e001", Res::Ok("10")), (WORD_BUF_LEN, b"1e00", Res::Ok("1")), @@ -3348,6 +3388,32 @@ mod tests { } } + #[test] + fn test_mul_truncated() { + let cases = vec![( + "999999999999999999999999999999999.9999", + "766507373740683764182618847769240.9770", + Res::Truncated( + "766507373740683764182618847769239999923349262625931623581738115223.07600000", + ), + Res::Truncated( + "766507373740683764182618847769240210492626259316235817381152230759.02300000", + ), + )]; + + for (lhs_str, rhs_str, exp_str, rev_exp_str) in cases { + let lhs: Decimal = lhs_str.parse().unwrap(); + let rhs: Decimal = rhs_str.parse().unwrap(); + let exp = exp_str.map(|s| s.to_owned()); + let res = (&lhs * &rhs).map(|d| d.to_string()); + assert_eq!(res, exp); + + let exp = rev_exp_str.map(|s| s.to_owned()); + let res = (&rhs * &lhs).map(|d| d.to_string()); + assert_eq!(res, exp); + } + } + #[test] fn test_div_mod() { let cases = vec![ @@ -3545,17 
+3611,28 @@ mod tests { assert_eq!(res, rem_exp.map(|s| s.to_owned())); } - let div_cases = vec![( - "-43791957044243810000000000000000000000000000000000000000000000000000000000000", - "-0.0000000000000000000000000000000000000000000000000012867433602814482", - Res::Overflow( - "34033171179267041433424155279291553259014210153022524070386565694757521640", + let div_cases = vec![ + ( + "-43791957044243810000000000000000000000000000000000000000000000000000000000000", + "-0.0000000000000000000000000000000000000000000000000012867433602814482", + Res::Overflow( + "34033171179267041433424155279291553259014210153022524070386565694757521640", + ), ), - )]; - for (lhs_str, rhs_str, rem_exp) in div_cases { + ("0", "0.5", Res::Ok("0.0000")), + ]; + for (lhs_str, rhs_str, div_exp) in div_cases { let lhs: Decimal = lhs_str.parse().unwrap(); let rhs: Decimal = rhs_str.parse().unwrap(); let res = (&lhs / &rhs).unwrap().map(|d| d.to_string()); + assert_eq!(res, div_exp.map(|s| s.to_owned())) + } + + let rem_cases = vec![("0", "0.5", Res::Ok("0.0"))]; + for (lhs_str, rhs_str, rem_exp) in rem_cases { + let lhs: Decimal = lhs_str.parse().unwrap(); + let rhs: Decimal = rhs_str.parse().unwrap(); + let res = (lhs % rhs).unwrap().map(|d| d.to_string()); assert_eq!(res, rem_exp.map(|s| s.to_owned())) } } @@ -3678,19 +3755,20 @@ mod tests { #[test] fn test_bytes_to_decimal() { + let mut ctx = EvalContext::default(); let cases: Vec<(&[u8], Decimal)> = vec![ ( b"123456.1", - ConvertTo::::convert(&123456.1, &mut EvalContext::default()).unwrap(), + ConvertTo::::convert(&123456.1, &mut ctx).unwrap(), ), ( b"-123456.1", - ConvertTo::::convert(&-123456.1, &mut EvalContext::default()).unwrap(), + ConvertTo::::convert(&-123456.1, &mut ctx).unwrap(), ), (b"123456", Decimal::from(123456)), (b"-123456", Decimal::from(-123456)), + (b"1 ", Decimal::from(1)), ]; - let mut ctx = EvalContext::default(); for (s, expect) in cases { let got: Decimal = s.convert(&mut ctx).unwrap(); assert_eq!(got, expect, "from {:?}, expect: {} got: {}", s, expect, got); @@ -3709,6 +3787,36 @@ mod tests { assert_eq!(val, max, "expect: {}, got: {}", val, max); assert_eq!(ctx.warnings.warning_cnt, 1); assert_eq!(ctx.warnings.warnings[0].get_code(), ERR_DATA_OUT_OF_RANGE); + + // Truncate cases + let truncate_cases: Vec<(&[u8], Decimal)> = vec![ + ( + b"123.45.", + ConvertTo::::convert(&123.45, &mut ctx).unwrap(), + ), + ( + b"-123.45.", + ConvertTo::::convert(&-123.45, &mut ctx).unwrap(), + ), + ( + b"1.1.1.1.1", + ConvertTo::::convert(&1.1, &mut ctx).unwrap(), + ), + (b"1asf", Decimal::from(1)), + (b"1 1", Decimal::from(1)), + ]; + for (s, expect) in truncate_cases { + let val: Result = s.convert(&mut ctx); + assert!(val.is_err(), "expected error, but got {:?}", val); + assert_eq!(val.unwrap_err().code(), WARN_DATA_TRUNCATED); + + let mut truncate_as_warning_ctx = EvalContext::new(std::sync::Arc::new( + EvalConfig::from_flag(Flag::TRUNCATE_AS_WARNING), + )); + let got: Decimal = s.convert(&mut truncate_as_warning_ctx).unwrap(); + assert_eq!(got, expect, "from {:?}, expect: {} got: {}", s, expect, got); + assert_eq!(truncate_as_warning_ctx.warnings.warning_cnt, 1); + } } #[test] diff --git a/components/tidb_query_datatype/src/codec/mysql/json/binary.rs b/components/tidb_query_datatype/src/codec/mysql/json/binary.rs index c965247b8da..5028967ac59 100644 --- a/components/tidb_query_datatype/src/codec/mysql/json/binary.rs +++ b/components/tidb_query_datatype/src/codec/mysql/json/binary.rs @@ -167,7 +167,8 @@ mod tests { (r#"["d1","d2"]"#, JsonType::Array), 
(r#"-3"#, JsonType::I64), (r#"3"#, JsonType::I64), - (r#"18446744073709551615"#, JsonType::Double), + (r#"18446744073709551615"#, JsonType::U64), + (r#"18446744073709551616"#, JsonType::Double), (r#"3.0"#, JsonType::Double), (r#"null"#, JsonType::Literal), (r#"true"#, JsonType::Literal), diff --git a/components/tidb_query_datatype/src/codec/mysql/json/json_contains.rs b/components/tidb_query_datatype/src/codec/mysql/json/json_contains.rs index 46de1af9e0b..db8ec3331eb 100644 --- a/components/tidb_query_datatype/src/codec/mysql/json/json_contains.rs +++ b/components/tidb_query_datatype/src/codec/mysql/json/json_contains.rs @@ -86,6 +86,7 @@ mod tests { (r#"{"a":{"a":1},"b":2}"#, r#"{"b":3}"#, false), (r#"[1,2,[1,[5,{"a":[2,3]}]]]"#, r#"[1,{"a":[3]}]"#, true), (r#"[1,2,[1,[5,{"a":[2,3]}]]]"#, r#"[10,{"a":[3]}]"#, false), + (r#"[9223372036854775807]"#, r#"9223372036854775808"#, false), ]; for (i, (js, value, expected)) in test_cases.drain(..).enumerate() { let j = js.parse(); diff --git a/components/tidb_query_datatype/src/codec/mysql/json/json_type.rs b/components/tidb_query_datatype/src/codec/mysql/json/json_type.rs index 70321080ef7..bf48791b298 100644 --- a/components/tidb_query_datatype/src/codec/mysql/json/json_type.rs +++ b/components/tidb_query_datatype/src/codec/mysql/json/json_type.rs @@ -65,7 +65,8 @@ mod tests { (r#"["a", "b"]"#, JSON_TYPE_ARRAY), ("-5", JSON_TYPE_INTEGER), ("5", JSON_TYPE_INTEGER), - ("18446744073709551615", JSON_TYPE_DOUBLE), + ("18446744073709551615", JSON_TYPE_UNSIGNED_INTEGER), + ("18446744073709551616", JSON_TYPE_DOUBLE), ("5.6", JSON_TYPE_DOUBLE), (r#""hello, world""#, JSON_TYPE_STRING), ("true", JSON_TYPE_BOOLEAN), diff --git a/components/tidb_query_datatype/src/codec/mysql/json/serde.rs b/components/tidb_query_datatype/src/codec/mysql/json/serde.rs index 4bf487eefc1..598e27347c8 100644 --- a/components/tidb_query_datatype/src/codec/mysql/json/serde.rs +++ b/components/tidb_query_datatype/src/codec/mysql/json/serde.rs @@ -192,10 +192,10 @@ impl<'de> Visitor<'de> for JsonVisitor { where E: de::Error, { - if v > (i64::MAX as u64) { - Ok(Json::from_f64(v as f64).map_err(de::Error::custom)?) + if v < i64::MAX as u64 { + Json::from_i64(v as i64).map_err(de::Error::custom) } else { - Ok(Json::from_i64(v as i64).map_err(de::Error::custom)?) 
+ Json::from_u64(v).map_err(de::Error::custom) } } @@ -286,6 +286,10 @@ mod tests { r#"9223372036854775807"#, Json::from_i64(9223372036854775807), ), + ( + r#"9223372036854775808"#, + Json::from_u64(9223372036854775808), + ), ]; for (json_str, json) in cases { diff --git a/components/tidb_query_datatype/src/codec/mysql/time/mod.rs b/components/tidb_query_datatype/src/codec/mysql/time/mod.rs index 4c6c2f676d7..41131fc5933 100644 --- a/components/tidb_query_datatype/src/codec/mysql/time/mod.rs +++ b/components/tidb_query_datatype/src/codec/mysql/time/mod.rs @@ -1349,10 +1349,7 @@ impl Time { ) -> Result<Time> { let dur = chrono::Duration::nanoseconds(duration.to_nanos()); - let time = Utc::today() - .and_hms(0, 0, 0) - .checked_add_signed(dur) - .map(|utc| utc.with_timezone(&ctx.cfg.tz)); + let time = Utc::today().and_hms(0, 0, 0).checked_add_signed(dur); let time = time.ok_or::<Error>(box_err!("parse from duration {} overflows", duration))?; @@ -2404,15 +2401,19 @@ mod tests { #[test] fn test_parse_time_with_tz() -> Result<()> { - let ctx_with_tz = |tz: &str| { + let ctx_with_tz = |tz: &str, by_offset: bool| { let mut cfg = EvalConfig::default(); - let raw = tz.as_bytes(); - // brutally turn timezone in format +08:00 into offset in minute - let offset = if raw[0] == b'-' { -1 } else { 1 } - * ((raw[1] - b'0') as i64 * 10 + (raw[2] - b'0') as i64) - * 60 - + ((raw[4] - b'0') as i64 * 10 + (raw[5] - b'0') as i64); - cfg.set_time_zone_by_offset(offset * 60).unwrap(); + if by_offset { + let raw = tz.as_bytes(); + // brutally turn timezone in format +08:00 into offset in minute + let offset = if raw[0] == b'-' { -1 } else { 1 } + * ((raw[1] - b'0') as i64 * 10 + (raw[2] - b'0') as i64) + * 60 + + ((raw[4] - b'0') as i64 * 10 + (raw[5] - b'0') as i64); + cfg.set_time_zone_by_offset(offset * 60).unwrap(); + } else { + cfg.set_time_zone_by_name(tz).unwrap(); + } let warnings = cfg.new_eval_warnings(); EvalContext { cfg: Arc::new(cfg), @@ -2421,6 +2422,7 @@ }; struct Case { tz: &'static str, + by_offset: bool, t: &'static str, r: Option<&'static str>, tp: TimeType, @@ -2428,60 +2430,70 @@ let cases = vec![ Case { tz: "+00:00", + by_offset: true, t: "2020-10-10T10:10:10Z", r: Some("2020-10-10 10:10:10.000000"), tp: TimeType::DateTime, }, Case { tz: "+00:00", + by_offset: true, t: "2020-10-10T10:10:10+", r: None, tp: TimeType::DateTime, }, Case { tz: "+00:00", + by_offset: true, t: "2020-10-10T10:10:10+14:01", r: None, tp: TimeType::DateTime, }, Case { tz: "+00:00", + by_offset: true, t: "2020-10-10T10:10:10-00:00", r: None, tp: TimeType::DateTime, }, Case { tz: "-08:00", + by_offset: true, t: "2020-10-10T10:10:10-08", r: Some("2020-10-10 10:10:10.000000"), tp: TimeType::DateTime, }, Case { tz: "+08:00", + by_offset: true, t: "2020-10-10T10:10:10+08:00", r: Some("2020-10-10 10:10:10.000000"), tp: TimeType::DateTime, }, Case { tz: "+08:00", + by_offset: true, t: "2020-10-10T10:10:10+08:00", r: Some("2020-10-10 10:10:10.000000"), tp: TimeType::Timestamp, }, Case { tz: "+08:00", + by_offset: true, t: "2022-06-02T10:10:10Z", r: Some("2022-06-02 18:10:10.000000"), tp: TimeType::DateTime, }, Case { tz: "-08:00", + by_offset: true, t: "2022-06-02T10:10:10Z", r: Some("2022-06-02 02:10:10.000000"), tp: TimeType::DateTime, }, Case { tz: "+06:30", + by_offset: true, t: "2022-06-02T10:10:10-05:00", r: Some("2022-06-02 21:40:10.000000"), tp: TimeType::DateTime, @@ -2489,26 +2501,45 @@ mod tests { // Time with fraction Case { tz: "+08:00", + by_offset: true, t: "2022-06-02T10:10:10.123Z", r:
Some("2022-06-02 18:10:10.123000"), tp: TimeType::DateTime, }, Case { tz: "-08:00", + by_offset: true, t: "2022-06-02T10:10:10.123Z", r: Some("2022-06-02 02:10:10.123000"), tp: TimeType::DateTime, }, Case { tz: "+06:30", + by_offset: true, t: "2022-06-02T10:10:10.654321-05:00", r: Some("2022-06-02 21:40:10.654321"), tp: TimeType::DateTime, }, + Case { + // Note: this case may fail if Brazil observes DST again. + // See https://github.com/pingcap/tidb/issues/49586 + tz: "Brazil/East", + by_offset: false, + t: "2023-11-30T17:02:00.654321+00:00", + r: Some("2023-11-30 14:02:00.654321"), + tp: TimeType::DateTime, + }, ]; let mut result: Vec> = vec![]; - for Case { tz, t, r: _, tp } in &cases { - let mut ctx = ctx_with_tz(tz); + for Case { + tz, + by_offset, + t, + r: _, + tp, + } in &cases + { + let mut ctx = ctx_with_tz(tz, *by_offset); let parsed = Time::parse(&mut ctx, t, *tp, 6, true); match parsed { Ok(p) => result.push(Some(p.to_string())), diff --git a/components/tidb_query_datatype/src/codec/row/v2/row_slice.rs b/components/tidb_query_datatype/src/codec/row/v2/row_slice.rs index da117c96e2c..4a0a171eb60 100644 --- a/components/tidb_query_datatype/src/codec/row/v2/row_slice.rs +++ b/components/tidb_query_datatype/src/codec/row/v2/row_slice.rs @@ -233,7 +233,11 @@ impl RowSlice<'_> { RowSlice::Big { offsets, values, .. } => { - let last_slice_idx = offsets.get(non_null_col_num - 1).unwrap() as usize; + let last_slice_idx = if non_null_col_num == 0 { + 0 + } else { + offsets.get(non_null_col_num - 1).unwrap() as usize + }; let slice = values.slice; *values = LeBytes::new(&slice[..last_slice_idx]); &slice[last_slice_idx..] @@ -241,7 +245,11 @@ impl RowSlice<'_> { RowSlice::Small { offsets, values, .. } => { - let last_slice_idx = offsets.get(non_null_col_num - 1).unwrap() as usize; + let last_slice_idx = if non_null_col_num == 0 { + 0 + } else { + offsets.get(non_null_col_num - 1).unwrap() as usize + }; let slice = values.slice; *values = LeBytes::new(&slice[..last_slice_idx]); &slice[last_slice_idx..] 
diff --git a/components/tidb_query_datatype/src/def/field_type.rs b/components/tidb_query_datatype/src/def/field_type.rs index 06f4454b36d..8a56ac5ac68 100644 --- a/components/tidb_query_datatype/src/def/field_type.rs +++ b/components/tidb_query_datatype/src/def/field_type.rs @@ -140,7 +140,10 @@ impl Collation { } pub fn is_bin_collation(&self) -> bool { - matches!(self, Collation::Utf8Mb4Bin | Collation::Latin1Bin) + matches!( + self, + Collation::Utf8Mb4Bin | Collation::Latin1Bin | Collation::Utf8Mb40900Bin + ) } } @@ -333,6 +336,10 @@ pub trait FieldTypeAccessor { .map(|col| col.is_bin_collation()) .unwrap_or(false) || self.is_varchar_like()) + && self + .collation() + .map(|col| col != Collation::Utf8Mb40900Bin) + .unwrap_or(false) } } @@ -455,6 +462,7 @@ mod tests { use std::i32; use super::*; + use crate::builder::FieldTypeBuilder; fn field_types() -> Vec { vec![ @@ -583,4 +591,31 @@ mod tests { } } } + + #[test] + fn test_need_restored_data() { + let cases = vec![ + (FieldTypeTp::String, Collation::Binary, false), + (FieldTypeTp::VarString, Collation::Binary, false), + (FieldTypeTp::String, Collation::Utf8Mb4Bin, false), + (FieldTypeTp::VarString, Collation::Utf8Mb4Bin, true), + (FieldTypeTp::String, Collation::Utf8Mb4GeneralCi, true), + (FieldTypeTp::VarString, Collation::Utf8Mb4GeneralCi, true), + (FieldTypeTp::String, Collation::Utf8Mb4UnicodeCi, true), + (FieldTypeTp::VarString, Collation::Utf8Mb4UnicodeCi, true), + (FieldTypeTp::String, Collation::Utf8Mb40900AiCi, true), + (FieldTypeTp::VarString, Collation::Utf8Mb40900AiCi, true), + (FieldTypeTp::String, Collation::Utf8Mb40900Bin, false), + (FieldTypeTp::VarString, Collation::Utf8Mb40900Bin, false), + (FieldTypeTp::String, Collation::GbkBin, true), + (FieldTypeTp::VarString, Collation::GbkBin, true), + (FieldTypeTp::String, Collation::GbkChineseCi, true), + (FieldTypeTp::VarString, Collation::GbkChineseCi, true), + ]; + + for (tp, collation, result) in cases { + let ft = FieldTypeBuilder::new().tp(tp).collation(collation).build(); + assert_eq!(ft.need_restored_data(), result) + } + } } diff --git a/components/tidb_query_expr/src/impl_arithmetic.rs b/components/tidb_query_expr/src/impl_arithmetic.rs index 2f48fec4693..5c34a8431b5 100644 --- a/components/tidb_query_expr/src/impl_arithmetic.rs +++ b/components/tidb_query_expr/src/impl_arithmetic.rs @@ -317,7 +317,12 @@ impl ArithmeticOp for DecimalMultiply { type T = Decimal; fn calc(lhs: &Decimal, rhs: &Decimal) -> Result<Option<Decimal>> { - let res: codec::Result<Decimal> = (lhs * rhs).into(); + let res: codec::Result<Decimal> = match lhs * rhs { + codec::mysql::Res::Ok(t) => Ok(t), + codec::mysql::Res::Truncated(t) => Ok(t), + other => other.into(), + }; + Ok(Some(res?)) } } @@ -854,7 +859,14 @@ mod tests { #[test] fn test_multiply_decimal() { - let test_cases = vec![("1.1", "2.2", "2.42")]; + let test_cases = vec![ + ("1.1", "2.2", "2.42"), + ( + "999999999999999999999999999999999.9999", + "766507373740683764182618847769240.9770", + "766507373740683764182618847769239999923349262625931623581738115223.07600000", + ), + ]; for (lhs, rhs, expected) in test_cases { let expected: Option<Decimal> = expected.parse().ok(); let output = RpnFnScalarEvaluator::new() diff --git a/components/tidb_query_expr/src/impl_cast.rs b/components/tidb_query_expr/src/impl_cast.rs index 76e90f79c5b..6ce43695213 100644 --- a/components/tidb_query_expr/src/impl_cast.rs +++ b/components/tidb_query_expr/src/impl_cast.rs @@ -1385,8 +1385,9 @@ fn cast_string_as_json( let mut vec; if typ.tp() == FieldTypeTp::String { vec = (*val).to_owned(); - //
the `flen` of string is always greater than zero - vec.resize(typ.flen().try_into().unwrap(), 0); + if typ.flen() > 0 { + vec.resize(typ.flen().try_into().unwrap(), 0); + } buf = &vec; } @@ -1612,7 +1613,7 @@ mod tests { mysql::{ charset::*, decimal::{max_decimal, max_or_min_dec}, - Decimal, Duration, Json, RoundMode, Time, TimeType, MAX_FSP, MIN_FSP, + Decimal, Duration, Json, RoundMode, Time, TimeType, Tz, MAX_FSP, MIN_FSP, }, }, expr::{EvalConfig, EvalContext, Flag}, @@ -2933,13 +2934,20 @@ mod tests { fn test_cast_duration_as_time() { use chrono::Datelike; - let cases = vec!["11:30:45.123456", "-35:30:46"]; + let cases = vec!["11:30:45.123456", "-35:30:46", "25:59:59.999999"]; for case in cases { - let mut ctx = EvalContext::default(); - + let mut cfg = EvalConfig::default(); + cfg.tz = Tz::from_tz_name("America/New_York").unwrap(); + let mut ctx = EvalContext::new(Arc::new(cfg)); let duration = Duration::parse(&mut ctx, case, MAX_FSP).unwrap(); + + let mut cfg2 = EvalConfig::default(); + cfg2.tz = Tz::from_tz_name("Asia/Tokyo").unwrap(); + let ctx2 = EvalContext::new(Arc::new(cfg2)); + let now = RpnFnScalarEvaluator::new() + .context(ctx2) .push_param(duration) .return_field_type( FieldTypeBuilder::new() @@ -7014,6 +7022,17 @@ mod tests { Json::from_opaque(FieldTypeTp::String, &[97]).unwrap(), true, ), + ( + FieldTypeBuilder::new() + .tp(FieldTypeTp::VarChar) + .flen(UNSPECIFIED_LENGTH) + .charset(CHARSET_BIN) + .collation(Collation::Binary) + .build(), + "a".to_string(), + Json::from_opaque(FieldTypeTp::String, &[97]).unwrap(), + true, + ), ]; for (arg_type, input, expect, parse_to_json) in cs { let arg_value = ScalarValue::Bytes(Some(input.clone().into_bytes())); diff --git a/components/tidb_query_expr/src/impl_json.rs b/components/tidb_query_expr/src/impl_json.rs index 68132ae08e2..f24dea0e5c8 100644 --- a/components/tidb_query_expr/src/impl_json.rs +++ b/components/tidb_query_expr/src/impl_json.rs @@ -493,7 +493,7 @@ mod tests { (Some(r#"null"#), Some("NULL")), (Some(r#"-3"#), Some("INTEGER")), (Some(r#"3"#), Some("INTEGER")), - (Some(r#"9223372036854775808"#), Some("DOUBLE")), + (Some(r#"9223372036854775808"#), Some("UNSIGNED INTEGER")), (Some(r#"3.14"#), Some("DOUBLE")), (Some(r#"[1, 2, 3]"#), Some("ARRAY")), (Some(r#"{"name": 123}"#), Some("OBJECT")), diff --git a/components/tidb_query_expr/src/impl_math.rs b/components/tidb_query_expr/src/impl_math.rs index beeeef288b4..8817249b628 100644 --- a/components/tidb_query_expr/src/impl_math.rs +++ b/components/tidb_query_expr/src/impl_math.rs @@ -280,7 +280,7 @@ fn sqrt(arg: &Real) -> Result<Option<Real>> { #[inline] #[rpn_fn] fn radians(arg: &Real) -> Result<Option<Real>> { - Ok(Real::new(**arg * std::f64::consts::PI / 180_f64).ok()) + Ok(Real::new(**arg * (std::f64::consts::PI / 180_f64)).ok()) } #[inline] @@ -353,7 +353,12 @@ fn rand_with_seed_first_gen(seed: Option<&i64>) -> Result<Option<Real>> { #[inline] #[rpn_fn] fn degrees(arg: &Real) -> Result<Option<Real>> { - Ok(Real::new(arg.to_degrees()).ok()) + let ret = arg.to_degrees(); + if ret.is_infinite() { + Err(Error::overflow("DOUBLE", format!("degrees({})", arg)).into()) + } else { + Ok(Real::new(ret).ok()) + } } #[inline] @@ -387,16 +392,20 @@ pub fn conv(n: BytesRef, from_base: &Int, to_base: &Int) -> Result<Option<Bytes>> let s = s.trim(); let from_base = IntWithSign::from_int(*from_base); let to_base = IntWithSign::from_int(*to_base); - Ok(if is_valid_base(from_base) && is_valid_base(to_base) { + if is_valid_base(from_base) && is_valid_base(to_base) { if let Some((num_str, is_neg)) = extract_num_str(s, from_base) { - let num =
extract_num(num_str.as_ref(), is_neg, from_base); - Some(num.format_to_base(to_base).into_bytes()) + match extract_num(num_str.as_ref(), is_neg, from_base) { + Some(num) => Ok(Some(num.format_to_base(to_base).into_bytes())), + None => { + Err(Error::overflow("BIGINT UNSIGNED", format!("conv({})", num_str)).into()) + } + } } else { - Some(b"0".to_vec()) + Ok(Some(b"0".to_vec())) } } else { - None - }) + Ok(None) + } } #[inline] @@ -566,7 +575,9 @@ impl IntWithSign { // Shrink num to fit the boundary of i64. fn shrink_from_signed_uint(num: u64, is_neg: bool) -> IntWithSign { let value = if is_neg { - num.min(-Int::min_value() as u64) + // Avoid int64 overflow error. + // -int64_min = int64_max + 1 + num.min(Int::max_value() as u64 + 1) } else { num.min(Int::max_value() as u64) }; @@ -594,7 +605,8 @@ impl IntWithSign { let IntWithSign(value, is_neg) = self; let IntWithSign(to_base, should_ignore_sign) = to_base; let mut real_val = value as i64; - if is_neg && !should_ignore_sign { + // real_val > 0 is to avoid overflow issue when value is -int64_min. + if is_neg && !should_ignore_sign && real_val > 0 { real_val = -real_val; } let mut ret = IntWithSign::format_radix(real_val as u64, to_base as u32); @@ -629,14 +641,17 @@ fn extract_num_str(s: &str, from_base: IntWithSign) -> Option<(String, bool)> { } } -fn extract_num(num_s: &str, is_neg: bool, from_base: IntWithSign) -> IntWithSign { +fn extract_num(num_s: &str, is_neg: bool, from_base: IntWithSign) -> Option { let IntWithSign(from_base, signed) = from_base; - let value = u64::from_str_radix(num_s, from_base as u32).unwrap(); - if signed { + let value = match u64::from_str_radix(num_s, from_base as u32) { + Ok(v) => v, + Err(_) => return None, + }; + Some(if signed { IntWithSign::shrink_from_signed_uint(value, is_neg) } else { IntWithSign::from_signed_uint(value, is_neg) - } + }) } // Returns (isize, is_positive): convert an i64 to usize, and whether the input @@ -1172,6 +1187,10 @@ mod tests { ), (Some(f64::NAN), None), (Some(f64::INFINITY), Some(Real::new(f64::INFINITY).unwrap())), + ( + Some(1.0E308), + Some(Real::new(1.0E308 * (std::f64::consts::PI / 180_f64)).unwrap()), + ), ]; for (input, expect) in test_cases { let output = RpnFnScalarEvaluator::new() @@ -1211,25 +1230,34 @@ mod tests { #[test] fn test_degrees() { let tests_cases = vec![ - (None, None), - (Some(f64::NAN), None), - (Some(0f64), Some(Real::new(0f64).unwrap())), - (Some(1f64), Some(Real::new(57.29577951308232_f64).unwrap())), + (None, None, false), + (Some(f64::NAN), None, false), + (Some(0f64), Some(Real::new(0f64).unwrap()), false), + ( + Some(1f64), + Some(Real::new(57.29577951308232_f64).unwrap()), + false, + ), ( Some(std::f64::consts::PI), Some(Real::new(180.0_f64).unwrap()), + false, ), ( Some(-std::f64::consts::PI / 2.0_f64), Some(Real::new(-90.0_f64).unwrap()), + false, ), + (Some(1.0E307), None, true), ]; - for (input, expect) in tests_cases { + for (input, expect, is_err) in tests_cases { let output = RpnFnScalarEvaluator::new() .push_param(input) - .evaluate(ScalarFuncSig::Degrees) - .unwrap(); - assert_eq!(expect, output, "{:?}", input); + .evaluate(ScalarFuncSig::Degrees); + assert_eq!(is_err, output.is_err()); + if let Ok(out) = output { + assert_eq!(expect, out, "{:?}", input); + } } } @@ -1605,6 +1633,18 @@ mod tests { ("+", 10, 8, "0"), ("-", 10, 8, "0"), ("", 2, 16, "0"), + ( + "18446744073709551615", + 10, + 2, + "1111111111111111111111111111111111111111111111111111111111111111", + ), + ( + "-18446744073709551615", + -10, + 2, + 
"1000000000000000000000000000000000000000000000000000000000000000", + ), ]; for (n, f, t, e) in tests { let n = Some(n.as_bytes().to_vec()); @@ -1621,17 +1661,37 @@ mod tests { } let invalid_tests = vec![ - (None, Some(10), Some(10), None), - (Some(b"a6a".to_vec()), Some(1), Some(8), None), + (None, Some(10), Some(10)), + (Some(b"111".to_vec()), None, Some(7)), + (Some(b"112".to_vec()), Some(10), None), + (None, None, None), + (Some(b"222".to_vec()), Some(2), Some(100)), + (Some(b"333".to_vec()), Some(37), Some(2)), + (Some(b"a6a".to_vec()), Some(1), Some(8)), ]; - for (n, f, t, e) in invalid_tests { + for (n, f, t) in invalid_tests { let got = RpnFnScalarEvaluator::new() .push_param(n) .push_param(f) .push_param(t) .evaluate::(ScalarFuncSig::Conv) .unwrap(); - assert_eq!(got, e); + assert_eq!(got, None); + } + + let error_tests = vec![ + ("18446744073709551616", Some(10), Some(10)), + ("100000000000000000001", Some(10), Some(8)), + ("-18446744073709551616", Some(-10), Some(4)), + ]; + for (n, f, t) in error_tests { + let n = Some(n.as_bytes().to_vec()); + let got = RpnFnScalarEvaluator::new() + .push_param(n) + .push_param(f) + .push_param(t) + .evaluate::(ScalarFuncSig::Conv); + got.unwrap_err(); } } diff --git a/components/tidb_query_expr/src/impl_string.rs b/components/tidb_query_expr/src/impl_string.rs index f3b9b03c287..25c9294d533 100644 --- a/components/tidb_query_expr/src/impl_string.rs +++ b/components/tidb_query_expr/src/impl_string.rs @@ -635,15 +635,22 @@ fn field(args: &[Option<&T>]) -> Result #[rpn_fn(nullable, varg, min_args = 1)] #[inline] -fn field_bytes(args: &[Option]) -> Result> { +fn field_bytes(args: &[Option]) -> Result> { Ok(Some(match args[0] { // As per the MySQL doc, if the first argument is NULL, this function always returns 0. 
None => 0, - Some(val) => args - .iter() - .skip(1) - .position(|&i| i == Some(val)) - .map_or(0, |pos| (pos + 1) as i64), + Some(val) => { + for (pos, arg) in args.iter().enumerate().skip(1) { + if arg.is_none() { + continue; + } + match C::sort_compare(val, arg.unwrap()) { + Ok(Ordering::Equal) => return Ok(Some(pos as i64)), + _ => continue, + } + } + 0 + } })) } @@ -3214,6 +3221,7 @@ mod tests { Some(b"baz".to_vec()), ], Some(1), + Collation::Utf8Mb4Bin, ), ( vec![ @@ -3223,6 +3231,7 @@ mod tests { Some(b"hello".to_vec()), ], Some(0), + Collation::Utf8Mb4Bin, ), ( vec![ @@ -3232,6 +3241,7 @@ mod tests { Some(b"hello".to_vec()), ], Some(3), + Collation::Utf8Mb4Bin, ), ( vec![ @@ -3244,6 +3254,7 @@ mod tests { Some(b"Hello".to_vec()), ], Some(6), + Collation::Utf8Mb4Bin, ), ( vec![ @@ -3252,14 +3263,37 @@ mod tests { Some(b"Hello World!".to_vec()), ], Some(0), + Collation::Utf8Mb4Bin, + ), + ( + vec![None, None, Some(b"Hello World!".to_vec())], + Some(0), + Collation::Utf8Mb4Bin, + ), + ( + vec![Some(b"Hello World!".to_vec())], + Some(0), + Collation::Utf8Mb4Bin, + ), + ( + vec![ + Some(b"a".to_vec()), + Some(b"A".to_vec()), + Some(b"a".to_vec()), + ], + Some(1), + Collation::Utf8Mb4GeneralCi, ), - (vec![None, None, Some(b"Hello World!".to_vec())], Some(0)), - (vec![Some(b"Hello World!".to_vec())], Some(0)), ]; - for (args, expect_output) in test_cases { + for (args, expect_output, collation) in test_cases { let output = RpnFnScalarEvaluator::new() .push_params(args) + .return_field_type( + FieldTypeBuilder::new() + .tp(FieldTypeTp::Long) + .collation(collation), + ) .evaluate(ScalarFuncSig::FieldString) .unwrap(); assert_eq!(output, expect_output); diff --git a/components/tidb_query_expr/src/lib.rs b/components/tidb_query_expr/src/lib.rs index c2ef6722148..61fb3612b63 100644 --- a/components/tidb_query_expr/src/lib.rs +++ b/components/tidb_query_expr/src/lib.rs @@ -409,6 +409,14 @@ fn map_lower_utf8_sig(value: ScalarFuncSig, children: &[Expr]) -> Result Result { + Ok(match_template_collator! { + TT, match ret_field_type.as_accessor().collation().map_err(tidb_query_datatype::codec::Error::from)? { + Collation::TT => field_bytes_fn_meta::() + } + }) +} + #[rustfmt::skip] fn map_expr_node_to_rpn_func(expr: &Expr) -> Result { let value = expr.get_sig(); @@ -787,7 +795,7 @@ fn map_expr_node_to_rpn_func(expr: &Expr) -> Result { ScalarFuncSig::Locate3Args => locate_3_args_fn_meta(), ScalarFuncSig::FieldInt => field_fn_meta::(), ScalarFuncSig::FieldReal => field_fn_meta::(), - ScalarFuncSig::FieldString => field_bytes_fn_meta(), + ScalarFuncSig::FieldString => map_field_string_sig(ft)?, ScalarFuncSig::Elt => elt_fn_meta(), ScalarFuncSig::MakeSet => make_set_fn_meta(), ScalarFuncSig::Space => space_fn_meta(), diff --git a/components/tikv_alloc/src/default.rs b/components/tikv_alloc/src/default.rs index 2674331c3cd..5133d76e172 100644 --- a/components/tikv_alloc/src/default.rs +++ b/components/tikv_alloc/src/default.rs @@ -8,6 +8,7 @@ use crate::AllocStats; pub fn dump_stats() -> String { String::new() } + pub fn dump_prof(_path: &str) -> ProfResult<()> { Err(ProfError::MemProfilingNotEnabled) } @@ -24,6 +25,14 @@ pub fn deactivate_prof() -> ProfResult<()> { Err(ProfError::MemProfilingNotEnabled) } +pub fn set_prof_sample(_rate: u64) -> ProfResult<()> { + Err(ProfError::MemProfilingNotEnabled) +} + +pub fn is_profiling_active() -> bool { + false +} + /// # Safety /// /// It is safe. The unsafe marker is just for matching the function signature. 
diff --git a/components/tikv_alloc/src/jemalloc.rs b/components/tikv_alloc/src/jemalloc.rs index 876afa9fcd5..245f6280b71 100644 --- a/components/tikv_alloc/src/jemalloc.rs +++ b/components/tikv_alloc/src/jemalloc.rs @@ -133,7 +133,7 @@ pub fn remove_thread_memory_accessor() { use std::thread::ThreadId; -pub use self::profiling::{activate_prof, deactivate_prof, dump_prof}; +pub use self::profiling::*; pub fn dump_stats() -> String { let mut buf = Vec::with_capacity(1024); @@ -311,6 +311,21 @@ mod profiling { // C string should end with a '\0'. const PROF_ACTIVE: &[u8] = b"prof.active\0"; const PROF_DUMP: &[u8] = b"prof.dump\0"; + const PROF_RESET: &[u8] = b"prof.reset\0"; + const OPT_PROF: &[u8] = b"opt.prof\0"; + + pub fn set_prof_sample(rate: u64) -> ProfResult<()> { + let rate = (rate as f64).log2().ceil() as usize; + unsafe { + if let Err(e) = tikv_jemalloc_ctl::raw::write(PROF_RESET, rate) { + return Err(ProfError::JemallocError(format!( + "failed to set prof sample: {}", + e + ))); + } + } + Ok(()) + } pub fn activate_prof() -> ProfResult<()> { unsafe { @@ -351,22 +366,44 @@ mod profiling { Ok(()) } + pub fn is_profiling_active() -> bool { + match unsafe { tikv_jemalloc_ctl::raw::read(PROF_ACTIVE) } { + Err(e) => { + panic!("is_profiling_active: {:?}", e); + } + Ok(prof) => prof, + } + } + + pub fn is_profiling_enabled() -> bool { + match unsafe { tikv_jemalloc_ctl::raw::read(OPT_PROF) } { + Err(e) => { + // Shouldn't be possible since mem-profiling is set + panic!("is_profiling_enabled: {:?}", e); + } + Ok(prof) => prof, + } + } + #[cfg(test)] mod tests { use std::fs; use tempfile::Builder; - const OPT_PROF: &[u8] = b"opt.prof\0"; + use super::*; - fn is_profiling_on() -> bool { - match unsafe { tikv_jemalloc_ctl::raw::read(OPT_PROF) } { - Err(e) => { - // Shouldn't be possible since mem-profiling is set - panic!("is_profiling_on: {:?}", e); - } - Ok(prof) => prof, - } + #[test] + #[ignore = "#ifdef MALLOC_CONF"] + fn test_profiling_active() { + // Make sure somebody has turned on profiling + assert!(is_profiling_enabled(), "set MALLOC_CONF=prof:true"); + activate_prof().unwrap(); + assert!(is_profiling_active()); + deactivate_prof().unwrap(); + assert!(!is_profiling_active()); + + super::set_prof_sample(512 * 1024 * 1024).unwrap(); } // Only trigger this test with jemallocs `opt.prof` set to @@ -382,7 +419,7 @@ mod profiling { #[ignore = "#ifdef MALLOC_CONF"] fn test_profiling_memory_ifdef_malloc_conf() { // Make sure somebody has turned on profiling - assert!(is_profiling_on(), "set MALLOC_CONF=prof:true"); + assert!(is_profiling_enabled(), "set MALLOC_CONF=prof:true"); let dir = Builder::new() .prefix("test_profiling_memory") @@ -391,11 +428,11 @@ mod profiling { let os_path = dir.path().to_path_buf().join("test1.dump").into_os_string(); let path = os_path.into_string().unwrap(); - super::dump_prof(&path).unwrap(); + dump_prof(&path).unwrap(); let os_path = dir.path().to_path_buf().join("test2.dump").into_os_string(); let path = os_path.into_string().unwrap(); - super::dump_prof(&path).unwrap(); + dump_prof(&path).unwrap(); let files = fs::read_dir(dir.path()).unwrap().count(); assert_eq!(files, 2); @@ -431,4 +468,10 @@ mod profiling { pub fn deactivate_prof() -> ProfResult<()> { Err(ProfError::MemProfilingNotEnabled) } + pub fn set_prof_sample(_rate: u64) -> ProfResult<()> { + Err(ProfError::MemProfilingNotEnabled) + } + pub fn is_profiling_active() -> bool { + false + } } diff --git a/components/tikv_kv/src/raft_extension.rs b/components/tikv_kv/src/raft_extension.rs index 
26c9e687ef6..7ab4c1c030d 100644 --- a/components/tikv_kv/src/raft_extension.rs +++ b/components/tikv_kv/src/raft_extension.rs @@ -32,6 +32,9 @@ pub trait RaftExtension: Clone + Send { /// Report the target store is unreachable. fn report_store_unreachable(&self, _store_id: u64) {} + /// Report the target store may be tombstone. + fn report_store_maybe_tombstone(&self, _store_id: u64) {} + /// Report the status of snapshot. fn report_snapshot_status(&self, _region_id: u64, _to_peer_id: u64, _status: SnapshotStatus) {} diff --git a/components/tikv_util/Cargo.toml b/components/tikv_util/Cargo.toml index 6de354fa259..4e509c7f153 100644 --- a/components/tikv_util/Cargo.toml +++ b/components/tikv_util/Cargo.toml @@ -9,7 +9,8 @@ failpoints = ["fail/failpoints"] test-cgroup = [] [dependencies] -async-speed-limit = "0.4.0" +# TODO: use `async-speed-limit` in crates.io after new version(v0.4.2) is released. +async-speed-limit = { git = "https://github.com/tikv/async-speed-limit", branch = "master" } backtrace = "0.3.9" byteorder = "1.2" bytes = "1.0" @@ -23,6 +24,7 @@ crossbeam-skiplist = "0.1" derive_more = "0.99.3" error_code = { workspace = true } fail = "0.5" +fs2 = "0.4" futures = { version = "0.3", features = ["compat", "thread-pool"] } futures-util = { version = "0.3", default-features = false, features = ["io"] } grpcio = { workspace = true } @@ -30,7 +32,10 @@ http = "0.2.0" kvproto = { workspace = true } lazy_static = "1.3" libc = "0.2" -log = { version = "0.4", features = ["max_level_trace", "release_max_level_debug"] } +log = { version = "0.4", features = [ + "max_level_trace", + "release_max_level_debug", +] } log_wrappers = { workspace = true } mnt = "0.3.1" nix = "0.24" @@ -38,6 +43,7 @@ num-traits = "0.2" num_cpus = "1" online_config = { workspace = true } openssl = "0.10" +ordered-float = "2.6" parking_lot_core = "0.9.1" pin-project = "1.0" prometheus = { version = "0.13", features = ["nightly"] } @@ -57,7 +63,7 @@ thiserror = "1.0" tikv_alloc = { workspace = true } time = "0.1" tokio = { version = "1.5", features = ["rt-multi-thread"] } -tokio-executor = "0.1" +tokio-executor = { workspace = true } tokio-timer = { workspace = true } tracker = { workspace = true } url = "2" diff --git a/components/tikv_util/src/codec/stream_event.rs b/components/tikv_util/src/codec/stream_event.rs index 5b00cad6372..3c1a04f77e3 100644 --- a/components/tikv_util/src/codec/stream_event.rs +++ b/components/tikv_util/src/codec/stream_event.rs @@ -6,6 +6,13 @@ use bytes::{Buf, Bytes}; use crate::{codec::Result, Either}; +// Note: maybe allow them to be different lifetime. +// But not necessary for now, so keep it simple...? 
+pub struct Rewrite<'a> { + from: &'a [u8], + to: &'a [u8], +} + pub trait Iterator { fn next(&mut self) -> Result<()>; @@ -19,10 +26,12 @@ pub trait Iterator { pub struct EventIterator<'a> { buf: &'a [u8], offset: usize, - key_offset: usize, value_offset: usize, - key_len: usize, value_len: usize, + + key_buf: Vec, + + rewrite_rule: Option>, } impl EventIterator<'_> { @@ -30,10 +39,21 @@ impl EventIterator<'_> { EventIterator { buf, offset: 0, - key_offset: 0, - key_len: 0, + key_buf: vec![], value_offset: 0, value_len: 0, + rewrite_rule: None, + } + } + + pub fn with_rewriting<'a>(buf: &'a [u8], from: &'a [u8], to: &'a [u8]) -> EventIterator<'a> { + EventIterator { + buf, + offset: 0, + key_buf: vec![], + value_offset: 0, + value_len: 0, + rewrite_rule: Some(Rewrite { from, to }), } } @@ -42,14 +62,47 @@ impl EventIterator<'_> { self.offset += 4; result } + + fn consume_key_with_len(&mut self, key_len: usize) { + self.key_buf.clear(); + self.key_buf.reserve(key_len); + self.key_buf + .extend_from_slice(&self.buf[self.offset..self.offset + key_len]); + self.offset += key_len; + } + + fn move_to_next_key_with_rewrite(&mut self) { + let key_len = self.get_size() as usize; + let rewrite = self.rewrite_rule.as_ref().expect("rewrite rule not set"); + if key_len < rewrite.from.len() + || &self.buf[self.offset..self.offset + rewrite.from.len()] != rewrite.from + { + self.consume_key_with_len(key_len); + return; + } + self.key_buf.clear(); + self.key_buf + .reserve(rewrite.to.len() + key_len - rewrite.from.len()); + self.key_buf.extend_from_slice(rewrite.to); + self.key_buf + .extend_from_slice(&self.buf[self.offset + rewrite.from.len()..self.offset + key_len]); + self.offset += key_len; + } + + fn fetch_key_buffer_and_move_to_value(&mut self) { + if self.rewrite_rule.is_some() { + self.move_to_next_key_with_rewrite() + } else { + let key_len = self.get_size() as usize; + self.consume_key_with_len(key_len); + } + } } impl Iterator for EventIterator<'_> { fn next(&mut self) -> Result<()> { if self.valid() { - self.key_len = self.get_size() as usize; - self.key_offset = self.offset; - self.offset += self.key_len; + self.fetch_key_buffer_and_move_to_value(); self.value_len = self.get_size() as usize; self.value_offset = self.offset; @@ -63,7 +116,7 @@ impl Iterator for EventIterator<'_> { } fn key(&self) -> &[u8] { - &self.buf[self.key_offset..self.key_offset + self.key_len] + &self.key_buf[..] 
} fn value(&self) -> &[u8] { @@ -155,4 +208,44 @@ mod tests { } assert_eq!(count, index); } + + #[test] + fn test_rewrite() { + let mut rng = rand::thread_rng(); + let mut event = vec![]; + let mut keys = vec![]; + let mut vals = vec![]; + let count = 20; + + for _i in 0..count { + let should_rewrite = rng.gen::<bool>(); + let mut key: Vec<u8> = std::iter::once(if should_rewrite { b'k' } else { b'l' }) + .chain((0..100).map(|_| rng.gen_range(0..255))) + .collect(); + let val: Vec<u8> = (0..100).map(|_| rng.gen_range(0..255)).collect(); + let e = EventEncoder::encode_event(&key, &val); + for s in e { + event.extend_from_slice(s.as_ref()); + } + if should_rewrite { + key[0] = b'r'; + } + keys.push(key); + vals.push(val); + } + + let mut iter = EventIterator::with_rewriting(&event, b"k", b"r"); + + let mut index = 0_usize; + loop { + if !iter.valid() { + break; + } + iter.next().unwrap(); + assert_eq!(iter.key(), keys[index]); + assert_eq!(iter.value(), vals[index]); + index += 1; + } + assert_eq!(count, index); + } } diff --git a/components/tikv_util/src/deadline.rs b/components/tikv_util/src/deadline.rs index 84463f507b9..64416999fe3 100644 --- a/components/tikv_util/src/deadline.rs +++ b/components/tikv_util/src/deadline.rs @@ -1,6 +1,7 @@ // Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. use fail::fail_point; +use kvproto::errorpb; use super::time::{Duration, Instant}; @@ -58,3 +59,11 @@ impl Deadline { std::time::Instant::now() + self.deadline.duration_since(Instant::now_coarse()) } } + +const DEADLINE_EXCEEDED: &str = "deadline is exceeded"; + +pub fn set_deadline_exceeded_busy_error(e: &mut errorpb::Error) { + let mut server_is_busy_err = errorpb::ServerIsBusy::default(); + server_is_busy_err.set_reason(DEADLINE_EXCEEDED.to_owned()); + e.set_server_is_busy(server_is_busy_err); +} diff --git a/components/tikv_util/src/lib.rs b/components/tikv_util/src/lib.rs index b8aa578a878..59f255faaeb 100644 --- a/components/tikv_util/src/lib.rs +++ b/components/tikv_util/src/lib.rs @@ -5,6 +5,7 @@ #![feature(box_patterns)] #![feature(vec_into_raw_parts)] #![feature(let_chains)] +#![feature(div_duration)] #[cfg(test)] extern crate test; @@ -54,6 +55,7 @@ pub mod memory; pub mod metrics; pub mod mpsc; pub mod quota_limiter; +pub mod slow_score; pub mod store; pub mod stream; pub mod sys; @@ -612,6 +614,22 @@ pub fn set_vec_capacity<T>(v: &mut Vec<T>, cap: usize) { } } +#[derive(Debug, Clone, Copy, PartialEq)] +pub enum InspectFactor { + RaftDisk = 0, + KvDisk, + // TODO: Add more factors, like network io. +} + +impl InspectFactor { + pub fn as_str(&self) -> &str { + match *self { + InspectFactor::RaftDisk => "raft", + InspectFactor::KvDisk => "kvdb", + } + } +} + #[cfg(test)] mod tests { use std::{ diff --git a/components/tikv_util/src/log.rs b/components/tikv_util/src/log.rs index fd351eecbd4..91bd5013c1e 100644 --- a/components/tikv_util/src/log.rs +++ b/components/tikv_util/src/log.rs @@ -83,6 +83,18 @@ macro_rules! trace(($($args:tt)+) => { ::slog_global::trace!($($args)+) };); +/// Logs an info or debug level message using the slog global logger. +#[macro_export] +macro_rules!
info_or_debug{ + ($cond:expr; $($args:tt)+) => { + if $cond { + info!($($args)+) + } else { + debug!($($args)+) + } + }; +} + use std::fmt::{self, Display, Write}; use slog::{BorrowedKV, OwnedKVList, Record, KV}; diff --git a/components/tikv_util/src/logger/mod.rs b/components/tikv_util/src/logger/mod.rs index 5ebe9468a50..1eb82ced3e2 100644 --- a/components/tikv_util/src/logger/mod.rs +++ b/components/tikv_util/src/logger/mod.rs @@ -72,6 +72,24 @@ where } }; + fn build_log_drain<I>( + drain: I, + threshold: u64, + filter: impl FilterFn, + ) -> impl Drain + where + I: Drain, + { + let drain = SlowLogFilter { + threshold, + inner: drain, + }; + let drain = ThreadIDrain(drain); + // Let GlobalLevelFilter wrap ThreadIDrain, so that it saves getting + // thread id for filtered logs. + GlobalLevelFilter::new(drain.filter(filter).fuse()) + } + let (logger, guard) = if use_async { let (async_log, guard) = Async::new(LogAndFuse(drain)) .chan_size(SLOG_CHANNEL_SIZE) @@ -79,21 +97,12 @@ where .thread_name(thd_name!("slogger")) .build_with_guard(); let drain = async_log.fuse(); - let drain = SlowLogFilter { - threshold: slow_threshold, - inner: drain, - }; - let filtered = GlobalLevelFilter::new(drain.filter(filter).fuse()); - - (slog::Logger::root(filtered, slog_o!()), Some(guard)) + let drain = build_log_drain(drain, slow_threshold, filter); + (slog::Logger::root(drain, slog_o!()), Some(guard)) } else { let drain = LogAndFuse(Mutex::new(drain)); - let drain = SlowLogFilter { - threshold: slow_threshold, - inner: drain, - }; - let filtered = GlobalLevelFilter::new(drain.filter(filter).fuse()); - (slog::Logger::root(filtered, slog_o!()), None) + let drain = build_log_drain(drain, slow_threshold, filter); + (slog::Logger::root(drain, slog_o!()), None) }; set_global_logger(level, init_stdlog, logger, guard) @@ -628,6 +637,24 @@ fn write_log_fields( Ok(()) } +struct ThreadIDrain<D>(pub D); + +impl<D> Drain for ThreadIDrain<D> +where + D: Drain, +{ + type Ok = D::Ok; + type Err = D::Err; + fn log(&self, record: &Record<'_>, values: &OwnedKVList) -> Result<Self::Ok, Self::Err> { + let values = slog::o!( + "thread_id" => std::thread::current().id().as_u64().get(), + // OwnedKVList is essentially an Arc, clone is cheap. + values.clone(), + ); + self.0.log(record, &OwnedKVList::from(values)) + } +} + struct Serializer<'a> { decorator: &'a mut dyn RecordDecorator, } @@ -679,7 +706,7 @@ impl<'a> slog::Serializer for Serializer<'a> { #[cfg(test)] mod tests { - use std::{cell::RefCell, io, io::Write, str::from_utf8}; + use std::{cell::RefCell, io, io::Write, str::from_utf8, sync::Arc, time::Duration}; use chrono::DateTime; use regex::Regex; @@ -688,25 +715,17 @@ mod tests { use super::*; - // Due to the requirements of `Logger::root*` on a writer with a 'static - // lifetime we need to make a Thread Local, - // and implement a custom writer. - thread_local! { - static BUFFER: RefCell<Vec<u8>> = RefCell::new(Vec::new()); - } - struct TestWriter; + struct TestWriter(Arc<Mutex<Vec<u8>>>); impl Write for TestWriter { fn write(&mut self, buf: &[u8]) -> io::Result<usize> { - BUFFER.with(|buffer| buffer.borrow_mut().write(buf)) + self.0.lock().unwrap().write(buf) } fn flush(&mut self) -> io::Result<()> { - BUFFER.with(|buffer| buffer.borrow_mut().flush()) + self.0.lock().unwrap().flush() } } fn log_format_cases(logger: slog::Logger) { - use std::time::Duration; - // Empty message is not recommended, just for test purposes here.
slog_info!(logger, ""); slog_info!(logger, "Welcome"); @@ -761,113 +780,125 @@ mod tests { #[test] fn test_log_format_text() { - let decorator = PlainSyncDecorator::new(TestWriter); + let buffer: Arc>> = Arc::default(); + let decorator = PlainSyncDecorator::new(TestWriter(buffer.clone())); let drain = TikvFormat::new(decorator, true).fuse(); + let drain = ThreadIDrain(drain); + let drain = slog::Logger::root_typed(drain, slog_o!("raft_id" => 1)).into_erased(); let logger = slog::Logger::root_typed(drain, slog_o!()).into_erased(); log_format_cases(logger); - let expect = r#"[2019/01/15 13:40:39.619 +08:00] [INFO] [mod.rs:469] [] -[2019/01/15 13:40:39.619 +08:00] [INFO] [mod.rs:469] [Welcome] -[2019/01/15 13:40:39.619 +08:00] [INFO] [mod.rs:470] ["Welcome TiKV"] -[2019/01/15 13:40:39.619 +08:00] [INFO] [mod.rs:471] [欢迎] -[2019/01/15 13:40:39.619 +08:00] [INFO] [mod.rs:472] ["欢迎 TiKV"] -[2019/01/15 13:40:39.615 +08:00] [INFO] [mod.rs:455] ["failed to fetch URL"] [backoff=3s] [attempt=3] [url=http://example.com] -[2019/01/15 13:40:39.619 +08:00] [INFO] [mod.rs:460] ["failed to \"fetch\" [URL]: http://example.com"] -[2019/01/15 13:40:39.619 +08:00] [DEBUG] [mod.rs:463] ["Slow query"] ["process keys"=1500] [duration=123ns] [sql="SELECT * FROM TABLE WHERE ID=\"abc\""] -[2019/01/15 13:40:39.619 +08:00] [WARN] [mod.rs:473] [Type] [Other=-inf] [Score=inf] [Counter=NaN] -[2019/01/16 16:56:04.854 +08:00] [INFO] [mod.rs:391] ["more type tests"] [str_array="[\"💖\", \"�\", \"☺☻☹\", \"日a本b語ç日ð本Ê語þ日¥本¼語i日©\", \"日a本b語ç日ð本Ê語þ日¥本¼語i日©日a本b語ç日ð本Ê語þ日¥本¼語i日©日a本b語ç日ð本Ê語þ日¥本¼語i日©\", \"\\\\x80\\\\x80\\\\x80\\\\x80\", \"XML\"]"] [u8=34] [is_None=None] [is_false=false] [is_true=true] ["store ids"="[1, 2, 3]"] [url-peers="[\"peer1\", \"peer 2\"]"] [urls="[\"http://xxx.com:2347\", \"http://xxx.com:2432\"]"] [field2="in quote"] [field1=no_quote] -"#; - - BUFFER.with(|buffer| { - let mut buffer = buffer.borrow_mut(); - let output = from_utf8(&buffer).unwrap(); - assert_eq!(output.lines().count(), expect.lines().count()); - - let re = Regex::new(r"(?P\[.*?\])\s(?P\[.*?\])\s(?P\[.*?\])\s(?P\[.*?\])\s?(?P\[.*\])?").unwrap(); + let thread_id = std::thread::current().id().as_u64(); + let expect = format!( + r#"[2019/01/15 13:40:39.619 +08:00] [INFO] [mod.rs:469] [] [raft_id=1] [thread_id={0}] +[2019/01/15 13:40:39.619 +08:00] [INFO] [mod.rs:469] [Welcome] [raft_id=1] [thread_id={0}] +[2019/01/15 13:40:39.619 +08:00] [INFO] [mod.rs:470] ["Welcome TiKV"] [raft_id=1] [thread_id={0}] +[2019/01/15 13:40:39.619 +08:00] [INFO] [mod.rs:471] [欢迎] [raft_id=1] [thread_id={0}] +[2019/01/15 13:40:39.619 +08:00] [INFO] [mod.rs:472] ["欢迎 TiKV"] [raft_id=1] [thread_id={0}] +[2019/01/15 13:40:39.615 +08:00] [INFO] [mod.rs:455] ["failed to fetch URL"] [backoff=3s] [attempt=3] [url=http://example.com] [raft_id=1] [thread_id={0}] +[2019/01/15 13:40:39.619 +08:00] [INFO] [mod.rs:460] ["failed to \"fetch\" [URL]: http://example.com"] [raft_id=1] [thread_id={0}] +[2019/01/15 13:40:39.619 +08:00] [DEBUG] [mod.rs:463] ["Slow query"] ["process keys"=1500] [duration=123ns] [sql="SELECT * FROM TABLE WHERE ID=\"abc\""] [raft_id=1] [thread_id={0}] +[2019/01/15 13:40:39.619 +08:00] [WARN] [mod.rs:473] [Type] [Other=-inf] [Score=inf] [Counter=NaN] [raft_id=1] [thread_id={0}] +[2019/01/16 16:56:04.854 +08:00] [INFO] [mod.rs:391] ["more type tests"] [str_array="[\"💖\", \"�\", \"☺☻☹\", \"日a本b語ç日ð本Ê語þ日¥本¼語i日©\", \"日a本b語ç日ð本Ê語þ日¥本¼語i日©日a本b語ç日ð本Ê語þ日¥本¼語i日©日a本b語ç日ð本Ê語þ日¥本¼語i日©\", \"\\\\x80\\\\x80\\\\x80\\\\x80\", \"XML\"]"] [u8=34] [is_None=None] [is_false=false] 
[is_true=true] ["store ids"="[1, 2, 3]"] [url-peers="[\"peer1\", \"peer 2\"]"] [urls="[\"http://xxx.com:2347\", \"http://xxx.com:2432\"]"] [field2="in quote"] [field1=no_quote] [raft_id=1] [thread_id={0}] +"#, + thread_id + ); - for (output_line, expect_line) in output.lines().zip(expect.lines()) { - let expect_segments = re.captures(expect_line).unwrap(); - let output_segments = re.captures(output_line).unwrap(); + let buffer = buffer.lock().unwrap(); + let output = from_utf8(&buffer).unwrap(); + assert_eq!( + output.lines().count(), + expect.lines().count(), + "{}\n===\n{}", + output, + expect + ); - validate_log_datetime(peel(&output_segments["datetime"])); + let re = Regex::new(r"(?P\[.*?\])\s(?P\[.*?\])\s(?P\[.*?\])\s(?P\[.*?\])\s?(?P\[.*\])?").unwrap(); - assert!(validate_log_source_file( - peel(&expect_segments["source_file"]), - peel(&output_segments["source_file"]) - )); - assert_eq!(expect_segments["level"], output_segments["level"]); - assert_eq!(expect_segments["msg"], output_segments["msg"]); - assert_eq!( - expect_segments.name("kvs").map(|s| s.as_str()), - output_segments.name("kvs").map(|s| s.as_str()) - ); - } - buffer.clear(); - }); + for (output_line, expect_line) in output.lines().zip(expect.lines()) { + let expect_segments = re.captures(expect_line).unwrap(); + let output_segments = re.captures(output_line).unwrap(); + + validate_log_datetime(peel(&output_segments["datetime"])); + + assert!(validate_log_source_file( + peel(&expect_segments["source_file"]), + peel(&output_segments["source_file"]) + )); + assert_eq!(expect_segments["level"], output_segments["level"]); + assert_eq!(expect_segments["msg"], output_segments["msg"]); + assert_eq!( + expect_segments.name("kvs").map(|s| s.as_str()), + output_segments.name("kvs").map(|s| s.as_str()) + ); + } } #[test] fn test_log_format_json() { use serde_json::{from_str, Value}; - let drain = Mutex::new(json_format(TestWriter, true)).map(slog::Fuse); + let buffer: Arc>> = Arc::default(); + let drain = Mutex::new(json_format(TestWriter(buffer.clone()), true)).map(slog::Fuse); + let drain = ThreadIDrain(drain); let logger = slog::Logger::root_typed(drain, slog_o!()).into_erased(); log_format_cases(logger); - let expect = r#"{"time":"2020/05/16 15:49:52.449 +08:00","level":"INFO","caller":"mod.rs:469","message":""} -{"time":"2020/05/16 15:49:52.450 +08:00","level":"INFO","caller":"mod.rs:469","message":"Welcome"} -{"time":"2020/05/16 15:49:52.450 +08:00","level":"INFO","caller":"mod.rs:470","message":"Welcome TiKV"} -{"time":"2020/05/16 15:49:52.450 +08:00","level":"INFO","caller":"mod.rs:471","message":"欢迎"} -{"time":"2020/05/16 15:49:52.450 +08:00","level":"INFO","caller":"mod.rs:472","message":"欢迎 TiKV"} -{"time":"2020/05/16 15:49:52.450 +08:00","level":"INFO","caller":"mod.rs:455","message":"failed to fetch URL","backoff":"3s","attempt":3,"url":"http://example.com"} -{"time":"2020/05/16 15:49:52.450 +08:00","level":"INFO","caller":"mod.rs:460","message":"failed to \"fetch\" [URL]: http://example.com"} -{"time":"2020/05/16 15:49:52.450 +08:00","level":"DEBUG","caller":"mod.rs:463","message":"Slow query","process keys":1500,"duration":"123ns","sql":"SELECT * FROM TABLE WHERE ID=\"abc\""} -{"time":"2020/05/16 15:49:52.450 +08:00","level":"WARN","caller":"mod.rs:473","message":"Type","Other":null,"Score":null,"Counter":null} -{"time":"2020/05/16 15:49:52.451 +08:00","level":"INFO","caller":"mod.rs:391","message":"more type tests","str_array":"[\"💖\", \"�\", \"☺☻☹\", \"日a本b語ç日ð本Ê語þ日¥本¼語i日©\", 
\"日a本b語ç日ð本Ê語þ日¥本¼語i日©日a本b語ç日ð本Ê語þ日¥本¼語i日©日a本b語ç日ð本Ê語þ日¥本¼語i日©\", \"\\\\x80\\\\x80\\\\x80\\\\x80\", \"XML\"]","u8":34,"is_None":null,"is_false":false,"is_true":true,"store ids":"[1, 2, 3]","url-peers":"[\"peer1\", \"peer 2\"]","urls":"[\"http://xxx.com:2347\", \"http://xxx.com:2432\"]","field2":"in quote","field1":"no_quote"} -"#; + let thread_id = std::thread::current().id().as_u64(); + let expect = format!( + r#"{{"time":"2020/05/16 15:49:52.449 +08:00","level":"INFO","caller":"mod.rs:469","message":"","thread_id":{0}}} +{{"time":"2020/05/16 15:49:52.450 +08:00","level":"INFO","caller":"mod.rs:469","message":"Welcome","thread_id":{0}}} +{{"time":"2020/05/16 15:49:52.450 +08:00","level":"INFO","caller":"mod.rs:470","message":"Welcome TiKV","thread_id":{0}}} +{{"time":"2020/05/16 15:49:52.450 +08:00","level":"INFO","caller":"mod.rs:471","message":"欢迎","thread_id":{0}}} +{{"time":"2020/05/16 15:49:52.450 +08:00","level":"INFO","caller":"mod.rs:472","message":"欢迎 TiKV","thread_id":{0}}} +{{"time":"2020/05/16 15:49:52.450 +08:00","level":"INFO","caller":"mod.rs:455","message":"failed to fetch URL","backoff":"3s","attempt":3,"url":"http://example.com","thread_id":{0}}} +{{"time":"2020/05/16 15:49:52.450 +08:00","level":"INFO","caller":"mod.rs:460","message":"failed to \"fetch\" [URL]: http://example.com","thread_id":{0}}} +{{"time":"2020/05/16 15:49:52.450 +08:00","level":"DEBUG","caller":"mod.rs:463","message":"Slow query","process keys":1500,"duration":"123ns","sql":"SELECT * FROM TABLE WHERE ID=\"abc\"","thread_id":{0}}} +{{"time":"2020/05/16 15:49:52.450 +08:00","level":"WARN","caller":"mod.rs:473","message":"Type","Other":null,"Score":null,"Counter":null,"thread_id":{0}}} +{{"time":"2020/05/16 15:49:52.451 +08:00","level":"INFO","caller":"mod.rs:391","message":"more type tests","str_array":"[\"💖\", \"�\", \"☺☻☹\", \"日a本b語ç日ð本Ê語þ日¥本¼語i日©\", \"日a本b語ç日ð本Ê語þ日¥本¼語i日©日a本b語ç日ð本Ê語þ日¥本¼語i日©日a本b語ç日ð本Ê語þ日¥本¼語i日©\", \"\\\\x80\\\\x80\\\\x80\\\\x80\", \"XML\"]","u8":34,"is_None":null,"is_false":false,"is_true":true,"store ids":"[1, 2, 3]","url-peers":"[\"peer1\", \"peer 2\"]","urls":"[\"http://xxx.com:2347\", \"http://xxx.com:2432\"]","field2":"in quote","field1":"no_quote","thread_id":{0}}} +"#, + thread_id + ); - BUFFER.with(|buffer| { - let mut buffer = buffer.borrow_mut(); - let output = from_utf8(&buffer).unwrap(); - assert_eq!(output.lines().count(), expect.lines().count()); + let buffer = buffer.lock().unwrap(); + let output = from_utf8(&buffer).unwrap(); + assert_eq!(output.lines().count(), expect.lines().count()); - for (output_line, expect_line) in output.lines().zip(expect.lines()) { - let mut expect_json = from_str::(expect_line).unwrap(); - let mut output_json = from_str::(output_line).unwrap(); + for (output_line, expect_line) in output.lines().zip(expect.lines()) { + let mut expect_json = from_str::(expect_line).unwrap(); + let mut output_json = from_str::(output_line).unwrap(); - validate_log_datetime(output_json["time"].take().as_str().unwrap()); - // Remove time field to bypass timestamp mismatch. - let _ = expect_json["time"].take(); + validate_log_datetime(output_json["time"].take().as_str().unwrap()); + // Remove time field to bypass timestamp mismatch. 
+ let _ = expect_json["time"].take(); - validate_log_source_file( - output_json["caller"].take().as_str().unwrap(), - expect_json["caller"].take().as_str().unwrap(), - ); + validate_log_source_file( + output_json["caller"].take().as_str().unwrap(), + expect_json["caller"].take().as_str().unwrap(), + ); - assert_eq!(expect_json, output_json); - } - buffer.clear(); - }); + assert_eq!(expect_json, output_json); + } } #[test] fn test_global_level_filter() { - let decorator = PlainSyncDecorator::new(TestWriter); + let buffer: Arc>> = Arc::default(); + let decorator = PlainSyncDecorator::new(TestWriter(buffer.clone())); let drain = TikvFormat::new(decorator, true).fuse(); let logger = slog::Logger::root_typed(GlobalLevelFilter::new(drain), slog_o!()).into_erased(); let expected = "[2019/01/15 13:40:39.619 +08:00] [INFO] [mod.rs:871] [Welcome]\n"; let check_log = |log: &str| { - BUFFER.with(|buffer| { - let mut buffer = buffer.borrow_mut(); - let output = from_utf8(&buffer).unwrap(); - // only check the log len here as some field like timestamp, location may - // change. - assert_eq!(output.len(), log.len()); - buffer.clear(); - }); + let mut buffer = buffer.lock().unwrap(); + let output = from_utf8(&buffer).unwrap(); + // only check the log len here as some field like timestamp, location may + // change. + assert_eq!(output.len(), log.len()); + buffer.clear(); }; set_log_level(Level::Info); diff --git a/components/tikv_util/src/lru.rs b/components/tikv_util/src/lru.rs index 76fad6e8a34..302bfc9264b 100644 --- a/components/tikv_util/src/lru.rs +++ b/components/tikv_util/src/lru.rs @@ -135,6 +135,10 @@ impl Trace { r.key.as_ptr().read() } } + + fn get_tail(&self) -> &K { + unsafe { self.tail.as_ref().prev.as_ref().key.assume_init_ref() } + } } impl Drop for Trace { @@ -174,14 +178,52 @@ impl SizePolicy for CountTracker { } } -pub struct LruCache +/// Some [`EvictPolicy`] (e.g. the `TxnStatusCache` in +/// `tikv::storage::txn::txn_status_cache` module) may need to know what the +/// entry bing popped out is to determine if it really can be popped. But there +/// is performance cost to always get the tail entry. So we pass this interface +/// to the `should_evict` function. An implementation of `EvictPolicy` can read +/// the tail entry only when it really needs. +pub trait GetTailEntry { + fn get_tail_entry(&self) -> Option<(&K, &V)>; +} + +/// An [`EvictPolicy`] defines how the [`LruCache`] should determine an entry +/// at the tail should be popped out. +pub trait EvictPolicy { + fn should_evict( + &self, + current_size: usize, + capacity: usize, + get_tail_entry: &impl GetTailEntry, + ) -> bool; +} + +/// The default [`EvictPolicy`] of [`LruCache`], which pops out entries at the +/// tail when the limit specified by `capacity` is exceeded. 
+pub struct EvictOnFull; + +impl EvictPolicy for EvictOnFull { + fn should_evict( + &self, + current_size: usize, + capacity: usize, + _: &impl GetTailEntry, + ) -> bool { + capacity < current_size + } +} + +pub struct LruCache where T: SizePolicy, + E: EvictPolicy, { map: HashMap>, trace: Trace, capacity: usize, size_policy: T, + evict_policy: E, } impl LruCache @@ -189,18 +231,30 @@ where T: SizePolicy, { pub fn with_capacity_sample_and_trace( - mut capacity: usize, + capacity: usize, sample_mask: usize, size_policy: T, ) -> LruCache { + Self::new(capacity, sample_mask, size_policy, EvictOnFull) + } +} + +impl LruCache +where + T: SizePolicy, + E: EvictPolicy, +{ + pub fn new(mut capacity: usize, sample_mask: usize, size_policy: T, evict_policy: E) -> Self { + // The capacity is at least 1. if capacity == 0 { capacity = 1; } - LruCache { + Self { map: HashMap::default(), trace: Trace::new(sample_mask), capacity, size_policy, + evict_policy, } } @@ -215,10 +269,18 @@ where self.trace.clear(); self.size_policy.on_reset(0); } + + /// Get the capacity limited on the `LruCache`. #[inline] pub fn capacity(&self) -> usize { self.capacity } + + /// Get the capacity actually allocated by the internal data structure. + #[inline] + pub fn internal_allocated_capacity(&self) -> usize { + self.map.capacity() + } } impl LruCache @@ -234,25 +296,36 @@ where } } -impl LruCache +impl LruCache where K: Eq + Hash + Clone + std::fmt::Debug, T: SizePolicy, + E: EvictPolicy, { #[inline] - pub fn insert(&mut self, key: K, value: V) { + fn insert_impl(&mut self, key: K, value: V, replace: bool) -> bool { + let mut inserted = true; let mut old_key = None; let current_size = SizePolicy::::current(&self.size_policy); + // In case the current size exactly equals to capacity, we also expect to reuse + // tail when inserting. Use `current_size + 1` to include the case. + let should_evict_on_insert = + self.evict_policy + .should_evict(current_size + 1, self.capacity, self); match self.map.entry(key) { HashMapEntry::Occupied(mut e) => { - self.size_policy.on_remove(e.key(), &e.get().value); - self.size_policy.on_insert(e.key(), &value); - let mut entry = e.get_mut(); - self.trace.promote(entry.record); - entry.value = value; + if replace { + self.size_policy.on_remove(e.key(), &e.get().value); + self.size_policy.on_insert(e.key(), &value); + let mut entry = e.get_mut(); + self.trace.promote(entry.record); + entry.value = value; + } else { + inserted = false; + } } HashMapEntry::Vacant(v) => { - let record = if self.capacity <= current_size { + let record = if should_evict_on_insert { let res = self.trace.reuse_tail(v.key().clone()); old_key = Some(res.0); res.1 @@ -274,7 +347,8 @@ where // Perhaps we can reject entries larger than capacity goes in the LRU cache, but // that is impossible for now: the `SizePolicy` trait doesn't provide the // interface of querying the actual size of an item. - self.evict_until_fit() + self.evict_until_fit(); + inserted } fn evict_until_fit(&mut self) { @@ -283,7 +357,7 @@ where let current_size = self.size_policy.current(); // Should we keep at least one entry? So our users won't lose their fresh record // once it exceeds the capacity. 
- if current_size <= cap || self.map.is_empty() { + if !self.evict_policy.should_evict(current_size, cap, self) || self.map.is_empty() { break; } let key = self.trace.remove_tail(); @@ -292,6 +366,18 @@ where } } + #[inline] + pub fn insert(&mut self, key: K, value: V) { + self.insert_impl(key, value, true); + } + + /// Insert an entry if the key doesn't exist before. The existing entry + /// won't be replaced and won't be promoted to the most-recent place. + #[inline] + pub fn insert_if_not_exist(&mut self, key: K, value: V) -> bool { + self.insert_impl(key, value, false) + } + #[inline] pub fn remove(&mut self, key: &K) -> Option { if let Some(v) = self.map.remove(key) { @@ -313,6 +399,12 @@ where } } + /// Get an item by key without promoting the item. + #[inline] + pub fn get_no_promote(&self, key: &K) -> Option<&V> { + self.map.get(key).map(|v| &v.value) + } + #[inline] pub fn get_mut(&mut self, key: &K) -> Option<&mut V> { match self.map.get_mut(key) { @@ -355,17 +447,37 @@ where } } -unsafe impl Send for LruCache +impl GetTailEntry for LruCache +where + K: Eq + Hash + Clone + std::fmt::Debug, + T: SizePolicy, + E: EvictPolicy, +{ + fn get_tail_entry(&self) -> Option<(&K, &V)> { + if self.is_empty() { + return None; + } + + let k = self.trace.get_tail(); + self.map + .get_key_value(k) + .map(|(k, entry)| (k, &entry.value)) + } +} + +unsafe impl Send for LruCache where K: Send, V: Send, T: Send + SizePolicy, + E: Send + EvictPolicy, { } -impl Drop for LruCache +impl Drop for LruCache where T: SizePolicy, + E: EvictPolicy, { fn drop(&mut self) { self.clear(); @@ -626,4 +738,61 @@ mod tests { assert!(cache.size() <= 42); } } + + #[test] + fn test_get_no_promote() { + let mut cache = LruCache::with_capacity_sample_and_trace(3, 0, CountTracker::default()); + cache.insert(1, 1); + cache.insert(2, 2); + cache.insert(3, 3); + assert_eq!(cache.size(), 3); + assert_eq!(*cache.get_no_promote(&1).unwrap(), 1); + cache.insert(4, 4); + assert_eq!(cache.size(), 3); + // Key 1 is not promoted, so it's popped out first. + assert!(cache.get_no_promote(&1).is_none()); + // Other entries are not affected. + assert_eq!(*cache.get_no_promote(&2).unwrap(), 2); + assert_eq!(*cache.get_no_promote(&3).unwrap(), 3); + assert_eq!(*cache.get_no_promote(&4).unwrap(), 4); + } + + #[test] + fn test_insert_if_not_exist() { + let mut cache = LruCache::with_capacity_sample_and_trace(4, 0, CountTracker::default()); + assert!(cache.insert_if_not_exist(1, 1)); + assert!(cache.insert_if_not_exist(2, 2)); + assert!(cache.insert_if_not_exist(3, 3)); + assert_eq!(cache.size(), 3); + assert_eq!(*cache.get_no_promote(&1).unwrap(), 1); + assert_eq!(*cache.get_no_promote(&2).unwrap(), 2); + assert_eq!(*cache.get_no_promote(&3).unwrap(), 3); + + assert!(!cache.insert_if_not_exist(1, 11)); + // Not updated. + assert_eq!(*cache.get_no_promote(&1).unwrap(), 1); + + assert!(cache.insert_if_not_exist(4, 4)); + assert!(!cache.insert_if_not_exist(2, 22)); + // Not updated. + assert_eq!(*cache.get_no_promote(&2).unwrap(), 2); + + assert_eq!(cache.size(), 4); + assert!(cache.insert_if_not_exist(5, 5)); + assert_eq!(cache.size(), 4); + // key 1 is not promoted, so it's first popped out. + assert!(cache.get_no_promote(&1).is_none()); + assert_eq!(*cache.get_no_promote(&2).unwrap(), 2); + + assert!(cache.insert_if_not_exist(6, 6)); + assert_eq!(cache.size(), 4); + // key 2 is not promoted either, so it's first popped out. 
+ assert!(cache.get_no_promote(&2).is_none()); + assert_eq!(*cache.get_no_promote(&3).unwrap(), 3); + + assert!(cache.insert_if_not_exist(7, 7)); + assert_eq!(cache.size(), 4); + assert!(cache.get_no_promote(&3).is_none()); + assert_eq!(*cache.get_no_promote(&4).unwrap(), 4); + } } diff --git a/components/tikv_util/src/memory.rs b/components/tikv_util/src/memory.rs index 0a2f49461c5..c216fa84b8e 100644 --- a/components/tikv_util/src/memory.rs +++ b/components/tikv_util/src/memory.rs @@ -1,10 +1,18 @@ // Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. -use std::mem; +use std::{ + mem, + sync::{ + atomic::{AtomicUsize, Ordering}, + Arc, + }, +}; +use collections::HashMap; use kvproto::{ + coprocessor as coppb, encryptionpb::EncryptionMeta, - kvrpcpb::LockInfo, + kvrpcpb::{self, LockInfo}, metapb::{Peer, Region, RegionEpoch}, raft_cmdpb::{self, RaftCmdRequest, ReadIndexRequest}, }; @@ -23,13 +31,74 @@ pub unsafe fn vec_transmute(from: Vec) -> Vec { } pub trait HeapSize { - fn heap_size(&self) -> usize { + fn approximate_heap_size(&self) -> usize { 0 } + + fn approximate_mem_size(&self) -> usize + where + Self: Sized, + { + mem::size_of::() + self.approximate_heap_size() + } +} + +macro_rules! impl_zero_heap_size{ + ( $($typ: ty,)+ ) => { + $( + impl HeapSize for $typ { + fn approximate_heap_size(&self) -> usize { 0 } + } + )+ + } +} +impl_zero_heap_size! { + bool, u8, u64, +} +// Do not impl HeapSize for [T], because type coercions make it error-prone. +// E.g., Vec[u8] may be casted to &[u8] which does not own any byte in heap. +impl HeapSize for Vec { + fn approximate_heap_size(&self) -> usize { + let cap_bytes = self.capacity() * std::mem::size_of::(); + if self.is_empty() { + cap_bytes + } else { + // Prefer an approximation of its actually heap size, because we + // want the time complexity to be O(1). + self.len() * self[0].approximate_heap_size() + cap_bytes + } + } +} +impl HeapSize for (A, B) { + fn approximate_heap_size(&self) -> usize { + self.0.approximate_heap_size() + self.1.approximate_heap_size() + } +} +impl HeapSize for Option { + fn approximate_heap_size(&self) -> usize { + match self { + Some(t) => t.approximate_heap_size(), + None => 0, + } + } +} + +impl HeapSize for HashMap { + fn approximate_heap_size(&self) -> usize { + let cap_bytes = self.capacity() * (mem::size_of::() + mem::size_of::()); + if self.is_empty() { + cap_bytes + } else { + let kv = self.iter().next().unwrap(); + // Prefer an approximation of its actually heap size, because we + // want the time complexity to be O(1). 
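// Editor's aside — the `HeapSize` impls above compose, so a type built from
// `Vec`, `Option`, tuples, and scalars only needs to sum its fields. A
// minimal sketch with a hypothetical type (not in the patch):
struct RangeSet {
    tag: Vec<u8>,
    ranges: Vec<(Vec<u8>, Vec<u8>)>,
}

impl HeapSize for RangeSet {
    fn approximate_heap_size(&self) -> usize {
        // Stays O(1): the `Vec` impl samples only the first element
        // instead of walking the whole collection.
        self.tag.approximate_heap_size() + self.ranges.approximate_heap_size()
    }
}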
+ cap_bytes + self.len() * (kv.0.approximate_heap_size() + kv.1.approximate_heap_size()) + } + } } impl HeapSize for Region { - fn heap_size(&self) -> usize { + fn approximate_heap_size(&self) -> usize { let mut size = self.start_key.capacity() + self.end_key.capacity(); size += mem::size_of::(); size += self.peers.capacity() * mem::size_of::(); @@ -41,7 +110,7 @@ impl HeapSize for Region { } impl HeapSize for ReadIndexRequest { - fn heap_size(&self) -> usize { + fn approximate_heap_size(&self) -> usize { self.key_ranges .iter() .map(|r| r.start_key.capacity() + r.end_key.capacity()) @@ -50,7 +119,7 @@ impl HeapSize for ReadIndexRequest { } impl HeapSize for LockInfo { - fn heap_size(&self) -> usize { + fn approximate_heap_size(&self) -> usize { self.primary_lock.capacity() + self.key.capacity() + self.secondaries.iter().map(|k| k.len()).sum::() @@ -58,10 +127,187 @@ impl HeapSize for LockInfo { } impl HeapSize for RaftCmdRequest { - fn heap_size(&self) -> usize { + fn approximate_heap_size(&self) -> usize { mem::size_of::() + self.requests.capacity() * mem::size_of::() + mem::size_of_val(&self.admin_request) + mem::size_of_val(&self.status_request) } } + +impl HeapSize for coppb::KeyRange { + fn approximate_heap_size(&self) -> usize { + self.start.capacity() + self.end.capacity() + } +} + +impl HeapSize for kvrpcpb::Context { + fn approximate_heap_size(&self) -> usize { + self.resolved_locks.capacity() * mem::size_of::() + + self.committed_locks.capacity() * mem::size_of::() + + self.resource_group_tag.capacity() + + self.request_source.as_bytes().len() + + self + .get_resource_control_context() + .resource_group_name + .as_bytes() + .len() + + self.get_source_stmt().session_alias.as_bytes().len() + } +} + +#[derive(Debug)] +pub struct MemoryQuotaExceeded; + +impl std::error::Error for MemoryQuotaExceeded {} + +impl_display_as_debug!(MemoryQuotaExceeded); + +pub struct MemoryQuota { + in_use: AtomicUsize, + capacity: AtomicUsize, +} + +pub struct OwnedAllocated { + allocated: usize, + from: Arc, +} + +impl OwnedAllocated { + pub fn new(target: Arc) -> Self { + Self { + allocated: 0, + from: target, + } + } + + pub fn alloc(&mut self, bytes: usize) -> Result<(), MemoryQuotaExceeded> { + self.from.alloc(bytes)?; + self.allocated += bytes; + Ok(()) + } +} + +impl Drop for OwnedAllocated { + fn drop(&mut self) { + self.from.free(self.allocated) + } +} + +impl MemoryQuota { + pub fn new(capacity: usize) -> MemoryQuota { + MemoryQuota { + in_use: AtomicUsize::new(0), + capacity: AtomicUsize::new(capacity), + } + } + + pub fn in_use(&self) -> usize { + self.in_use.load(Ordering::Relaxed) + } + + pub fn capacity(&self) -> usize { + self.capacity.load(Ordering::Relaxed) + } + + pub fn set_capacity(&self, capacity: usize) { + self.capacity.store(capacity, Ordering::Relaxed); + } + + pub fn alloc(&self, bytes: usize) -> Result<(), MemoryQuotaExceeded> { + let capacity = self.capacity.load(Ordering::Relaxed); + let mut in_use_bytes = self.in_use.load(Ordering::Relaxed); + loop { + if in_use_bytes + bytes > capacity { + return Err(MemoryQuotaExceeded); + } + let new_in_use_bytes = in_use_bytes + bytes; + match self.in_use.compare_exchange_weak( + in_use_bytes, + new_in_use_bytes, + Ordering::Relaxed, + Ordering::Relaxed, + ) { + Ok(_) => return Ok(()), + Err(current) => in_use_bytes = current, + } + } + } + + pub fn free(&self, bytes: usize) { + let mut in_use_bytes = self.in_use.load(Ordering::Relaxed); + loop { + // Saturating at the numeric bounds instead of overflowing. 
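// Editor's aside — the subtraction below is `in_use_bytes.saturating_sub(bytes)`
// spelled out: if a caller frees more than it allocated, `in_use` clamps to 0
// rather than wrapping the `AtomicUsize` around to `usize::MAX`.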
+ let new_in_use_bytes = in_use_bytes - std::cmp::min(bytes, in_use_bytes); + match self.in_use.compare_exchange_weak( + in_use_bytes, + new_in_use_bytes, + Ordering::Relaxed, + Ordering::Relaxed, + ) { + Ok(_) => return, + Err(current) => in_use_bytes = current, + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_memory_quota() { + let quota = MemoryQuota::new(100); + quota.alloc(10).unwrap(); + assert_eq!(quota.in_use(), 10); + quota.alloc(100).unwrap_err(); + assert_eq!(quota.in_use(), 10); + quota.free(5); + assert_eq!(quota.in_use(), 5); + quota.alloc(95).unwrap(); + assert_eq!(quota.in_use(), 100); + quota.free(95); + assert_eq!(quota.in_use(), 5); + } + + #[test] + fn test_resize_memory_quota() { + let quota = MemoryQuota::new(100); + quota.alloc(10).unwrap(); + assert_eq!(quota.in_use(), 10); + quota.alloc(100).unwrap_err(); + assert_eq!(quota.in_use(), 10); + quota.set_capacity(200); + quota.alloc(100).unwrap(); + assert_eq!(quota.in_use(), 110); + quota.set_capacity(50); + quota.alloc(100).unwrap_err(); + assert_eq!(quota.in_use(), 110); + quota.free(100); + assert_eq!(quota.in_use(), 10); + quota.alloc(40).unwrap(); + assert_eq!(quota.in_use(), 50); + } + + #[test] + fn test_allocated() { + let quota = Arc::new(MemoryQuota::new(100)); + let mut allocated = OwnedAllocated::new(Arc::clone("a)); + allocated.alloc(42).unwrap(); + assert_eq!(quota.in_use(), 42); + quota.alloc(59).unwrap_err(); + allocated.alloc(16).unwrap(); + assert_eq!(quota.in_use(), 58); + let mut allocated2 = OwnedAllocated::new(Arc::clone("a)); + allocated2.alloc(8).unwrap(); + allocated2.alloc(40).unwrap_err(); + assert_eq!(quota.in_use(), 66); + quota.alloc(4).unwrap(); + assert_eq!(quota.in_use(), 70); + drop(allocated); + assert_eq!(quota.in_use(), 12); + drop(allocated2); + assert_eq!(quota.in_use(), 4); + } +} diff --git a/components/tikv_util/src/mpsc/mod.rs b/components/tikv_util/src/mpsc/mod.rs index 700691f1189..9a71dbc0c5e 100644 --- a/components/tikv_util/src/mpsc/mod.rs +++ b/components/tikv_util/src/mpsc/mod.rs @@ -8,9 +8,8 @@ pub mod future; pub mod priority_queue; use std::{ - cell::Cell, sync::{ - atomic::{AtomicBool, AtomicIsize, Ordering}, + atomic::{AtomicBool, AtomicIsize, AtomicUsize, Ordering}, Arc, }, time::Duration, @@ -208,7 +207,7 @@ const CHECK_INTERVAL: usize = 8; /// A sender of channel that limits the maximun pending messages count loosely. pub struct LooseBoundedSender { sender: Sender, - tried_cnt: Cell, + tried_cnt: AtomicUsize, limit: usize, } @@ -230,25 +229,23 @@ impl LooseBoundedSender { /// Send a message regardless its capacity limit. #[inline] pub fn force_send(&self, t: T) -> Result<(), SendError> { - let cnt = self.tried_cnt.get(); - self.tried_cnt.set(cnt + 1); + self.tried_cnt.fetch_add(1, Ordering::AcqRel); self.sender.send(t) } /// Attempts to send a message into the channel without blocking. 
#[inline] pub fn try_send(&self, t: T) -> Result<(), TrySendError> { - let cnt = self.tried_cnt.get(); let check_interval = || { fail_point!("loose_bounded_sender_check_interval", |_| 0); CHECK_INTERVAL }; - if cnt < check_interval() { - self.tried_cnt.set(cnt + 1); - } else if self.len() < self.limit { - self.tried_cnt.set(1); - } else { - return Err(TrySendError::Full(t)); + if self.tried_cnt.fetch_add(1, Ordering::AcqRel) >= check_interval() { + if self.len() < self.limit { + self.tried_cnt.store(1, Ordering::Release); + } else { + return Err(TrySendError::Full(t)); + } } match self.sender.send(t) { @@ -275,7 +272,7 @@ impl Clone for LooseBoundedSender { fn clone(&self) -> LooseBoundedSender { LooseBoundedSender { sender: self.sender.clone(), - tried_cnt: self.tried_cnt.clone(), + tried_cnt: AtomicUsize::new(0), limit: self.limit, } } @@ -287,7 +284,7 @@ pub fn loose_bounded(cap: usize) -> (LooseBoundedSender, Receiver) { ( LooseBoundedSender { sender, - tried_cnt: Cell::new(0), + tried_cnt: AtomicUsize::new(0), limit: cap, }, receiver, diff --git a/components/tikv_util/src/slow_score.rs b/components/tikv_util/src/slow_score.rs new file mode 100644 index 00000000000..676dbef4efa --- /dev/null +++ b/components/tikv_util/src/slow_score.rs @@ -0,0 +1,287 @@ +// Copyright 2024 TiKV Project Authors. Licensed under Apache-2.0. + +use std::{ + cmp, + time::{Duration, Instant}, +}; + +use ordered_float::OrderedFloat; + +/// The result of a tick of the slow score. +pub struct SlowScoreTickResult { + pub tick_id: u64, + // None if skipped in this tick + pub updated_score: Option, + pub has_new_record: bool, + pub should_force_report_slow_store: bool, +} + +/// Interval for updating the slow score. +const UPDATE_INTERVALS: Duration = Duration::from_secs(10); +/// Recovery intervals for the slow score. +/// If the score has reached 100 and there is no timeout inspecting requests +/// during this interval, the score will go back to 1 after 5min. +const RECOVERY_INTERVALS: Duration = Duration::from_secs(60 * 5); +// Slow score is a value that represents the speed of a store and ranges in [1, +// 100]. It is maintained in the AIMD way. +// If there are some inspecting requests timeout during a round, by default the +// score will be increased at most 1x when above 10% inspecting requests +// timeout. If there is not any timeout inspecting requests, the score will go +// back to 1 in after 5min. +pub struct SlowScore { + value: OrderedFloat, + last_record_time: Instant, + last_update_time: Instant, + + timeout_requests: usize, + total_requests: usize, + + inspect_interval: Duration, + // The maximal tolerated timeout ratio. + ratio_thresh: OrderedFloat, + // Minimal time that the score could be decreased from 100 to 1. + min_ttr: Duration, + + // After how many ticks the value need to be updated. + round_ticks: u64, + // Identify every ticks. + last_tick_id: u64, + // If the last tick does not finished, it would be recorded as a timeout. + last_tick_finished: bool, +} + +impl SlowScore { + pub fn new(inspect_interval: Duration) -> SlowScore { + SlowScore { + value: OrderedFloat(1.0), + + timeout_requests: 0, + total_requests: 0, + + inspect_interval, + ratio_thresh: OrderedFloat(0.1), + min_ttr: RECOVERY_INTERVALS, + last_record_time: Instant::now(), + last_update_time: Instant::now(), + round_ticks: 30, + last_tick_id: 0, + last_tick_finished: true, + } + } + + // Only for kvdb. 
+ pub fn new_with_extra_config(inspect_interval: Duration, timeout_ratio: f64) -> SlowScore { + SlowScore { + value: OrderedFloat(1.0), + + timeout_requests: 0, + total_requests: 0, + + inspect_interval, + ratio_thresh: OrderedFloat(timeout_ratio), + min_ttr: RECOVERY_INTERVALS, + last_record_time: Instant::now(), + last_update_time: Instant::now(), + // The minimal round ticks is 1 for kvdb. + round_ticks: cmp::max( + UPDATE_INTERVALS.div_duration_f64(inspect_interval) as u64, + 1_u64, + ), + last_tick_id: 0, + last_tick_finished: true, + } + } + + pub fn record(&mut self, id: u64, duration: Duration, not_busy: bool) { + self.last_record_time = Instant::now(); + if id != self.last_tick_id { + return; + } + self.last_tick_finished = true; + self.total_requests += 1; + if not_busy && duration >= self.inspect_interval { + self.timeout_requests += 1; + } + } + + pub fn record_timeout(&mut self) { + self.last_tick_finished = true; + self.total_requests += 1; + self.timeout_requests += 1; + } + + pub fn update(&mut self) -> f64 { + let elapsed = self.last_update_time.elapsed(); + self.update_impl(elapsed).into() + } + + pub fn get(&self) -> f64 { + self.value.into() + } + + // Update the score in a AIMD way. + fn update_impl(&mut self, elapsed: Duration) -> OrderedFloat { + if self.timeout_requests == 0 { + let desc = 100.0 * (elapsed.as_millis() as f64 / self.min_ttr.as_millis() as f64); + if OrderedFloat(desc) > self.value - OrderedFloat(1.0) { + self.value = 1.0.into(); + } else { + self.value -= desc; + } + } else { + let timeout_ratio = self.timeout_requests as f64 / self.total_requests as f64; + let near_thresh = + cmp::min(OrderedFloat(timeout_ratio), self.ratio_thresh) / self.ratio_thresh; + let value = self.value * (OrderedFloat(1.0) + near_thresh); + self.value = cmp::min(OrderedFloat(100.0), value); + } + + self.total_requests = 0; + self.timeout_requests = 0; + self.last_update_time = Instant::now(); + self.value + } + + pub fn should_force_report_slow_store(&self) -> bool { + self.value >= OrderedFloat(100.0) && (self.last_tick_id % self.round_ticks == 0) + } + + pub fn get_inspect_interval(&self) -> Duration { + self.inspect_interval + } + + pub fn last_tick_finished(&self) -> bool { + self.last_tick_finished + } + + pub fn tick(&mut self) -> SlowScoreTickResult { + let should_force_report_slow_store = self.should_force_report_slow_store(); + + let id = self.last_tick_id + 1; + self.last_tick_id += 1; + self.last_tick_finished = false; + + let (updated_score, has_new_record) = if self.last_tick_id % self.round_ticks == 0 { + // `last_update_time` is refreshed every round. If no update happens in a whole + // round, we set the status to unknown. 
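// Editor's aside — `round_ticks` is how many inspection ticks make up one
// scoring round: `new` fixes it at 30, while `new_with_extra_config`
// derives it as UPDATE_INTERVALS / inspect_interval, clamped to at least 1.
// E.g. a 1s inspect interval gives 10 ticks per round; an 11s interval
// would compute 0 and clamps to 1, as the "too large inspect interval"
// test below asserts.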
+ let has_new_record = self.last_record_time >= self.last_update_time; + let slow_score = self.update(); + (Some(slow_score), has_new_record) + } else { + (None, false) + }; + + SlowScoreTickResult { + tick_id: id, + updated_score, + has_new_record, + should_force_report_slow_store, + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_slow_score() { + let mut slow_score = SlowScore::new(Duration::from_millis(500)); + slow_score.timeout_requests = 5; + slow_score.total_requests = 100; + assert_eq!( + OrderedFloat(1.5), + slow_score.update_impl(Duration::from_secs(10)) + ); + + slow_score.timeout_requests = 10; + slow_score.total_requests = 100; + assert_eq!( + OrderedFloat(3.0), + slow_score.update_impl(Duration::from_secs(10)) + ); + + slow_score.timeout_requests = 20; + slow_score.total_requests = 100; + assert_eq!( + OrderedFloat(6.0), + slow_score.update_impl(Duration::from_secs(10)) + ); + + slow_score.timeout_requests = 100; + slow_score.total_requests = 100; + assert_eq!( + OrderedFloat(12.0), + slow_score.update_impl(Duration::from_secs(10)) + ); + + slow_score.timeout_requests = 11; + slow_score.total_requests = 100; + assert_eq!( + OrderedFloat(24.0), + slow_score.update_impl(Duration::from_secs(10)) + ); + + slow_score.timeout_requests = 0; + slow_score.total_requests = 100; + assert_eq!( + OrderedFloat(19.0), + slow_score.update_impl(Duration::from_secs(15)) + ); + + slow_score.timeout_requests = 0; + slow_score.total_requests = 100; + assert_eq!( + OrderedFloat(1.0), + slow_score.update_impl(Duration::from_secs(57)) + ); + } + + #[test] + fn test_slow_score_extra() { + let mut slow_score = SlowScore::new_with_extra_config(Duration::from_millis(1000), 0.6); + slow_score.timeout_requests = 1; + slow_score.total_requests = 10; + let score = slow_score.update_impl(Duration::from_secs(10)); + assert!(score > OrderedFloat(1.16)); + assert!(score < OrderedFloat(1.17)); + + slow_score.timeout_requests = 2; + slow_score.total_requests = 10; + let score = slow_score.update_impl(Duration::from_secs(10)); + assert!(score > OrderedFloat(1.5)); + assert!(score < OrderedFloat(1.6)); + + slow_score.timeout_requests = 0; + slow_score.total_requests = 100; + assert_eq!( + OrderedFloat(1.0), + slow_score.update_impl(Duration::from_secs(57)) + ); + + slow_score.timeout_requests = 3; + slow_score.total_requests = 10; + assert_eq!( + OrderedFloat(1.5), + slow_score.update_impl(Duration::from_secs(10)) + ); + + slow_score.timeout_requests = 6; + slow_score.total_requests = 10; + assert_eq!( + OrderedFloat(3.0), + slow_score.update_impl(Duration::from_secs(10)) + ); + + slow_score.timeout_requests = 10; + slow_score.total_requests = 10; + assert_eq!( + OrderedFloat(6.0), + slow_score.update_impl(Duration::from_secs(10)) + ); + + // Test too large inspect interval. + let slow_score = SlowScore::new_with_extra_config(Duration::from_secs(11), 0.1); + assert_eq!(slow_score.round_ticks, 1); + } +} diff --git a/components/tikv_util/src/sys/disk.rs b/components/tikv_util/src/sys/disk.rs index c8fe87a56b0..5918bdd8e3b 100644 --- a/components/tikv_util/src/sys/disk.rs +++ b/components/tikv_util/src/sys/disk.rs @@ -1,5 +1,8 @@ // Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. 
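// Editor's aside — a worked example of the AIMD update in `update_impl`
// above, with the default ratio_thresh = 0.1 and min_ttr = 300s:
//
//   * multiplicative increase: 5 timeouts out of 100 requests gives
//     near_thresh = min(0.05, 0.1) / 0.1 = 0.5, so the score is multiplied
//     by 1.5 (1.0 -> 1.5, the first test case); 10/100 saturates
//     near_thresh at 1.0, doubling the score, capped at 100.
//   * additive decrease: a timeout-free round subtracts
//     100 * elapsed / min_ttr, e.g. 15s subtracts 5 points (24.0 -> 19.0
//     in the tests), so a score of 100 recovers to 1 in about 5 minutes.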
-use std::sync::atomic::{AtomicI32, AtomicU64, Ordering}; +use std::{ + path::Path, + sync::atomic::{AtomicI32, AtomicU64, Ordering}, +}; use fail::fail_point; pub use kvproto::disk_usage::DiskUsage; @@ -78,3 +81,16 @@ pub fn get_disk_status(_store_id: u64) -> DiskUsage { _ => panic!("Disk Status Value not meet expectations"), } } + +pub fn get_disk_space_stats>(path: P) -> std::io::Result<(u64, u64)> { + fail_point!("mock_disk_space_stats", |stats| { + let stats = stats.unwrap(); + let values = stats.split(',').collect::>(); + Ok(( + values[0].parse::().unwrap(), + values[1].parse::().unwrap(), + )) + }); + let disk_stats = fs2::statvfs(path)?; + Ok((disk_stats.total_space(), disk_stats.available_space())) +} diff --git a/components/tikv_util/src/sys/mod.rs b/components/tikv_util/src/sys/mod.rs index 797da2aea54..3afa8cd724c 100644 --- a/components/tikv_util/src/sys/mod.rs +++ b/components/tikv_util/src/sys/mod.rs @@ -195,10 +195,11 @@ pub fn path_in_diff_mount_point(_path1: impl AsRef, _path2: impl AsRef 0); + assert!(available > 0); + assert!(capacity >= available); + + disk::get_disk_space_stats("/non-exist-path").unwrap_err(); + } } diff --git a/components/tikv_util/src/time.rs b/components/tikv_util/src/time.rs index 8594379a9bd..f329247c563 100644 --- a/components/tikv_util/src/time.rs +++ b/components/tikv_util/src/time.rs @@ -200,10 +200,9 @@ impl Drop for Monitor { } } -use self::inner::monotonic_coarse_now; -pub use self::inner::monotonic_now; /// Returns the monotonic raw time since some unspecified starting point. pub use self::inner::monotonic_raw_now; +pub use self::inner::{monotonic_coarse_now, monotonic_now}; use crate::sys::thread::StdThreadBuildWrapper; const NANOSECONDS_PER_SECOND: u64 = 1_000_000_000; diff --git a/components/tikv_util/src/worker/pool.rs b/components/tikv_util/src/worker/pool.rs index c3919e42619..a22732a7aae 100644 --- a/components/tikv_util/src/worker/pool.rs +++ b/components/tikv_util/src/worker/pool.rs @@ -7,7 +7,7 @@ use std::{ future::Future, sync::{ atomic::{AtomicBool, AtomicUsize, Ordering}, - Arc, Mutex, + Arc, }, time::{Duration, Instant}, }; @@ -20,13 +20,13 @@ use futures::{ stream::StreamExt, }; use prometheus::IntGauge; -use yatp::{Remote, ThreadPool}; +use yatp::Remote; use super::metrics::*; use crate::{ future::{block_on_timeout, poll_future_notify}, timer::GLOBAL_TIMER_HANDLE, - yatp_pool::{DefaultTicker, YatpPoolBuilder}, + yatp_pool::{DefaultTicker, FuturePool, YatpPoolBuilder}, }; #[derive(PartialEq)] @@ -222,7 +222,15 @@ impl LazyWorker { } pub fn remote(&self) -> Remote { - self.worker.remote.clone() + self.worker.remote() + } + + pub fn pool_size(&self) -> usize { + self.worker.pool_size() + } + + pub fn pool(&self) -> FuturePool { + self.worker.pool() } } @@ -301,11 +309,8 @@ impl> Builder { let pool = YatpPoolBuilder::new(DefaultTicker::default()) .name_prefix(self.name) .thread_count(self.thread_count, self.thread_count, self.thread_count) - .build_single_level_pool(); - let remote = pool.remote().clone(); - let pool = Arc::new(Mutex::new(Some(pool))); + .build_future_pool(); Worker { - remote, stop: Arc::new(AtomicBool::new(false)), pool, counter: Arc::new(AtomicUsize::new(0)), @@ -318,8 +323,7 @@ impl> Builder { /// A worker that can schedule time consuming tasks. 
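// Editor's aside — `Worker` (below) now wraps a `FuturePool` rather than a
// bare yatp `Remote`, so the pool can be shared without owning the worker.
// A minimal usage sketch; the task body is hypothetical:
//
//     let worker = Builder::new("background").thread_count(2).create();
//     assert_eq!(worker.pool_size(), 2);
//     let pool = worker.pool();     // a clone of the underlying FuturePool
//     let _ = pool.spawn(async { /* background work */ });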
#[derive(Clone)] pub struct Worker { - pool: Arc>>>, - remote: Remote, + pool: FuturePool, pending_capacity: usize, counter: Arc, stop: Arc, @@ -371,7 +375,7 @@ impl Worker { .interval(std::time::Instant::now(), interval) .compat(); let stop = self.stop.clone(); - self.remote.spawn(async move { + let _ = self.pool.spawn(async move { while !stop.load(Ordering::Relaxed) && let Some(Ok(_)) = interval.next().await { @@ -389,7 +393,7 @@ impl Worker { .interval(std::time::Instant::now(), interval) .compat(); let stop = self.stop.clone(); - self.remote.spawn(async move { + let _ = self.pool.spawn(async move { while !stop.load(Ordering::Relaxed) && let Some(Ok(_)) = interval.next().await { @@ -403,7 +407,7 @@ impl Worker { where F: Future + Send + 'static, { - self.remote.spawn(f); + let _ = self.pool.spawn(f); } fn delay_notify(tx: UnboundedSender>, timeout: Duration) { @@ -438,10 +442,8 @@ impl Worker { /// Stops the worker thread. pub fn stop(&self) { - if let Some(pool) = self.pool.lock().unwrap().take() { - self.stop.store(true, Ordering::Release); - pool.shutdown(); - } + self.stop.store(true, Ordering::Release); + self.pool.shutdown(); } /// Checks if underlying worker can't handle task immediately. @@ -451,7 +453,15 @@ impl Worker { } pub fn remote(&self) -> Remote { - self.remote.clone() + self.pool.remote().clone() + } + + pub fn pool_size(&self) -> usize { + self.pool.get_pool_size() + } + + pub fn pool(&self) -> FuturePool { + self.pool.clone() } fn start_impl( @@ -461,7 +471,7 @@ impl Worker { metrics_pending_task_count: IntGauge, ) { let counter = self.counter.clone(); - self.remote.spawn(async move { + let _ = self.pool.spawn(async move { let mut handle = RunnableWrapper { inner: runner }; while let Some(msg) = receiver.next().await { match msg { @@ -488,7 +498,7 @@ impl Worker { let counter = self.counter.clone(); let timeout = runner.get_interval(); Self::delay_notify(tx.clone(), timeout); - self.remote.spawn(async move { + let _ = self.pool.spawn(async move { let mut handle = RunnableWrapper { inner: runner }; while let Some(msg) = receiver.next().await { match msg { diff --git a/components/tikv_util/src/yatp_pool/future_pool.rs b/components/tikv_util/src/yatp_pool/future_pool.rs index 827ffbbdce2..c65219ffbb1 100644 --- a/components/tikv_util/src/yatp_pool/future_pool.rs +++ b/components/tikv_util/src/yatp_pool/future_pool.rs @@ -13,6 +13,7 @@ use std::{ use fail::fail_point; use futures::channel::oneshot::{self, Canceled}; +use futures_util::future::FutureExt; use prometheus::{IntCounter, IntGauge}; use tracker::TrackedFuture; use yatp::{queue::Extras, task::future}; @@ -119,6 +120,11 @@ impl FuturePool { pub fn shutdown(&self) { self.inner.pool.shutdown(); } + + // Get a remote queue for spawning tasks without owning the thread pool. + pub fn remote(&self) -> &yatp::Remote { + self.inner.pool.remote() + } } struct PoolInner { @@ -174,11 +180,13 @@ impl PoolInner { metrics_running_task_count.inc(); - let f = async move { - let _ = future.await; + // NB: Prefer FutureExt::map to async block, because an async block + // doubles memory usage. 
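// Editor's aside — expanding the NB above: an `async` block moves the inner
// future into a fresh generator, so the future's state can end up stored
// twice (once captured, once in the generator frame), while `Map` wraps it
// in place. A sketch mirroring the change below (`counter` is hypothetical):
//
//     // before: generator captures `future`, roughly doubling its size
//     let f = async move { let _ = future.await; counter.inc(); };
//     // after: the adapter adds only the closure's size
//     let f = future.map(move |_| counter.inc());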
+ // See https://github.com/rust-lang/rust/issues/59087 + let f = future.map(move |_| { metrics_handled_task_count.inc(); metrics_running_task_count.dec(); - }; + }); if let Some(extras) = extras { self.pool.spawn(future::TaskCell::new(f, extras)); @@ -203,12 +211,14 @@ impl PoolInner { let (tx, rx) = oneshot::channel(); metrics_running_task_count.inc(); - self.pool.spawn(async move { - let res = future.await; + // NB: Prefer FutureExt::map to async block, because an async block + // doubles memory usage. + // See https://github.com/rust-lang/rust/issues/59087 + self.pool.spawn(future.map(move |res| { metrics_handled_task_count.inc(); metrics_running_task_count.dec(); let _ = tx.send(res); - }); + })); Ok(rx) } } diff --git a/components/tikv_util/src/yatp_pool/metrics.rs b/components/tikv_util/src/yatp_pool/metrics.rs index 8ae1aa8910e..efb1379dcc7 100644 --- a/components/tikv_util/src/yatp_pool/metrics.rs +++ b/components/tikv_util/src/yatp_pool/metrics.rs @@ -19,8 +19,8 @@ lazy_static! { pub static ref YATP_POOL_SCHEDULE_WAIT_DURATION_VEC: HistogramVec = register_histogram_vec!( "tikv_yatp_pool_schedule_wait_duration", "Histogram of yatp pool schedule wait duration.", - &["name"], - exponential_buckets(1e-5, 4.0, 12).unwrap() // 10us ~ 41s + &["name", "priority"], + exponential_buckets(1e-5, 2.0, 18).unwrap() // 10us ~ 2.5s ) .unwrap(); } diff --git a/components/tikv_util/src/yatp_pool/mod.rs b/components/tikv_util/src/yatp_pool/mod.rs index fc80e69cd84..b4a3d3c0825 100644 --- a/components/tikv_util/src/yatp_pool/mod.rs +++ b/components/tikv_util/src/yatp_pool/mod.rs @@ -1,14 +1,14 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. mod future_pool; -mod metrics; +pub mod metrics; use std::sync::Arc; use fail::fail_point; pub use future_pool::{Full, FuturePool}; use futures::{compat::Stream01CompatExt, StreamExt}; -use prometheus::{local::LocalHistogram, Histogram}; +use prometheus::{local::LocalHistogram, Histogram, HistogramOpts}; use yatp::{ pool::{CloneRunnerBuilder, Local, Remote, Runner}, queue::{multilevel, priority, Extras, QueueType, TaskCell as _}, @@ -165,7 +165,10 @@ pub struct YatpPoolRunner { before_pause: Option>, // Statistics about the schedule wait duration. - schedule_wait_duration: LocalHistogram, + // local histogram for high,medium,low priority tasks. + schedule_wait_durations: [LocalHistogram; 3], + // return the index of `schedule_wait_durations` from task metadata. 
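// Editor's aside — wiring up the per-priority wait metrics introduced
// above. The index function maps yatp task metadata (`&[u8]`) to 0/1/2 for
// the "high"/"medium"/"low" histogram labels; the metadata encoding and
// pool name here are hypothetical, only the builder calls come from this
// patch, and the elided setter type is assumed to be
// `Arc<dyn Fn(&[u8]) -> usize + Send + Sync>`:
//
//     let pool = YatpPoolBuilder::new(DefaultTicker::default())
//         .name_prefix("unified-read")
//         .enable_task_wait_metrics()
//         .metric_idx_from_task_meta(Arc::new(|meta: &[u8]| {
//             match meta.first().copied() {
//                 Some(0) => 0,
//                 Some(1) => 1,
//                 _ => 2,
//             }
//         }))
//         .build_future_pool();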
+ metric_idx_from_task_meta: Arc usize + Send + Sync>, } impl Runner for YatpPoolRunner { @@ -190,12 +193,12 @@ impl Runner for YatpPoolRunner { fn handle(&mut self, local: &mut Local, mut task_cell: Self::TaskCell) -> bool { let extras = task_cell.mut_extras(); if let Some(schedule_time) = extras.schedule_time() { - self.schedule_wait_duration - .observe(schedule_time.elapsed().as_secs_f64()); + let idx = (*self.metric_idx_from_task_meta)(extras.metadata()); + self.schedule_wait_durations[idx].observe(schedule_time.elapsed().as_secs_f64()); } let finished = self.inner.handle(local, task_cell); if self.ticker.try_tick() { - self.schedule_wait_duration.flush(); + self.schedule_wait_durations.iter().for_each(|m| m.flush()); } finished } @@ -229,7 +232,8 @@ impl YatpPoolRunner { after_start: Option>, before_stop: Option>, before_pause: Option>, - schedule_wait_duration: Histogram, + schedule_wait_durations: [Histogram; 3], + metric_idx_from_task_meta: Arc usize + Send + Sync>, ) -> Self { YatpPoolRunner { inner, @@ -238,7 +242,8 @@ impl YatpPoolRunner { after_start, before_stop, before_pause, - schedule_wait_duration: schedule_wait_duration.local(), + schedule_wait_durations: schedule_wait_durations.map(|m| m.local()), + metric_idx_from_task_meta, } } } @@ -256,6 +261,10 @@ pub struct YatpPoolBuilder { max_tasks: usize, cleanup_method: CleanupMethod, + // whether to tracker task scheduling wait duration + enable_task_wait_metrics: bool, + metric_idx_from_task_meta: Option usize + Send + Sync>>, + #[cfg(test)] background_cleanup_hook: Option>, } @@ -275,6 +284,9 @@ impl YatpPoolBuilder { max_tasks: std::usize::MAX, cleanup_method: CleanupMethod::InPlace, + enable_task_wait_metrics: false, + metric_idx_from_task_meta: None, + #[cfg(test)] background_cleanup_hook: None, } @@ -344,6 +356,19 @@ impl YatpPoolBuilder { self } + pub fn enable_task_wait_metrics(mut self) -> Self { + self.enable_task_wait_metrics = true; + self + } + + pub fn metric_idx_from_task_meta( + mut self, + f: Arc usize + Send + Sync>, + ) -> Self { + self.metric_idx_from_task_meta = Some(f); + self + } + pub fn build_future_pool(self) -> FuturePool { let name = self .name_prefix @@ -469,15 +494,24 @@ impl YatpPoolBuilder { let after_start = self.after_start.take(); let before_stop = self.before_stop.take(); let before_pause = self.before_pause.take(); - let schedule_wait_duration = - metrics::YATP_POOL_SCHEDULE_WAIT_DURATION_VEC.with_label_values(&[&name]); + let schedule_wait_durations = if self.enable_task_wait_metrics { + ["high", "medium", "low"].map(|p| { + metrics::YATP_POOL_SCHEDULE_WAIT_DURATION_VEC.with_label_values(&[&name, p]) + }) + } else { + std::array::from_fn(|_| Histogram::with_opts(HistogramOpts::new("_", "_")).unwrap()) + }; + let metric_idx_from_task_meta = self + .metric_idx_from_task_meta + .unwrap_or_else(|| Arc::new(|_| 0)); let read_pool_runner = YatpPoolRunner::new( Default::default(), self.ticker.clone(), after_start, before_stop, before_pause, - schedule_wait_duration, + schedule_wait_durations, + metric_idx_from_task_meta, ); (builder, read_pool_runner) } @@ -500,6 +534,7 @@ mod tests { let name = "test_record_schedule_wait_duration"; let pool = YatpPoolBuilder::new(DefaultTicker::default()) .name_prefix(name) + .enable_task_wait_metrics() .build_single_level_pool(); let (tx, rx) = mpsc::channel(); for _ in 0..3 { @@ -518,7 +553,8 @@ mod tests { } // Drop the pool so the local metrics are flushed. 
drop(pool); - let histogram = metrics::YATP_POOL_SCHEDULE_WAIT_DURATION_VEC.with_label_values(&[name]); + let histogram = + metrics::YATP_POOL_SCHEDULE_WAIT_DURATION_VEC.with_label_values(&[name, "high"]); assert_eq!(histogram.get_sample_count() as u32, 6, "{:?}", histogram); } diff --git a/etc/OWNERS b/etc/OWNERS new file mode 100644 index 00000000000..e8de98f61cb --- /dev/null +++ b/etc/OWNERS @@ -0,0 +1,7 @@ +# See the OWNERS docs at https://go.k8s.io/owners +options: + no_parent_owners: true +filters: + "(OWNERS|config-template\\.toml)$": + approvers: + - sig-critical-approvers-config-src diff --git a/etc/config-template.toml b/etc/config-template.toml index 36d8d25d883..8b4f2cf9ed7 100644 --- a/etc/config-template.toml +++ b/etc/config-template.toml @@ -83,6 +83,18 @@ ## maximum number of old log files to retain # max-backups = 0 +[memory] +## Whether enable the heap profiling which may have a bit performance overhead about 2% for the +## default sample rate. +# enable-heap-profiling = true + +## Average interval between allocation samples, as measured in bytes of allocation activity. +## Increasing the sampling interval decreases profile fidelity, but also decreases the +## computational overhead. +## The default sample interval is 512 KB. It only accepts power of two, otherwise it will be +## rounded up to the next power of two. +# profiling-sample-per-bytes = "512KB" + ## Configurations for the single thread pool serving read requests. [readpool.unified] ## The minimal working thread count of the thread pool. @@ -437,6 +449,15 @@ ## exceeds `region-compact-tombstones-percent`. # region-compact-tombstones-percent = 30 +## The minimum number of duplicated MVCC keys to trigger manual compaction. +# region-compact-min-redundant-rows = 50000 + +## The minimum percentage of duplicated MVCC keys to trigger manual compaction. +## It should be set between 1 and 100. Manual compaction is only triggered when the number of +## duplicated MVCC keys exceeds `region-compact-min-redundant-rows` and the percentage of duplicated MVCC keys +## exceeds `region-compact-redundant-rows-percent`. +# region-compact-redundant-rows-percent = 20 + ## Interval to check whether to start a manual compaction for Lock Column Family. ## If written bytes reach `lock-cf-compact-bytes-threshold` for Lock Column Family, TiKV will ## trigger a manual compaction for Lock Column Family. diff --git a/metrics/alertmanager/tikv.rules.yml b/metrics/alertmanager/tikv.rules.yml index e43ca401d42..94805bc0733 100644 --- a/metrics/alertmanager/tikv.rules.yml +++ b/metrics/alertmanager/tikv.rules.yml @@ -1,6 +1,18 @@ groups: - name: alert.rules rules: + - alert: TiKV_critical_error + expr: sum(rate(tikv_critical_error_total[1m])) BY (type, instance) > 0 + # without the for clause will become active on the first evaluation. 
+ labels: + env: ENV_LABELS_ENV + level: critical + expr: sum(rate(tikv_critical_error_total[1m])) BY (type, instance) > 0 + annotations: + description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}' + value: '{{ $value }}' + summary: TiKV encounters critical error + - alert: TiKV_memory_used_too_fast expr: process_resident_memory_bytes{job=~"tikv",instance=~".*"} - (process_resident_memory_bytes{job=~"tikv",instance=~".*"} offset 5m) > 5*1024*1024*1024 for: 5m @@ -15,7 +27,7 @@ groups: - alert: TiKV_GC_can_not_work expr: sum(increase(tikv_gcworker_gc_tasks_vec{task="gc"}[1d])) < 1 and (sum(increase(tikv_gc_compaction_filter_perform[1d])) < 1 and sum(increase(tikv_engine_event_total{db="kv", cf="write", type="compaction"}[1d])) >= 1) - for: 1m + for: 5m labels: env: ENV_LABELS_ENV level: emergency diff --git a/metrics/grafana/tikv_details.json b/metrics/grafana/tikv_details.json index d327041cd8a..c48a3317cd6 100644 --- a/metrics/grafana/tikv_details.json +++ b/metrics/grafana/tikv_details.json @@ -179,6 +179,14 @@ "interval": "", "legendFormat": "Apply Duration .99", "refId": "E" + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.99, sum(rate(tikv_raftstore_raft_msg_wait_time_duration_secs_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le))", + "hide": false, + "interval": "", + "legendFormat": "Raft Message Wait .99", + "refId": "F" } ], "thresholds": [], @@ -422,9 +430,36 @@ "expr": "sum(tikv_store_size_bytes{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"used\"}) by (instance)", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{instance}}", + "legendFormat": "{{instance}}-used", "refId": "A", "step": 10 + }, + { + "expr": "sum(tikv_store_size_bytes{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"kv_size\"}) by (instance)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}-kv_size", + "refId": "B", + "step": 10, + "hide": true + }, + { + "expr": "sum(tikv_store_size_bytes{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"raft_size\"}) by (instance)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}-raft_size", + "refId": "C", + "step": 10, + "hide": true + }, + { + "expr": "sum(tikv_store_size_bytes{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"import_size\"}) by (instance)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}-import_size", + "refId": "D", + "step": 10, + "hide": true } ], "thresholds": [], @@ -1954,7 +1989,26 @@ "hide": false, "interval": "", "legendFormat": "store-write-channelfull-{{instance}}", - "refId": "E" + "metric": "", + "query": "sum(rate(\n tikv_raftstore_store_write_msg_block_wait_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum((\n tikv_raftstore_process_busy\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (instance, type) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}-{{type}}", + "metric": "", + "query": "sum((\n 
tikv_raftstore_process_busy\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (instance, type) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], @@ -5792,7 +5846,7 @@ "fillGradient": 0, "gridPos": { "h": 9, - "w": 24, + "w": 12, "x": 0, "y": 37 }, @@ -5881,6 +5935,111 @@ "align": false, "alignLevel": null } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "decimals": 1, + "description": "The count of gRPC raft message", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 37 + }, + "hiddenSeries": false, + "id": 24763573092, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.11", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(tikv_raftstore_message_recv_by_store{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (instance, store)", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{instance}} - {{store}}", + "metric": "tikv_raftstore_message_recv_by_store", + "refId": "A", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "gRPC message count", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } } ], "repeat": null, @@ -9428,11 +9587,11 @@ "targets": [ { "exemplar": true, - "expr": "tikv_raftstore_slow_score{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}", + "expr": "tikv_raftstore_slow_score{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"} by (instance, type)", "format": "time_series", "interval": "", "intervalFactor": 2, - "legendFormat": "{{instance}}", + "legendFormat": "{{instance}}-{{type}}", "refId": "A", "step": 4 } @@ -13865,7 +14024,7 @@ "format": "heatmap", "intervalFactor": 2, "legendFormat": "{{le}}", - "metric": "tikv_raftstore_request_wait_time_duration_secs_bucket", + "metric": "tikv_raftstore_apply_wait_time_duration_secs_bucket", "refId": "A", "step": 4 } @@ -14043,7 +14202,7 @@ "interval": "", "intervalFactor": 2, "legendFormat": "{{le}}", - "metric": "tikv_raftstore_request_wait_time_duration_secs_bucket", + "metric": "tikv_raftstore_store_write_handle_msg_duration_secs_bucket", "refId": "A", "step": 4 } @@ -14117,7 +14276,7 @@ "interval": "", "intervalFactor": 2, "legendFormat": "{{le}}", - "metric": "tikv_raftstore_request_wait_time_duration_secs_bucket", + "metric": 
"tikv_raftstore_store_write_trigger_wb_bytes_bucket", "refId": "A", "step": 4 } @@ -14306,7 +14465,7 @@ "format": "time_series", "intervalFactor": 2, "legendFormat": "store-{{type}}", - "metric": "tikv_raftstore_request_wait_time_duration_secs_bucket", + "metric": "tikv_raftstore_store_perf_context_time_duration_secs_bucket", "refId": "A", "step": 4 }, @@ -14360,6 +14519,77 @@ "align": false, "alignLevel": null } + }, + { + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateSpectral", + "exponent": 0.5, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": "${DS_TEST-CLUSTER}", + "description": "The wait time of each raft message", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 62 + }, + "heatmap": {}, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 1977, + "legend": { + "show": false + }, + "links": [], + "reverseYBuckets": false, + "targets": [ + { + "expr": "sum(delta(tikv_raftstore_raft_msg_wait_time_duration_secs_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le)", + "format": "heatmap", + "intervalFactor": 2, + "legendFormat": "{{le}}", + "metric": "tikv_raftstore_raft_msg_wait_time_duration_secs_bucket", + "refId": "A", + "step": 4 + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Raft message wait duration", + "tooltip": { + "show": true, + "showHistogram": false + }, + "type": "heatmap", + "xAxis": { + "show": true + }, + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": 0, + "format": "s", + "logBase": 1, + "max": null, + "min": null, + "show": true, + "splitFactor": null + }, + "yBucketBound": "upper", + "yBucketNumber": null, + "yBucketSize": null } ], "repeat": null, @@ -25758,6 +25988,117 @@ "align": false, "alignLevel": null } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "description": "Total bytes of memory used by coprocessor requests", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 71 + }, + "hiddenSeries": false, + "hideTimeOverride": false, + "id": 24763573286, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sort": "max", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.5.27", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum((\n tikv_coprocessor_memory_quota\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=~\"$tidb_cluster.*\",instance=~\"$instance\"}\n \n)) by (instance, type) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}-{{type}}", + "metric": "", + "query": "sum((\n 
tikv_coprocessor_memory_quota\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=~\"$tidb_cluster.*\",instance=~\"$instance\"}\n \n)) by (instance, type) ", + "refId": "A", + "step": 10, + "target": "" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Memory Quota", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "transformations": [], + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "logBase": 1, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": 0 + } } ], "repeat": null, @@ -31941,7 +32282,7 @@ "steppedLine": false, "targets": [ { - "expr": "avg(tikv_engine_memory_bytes{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\", type=\"mem-tables\"}) by (cf)", + "expr": "avg(tikv_engine_memory_bytes{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\", type=\"mem-tables-all\"}) by (cf)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{cf}}", @@ -39068,7 +39409,7 @@ "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", - "description": "The gap between safe ts and current time", + "description": "The gap between now() and the minimal (non-zero) safe ts for followers", "editable": true, "error": false, "fieldConfig": { @@ -39119,7 +39460,7 @@ "targets": [ { "exemplar": true, - "expr": "sum(tikv_resolved_ts_min_safe_ts_gap_millis{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}) by (instance)", + "expr": "sum(tikv_resolved_ts_min_follower_safe_ts_gap_millis{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}) by (instance)", "format": "time_series", "interval": "", "intervalFactor": 2, @@ -39132,7 +39473,7 @@ "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Max gap of safe-ts", + "title": "Max gap of follower safe-ts", "tooltip": { "msResolution": false, "shared": true, @@ -39292,7 +39633,7 @@ "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, - "description": "The region that has minimal safe ts", + "description": "The region id of the follower that has minimal safe ts", "editable": true, "error": false, "fieldConfig": { @@ -39348,7 +39689,7 @@ "targets": [ { "exemplar": true, - "expr": "sum(tikv_resolved_ts_min_safe_ts_region{tidb_cluster=~\"$tidb_cluster.*\", instance=~\"$instance\"}) by (instance)", + "expr": "sum(tikv_resolved_ts_min_follower_safe_ts_region{tidb_cluster=~\"$tidb_cluster.*\", instance=~\"$instance\"}) by (instance)", "format": "time_series", "hide": false, "interval": "", @@ -39362,7 +39703,7 @@ "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Min Safe TS Region", + "title": "Min Safe TS Follower Region", "tooltip": { "msResolution": false, "shared": true, @@ -39471,6 +39812,79 @@ "yBucketNumber": null, "yBucketSize": null }, + { + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateSpectral", + "exponent": 0.5, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": "${DS_TEST-CLUSTER}", + "description": "The backoff duration before starting initial scan", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 70 + }, + "heatmap": {}, + 
"hideZeroBuckets": true, + "highlightCards": true, + "id": 23763573950, + "legend": { + "show": false + }, + "links": [], + "reverseYBuckets": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(rate(tikv_resolved_ts_initial_scan_backoff_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le)", + "format": "heatmap", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{le}}", + "metric": "", + "refId": "A", + "step": 4 + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Initial scan backoff duration", + "tooltip": { + "show": true, + "showHistogram": false + }, + "type": "heatmap", + "xAxis": { + "show": true + }, + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": 0, + "format": "s", + "logBase": 1, + "max": null, + "min": null, + "show": true, + "splitFactor": null + }, + "yBucketBound": "upper", + "yBucketNumber": null, + "yBucketSize": null + }, { "aliasColors": {}, "bars": false, @@ -40092,6 +40506,15 @@ "legendFormat": "{{instance}}-{{reason}}", "refId": "A", "step": 10 + }, + { + "expr": "sum(delta(tikv_raftstore_check_stale_peer{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (instance)", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{instance}}-stale-peer", + "refId": "B", + "step": 10 } ], "thresholds": [], @@ -48186,7 +48609,7 @@ "refId": "StandardVariableQuery" }, "refresh": 1, - "regex": "/type=\"([^\"]+)\"/", + "regex": "/\\btype=\"([^\"]+)\"/", "skipUrlSync": false, "sort": 1, "tagValuesQuery": "", diff --git a/proxy_components/engine_tiflash/Cargo.toml b/proxy_components/engine_tiflash/Cargo.toml index f70029b4732..b218c0d6d9e 100644 --- a/proxy_components/engine_tiflash/Cargo.toml +++ b/proxy_components/engine_tiflash/Cargo.toml @@ -83,6 +83,7 @@ yatp = { git = "https://github.com/tikv/yatp.git", branch = "master" } git = "https://github.com/tikv/rust-rocksdb.git" package = "rocksdb" features = ["encryption"] +branch = "tikv-7.5" [dev-dependencies] rand = "0.8" diff --git a/proxy_components/engine_tiflash/src/cf_options.rs b/proxy_components/engine_tiflash/src/cf_options.rs index 1162c67f210..6a2372fb31f 100644 --- a/proxy_components/engine_tiflash/src/cf_options.rs +++ b/proxy_components/engine_tiflash/src/cf_options.rs @@ -40,6 +40,23 @@ impl RocksCfOptions { pub fn into_raw(self) -> RawCfOptions { self.0 } + + pub fn set_flush_size(&mut self, f: usize) -> Result<()> { + if let Some(m) = self.0.get_write_buffer_manager() { + m.set_flush_size(f); + } else { + return Err(box_err!("write buffer manager not found")); + } + Ok(()) + } + + pub fn get_flush_size(&self) -> Result { + if let Some(m) = self.0.get_write_buffer_manager() { + return Ok(m.flush_size() as u64); + } + + Err(box_err!("write buffer manager not found")) + } } impl Deref for RocksCfOptions { diff --git a/proxy_components/engine_tiflash/src/compact.rs b/proxy_components/engine_tiflash/src/compact.rs index 199b7d9f3be..f64c9a7d49e 100644 --- a/proxy_components/engine_tiflash/src/compact.rs +++ b/proxy_components/engine_tiflash/src/compact.rs @@ -121,6 +121,10 @@ impl CompactExt for RocksEngine { db.compact_files_cf(handle, &opts, &files, output_level) .map_err(r2e) } + + fn check_in_range(&self, start: Option<&[u8]>, end: Option<&[u8]>) -> Result<()> { + self.as_inner().check_in_range(start, end).map_err(r2e) + } } #[cfg(test)] diff --git a/proxy_components/engine_tiflash/src/db_options.rs 
b/proxy_components/engine_tiflash/src/db_options.rs index c9ef2cfda98..c95f81f8297 100644 --- a/proxy_components/engine_tiflash/src/db_options.rs +++ b/proxy_components/engine_tiflash/src/db_options.rs @@ -100,6 +100,14 @@ impl DbOptions for RocksDbOptions { Ok(()) } + fn get_flush_size(&self) -> Result { + if let Some(m) = self.0.get_write_buffer_manager() { + return Ok(m.flush_size() as u64); + } + + Err(box_err!("write buffer manager not found")) + } + fn set_flush_oldest_first(&mut self, f: bool) -> Result<()> { if let Some(m) = self.0.get_write_buffer_manager() { m.set_flush_oldest_first(f); @@ -112,6 +120,10 @@ impl DbOptions for RocksDbOptions { fn set_titandb_options(&mut self, opts: &Self::TitanDbOptions) { self.0.set_titandb_options(opts.as_raw()) } + + fn set_track_and_verify_wals_in_manifest(&mut self, v: bool) { + self.0.set_track_and_verify_wals_in_manifest(v) + } } pub struct RocksTitanDbOptions(RawTitanDBOptions); diff --git a/proxy_components/engine_tiflash/src/misc.rs b/proxy_components/engine_tiflash/src/misc.rs index 29d665ce563..c043e9b2050 100644 --- a/proxy_components/engine_tiflash/src/misc.rs +++ b/proxy_components/engine_tiflash/src/misc.rs @@ -1,16 +1,20 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. -use engine_rocks::{get_range_stats, STORE_ENGINE_EVENT_COUNTER_VEC}; + use engine_traits::{ CfNamesExt, DeleteStrategy, ImportExt, IterOptions, Iterable, Iterator, MiscExt, Mutable, Range, RangeStats, Result, SstWriter, SstWriterBuilder, WriteBatch, WriteBatchExt, - WriteOptions, ALL_CFS, + WriteOptions, }; use rocksdb::{FlushOptions, Range as RocksRange}; use tikv_util::{box_try, keybuilder::KeyBuilder}; use crate::{ - engine::RocksEngine, r2e, rocks_metrics::RocksStatisticsReporter, rocks_metrics_defs::*, - sst::RocksSstWriterBuilder, util, RocksSstWriter, + engine::RocksEngine, + r2e, + rocks_metrics::{RocksStatisticsReporter, STORE_ENGINE_EVENT_COUNTER_VEC}, + rocks_metrics_defs::*, + sst::RocksSstWriterBuilder, + util, RocksSstWriter, }; pub const MAX_DELETE_COUNT_BY_KEY: usize = 2048; @@ -315,7 +319,7 @@ impl MiscExt for RocksEngine { fn get_engine_used_size(&self) -> Result { let mut used_size: u64 = 0; - for cf in ALL_CFS { + for cf in self.cf_names() { let handle = util::get_cf_handle(self.as_inner(), cf)?; used_size += util::get_engine_cf_used_size(self.as_inner(), handle); } @@ -330,16 +334,26 @@ impl MiscExt for RocksEngine { self.as_inner().sync_wal().map_err(r2e) } + fn disable_manual_compaction(&self) -> Result<()> { + self.as_inner().disable_manual_compaction(); + Ok(()) + } + + fn enable_manual_compaction(&self) -> Result<()> { + self.as_inner().enable_manual_compaction(); + Ok(()) + } + fn pause_background_work(&self) -> Result<()> { // This will make manual compaction return error instead of waiting. In practice // we might want to identify this case by parsing error message. 
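// Editor's aside — a minimal sketch of the flush-size accessors added
// above (the 128 MiB figure is illustrative):
//
//     // assuming `cf_opts` was created with a write buffer manager
//     // attached; otherwise both calls return
//     // Err("write buffer manager not found"):
//     cf_opts.set_flush_size(128 << 20)?;
//     assert_eq!(cf_opts.get_flush_size()?, 128 << 20);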
- self.as_inner().disable_manual_compaction(); + self.disable_manual_compaction()?; self.as_inner().pause_bg_work(); Ok(()) } fn continue_background_work(&self) -> Result<()> { - self.as_inner().enable_manual_compaction(); + self.enable_manual_compaction()?; self.as_inner().continue_bg_work(); Ok(()) } @@ -411,7 +425,7 @@ impl MiscExt for RocksEngine { } fn get_range_stats(&self, cf: &str, start: &[u8], end: &[u8]) -> Result> { - Ok(get_range_stats(&self.rocks, cf, start, end)) + Ok(crate::properties::get_range_stats(self, cf, start, end)) } fn is_stalled_or_stopped(&self) -> bool { @@ -449,7 +463,8 @@ impl MiscExt for RocksEngine { #[cfg(test)] mod tests { use engine_traits::{ - DeleteStrategy, Iterable, Iterator, Mutable, SyncMutable, WriteBatchExt, ALL_CFS, + CompactExt, DeleteStrategy, Iterable, Iterator, Mutable, SyncMutable, WriteBatchExt, + ALL_CFS, }; use tempfile::Builder; @@ -505,7 +520,8 @@ mod tests { wb.write().unwrap(); check_data(&db, ALL_CFS, kvs.as_slice()); - db.delete_ranges_cfs(strategy, ranges).unwrap(); + db.delete_ranges_cfs(&WriteOptions::default(), strategy, ranges) + .unwrap(); let mut kvs_left: Vec<_> = kvs; for r in ranges { @@ -643,10 +659,18 @@ mod tests { } check_data(&db, ALL_CFS, kvs.as_slice()); - db.delete_ranges_cfs(DeleteStrategy::DeleteFiles, &[Range::new(b"k2", b"k4")]) - .unwrap(); - db.delete_ranges_cfs(DeleteStrategy::DeleteBlobs, &[Range::new(b"k2", b"k4")]) - .unwrap(); + db.delete_ranges_cfs( + &WriteOptions::default(), + DeleteStrategy::DeleteFiles, + &[Range::new(b"k2", b"k4")], + ) + .unwrap(); + db.delete_ranges_cfs( + &WriteOptions::default(), + DeleteStrategy::DeleteBlobs, + &[Range::new(b"k2", b"k4")], + ) + .unwrap(); check_data(&db, ALL_CFS, kvs_left.as_slice()); } @@ -691,10 +715,119 @@ mod tests { // Delete all in ["k2", "k4"). db.delete_ranges_cfs( + &WriteOptions::default(), DeleteStrategy::DeleteByRange, &[Range::new(b"kabcdefg2", b"kabcdefg4")], ) .unwrap(); check_data(&db, &[cf], kvs_left.as_slice()); } + + #[test] + fn test_get_sst_key_ranges() { + let path = Builder::new() + .prefix("test_get_sst_key_ranges") + .tempdir() + .unwrap(); + let path_str = path.path().to_str().unwrap(); + + let mut opts = RocksDbOptions::default(); + opts.create_if_missing(true); + opts.enable_multi_batch_write(true); + + let mut cf_opts = RocksCfOptions::default(); + // Prefix extractor(trim the timestamp at tail) for write cf. + cf_opts + .set_prefix_extractor( + "FixedSuffixSliceTransform", + crate::util::FixedSuffixSliceTransform::new(8), + ) + .unwrap_or_else(|err| panic!("{:?}", err)); + // Create prefix bloom filter for memtable. 
+ cf_opts.set_memtable_prefix_bloom_size_ratio(0.1_f64); + let cf = "default"; + let db = new_engine_opt(path_str, opts, vec![(cf, cf_opts)]).unwrap(); + let mut wb = db.write_batch(); + let kvs: Vec<(&[u8], &[u8])> = vec![ + (b"k1", b"v1"), + (b"k2", b"v2"), + (b"k6", b"v3"), + (b"k7", b"v4"), + ]; + + for &(k, v) in kvs.as_slice() { + wb.put_cf(cf, k, v).unwrap(); + } + wb.write().unwrap(); + + db.flush_cf(cf, true).unwrap(); + let sst_range = db.get_sst_key_ranges(cf, 0).unwrap(); + let expected = vec![(b"k1".to_vec(), b"k7".to_vec())]; + assert_eq!(sst_range, expected); + + let mut wb = db.write_batch(); + let kvs: Vec<(&[u8], &[u8])> = vec![(b"k3", b"v1"), (b"k4", b"v2"), (b"k8", b"v3")]; + + for &(k, v) in kvs.as_slice() { + wb.put_cf(cf, k, v).unwrap(); + } + wb.write().unwrap(); + + db.flush_cf(cf, true).unwrap(); + let sst_range = db.get_sst_key_ranges(cf, 0).unwrap(); + let expected = vec![ + (b"k3".to_vec(), b"k8".to_vec()), + (b"k1".to_vec(), b"k7".to_vec()), + ]; + assert_eq!(sst_range, expected); + + db.compact_range_cf(cf, None, None, false, 1).unwrap(); + let sst_range = db.get_sst_key_ranges(cf, 0).unwrap(); + assert_eq!(sst_range.len(), 0); + let sst_range = db.get_sst_key_ranges(cf, 1).unwrap(); + let expected = vec![(b"k1".to_vec(), b"k8".to_vec())]; + assert_eq!(sst_range, expected); + } + + #[test] + fn test_flush_oldest() { + let path = Builder::new() + .prefix("test_flush_oldest") + .tempdir() + .unwrap(); + let path_str = path.path().to_str().unwrap(); + + let mut opts = RocksDbOptions::default(); + opts.create_if_missing(true); + + let db = new_engine(path_str, ALL_CFS).unwrap(); + db.put_cf("default", b"k", b"v").unwrap(); + std::thread::sleep(std::time::Duration::from_secs(1)); + db.put_cf("write", b"k", b"v").unwrap(); + db.put_cf("lock", b"k", b"v").unwrap(); + assert_eq!( + db.get_total_sst_files_size_cf("default").unwrap().unwrap(), + 0 + ); + assert_eq!(db.get_total_sst_files_size_cf("write").unwrap().unwrap(), 0); + assert_eq!(db.get_total_sst_files_size_cf("lock").unwrap().unwrap(), 0); + let now = std::time::SystemTime::now(); + assert!( + !db.flush_oldest_cf(true, Some(now - std::time::Duration::from_secs(5))) + .unwrap() + ); + assert_eq!( + db.get_total_sst_files_size_cf("default").unwrap().unwrap(), + 0 + ); + assert_eq!(db.get_total_sst_files_size_cf("write").unwrap().unwrap(), 0); + assert_eq!(db.get_total_sst_files_size_cf("lock").unwrap().unwrap(), 0); + assert!( + db.flush_oldest_cf(true, Some(now - std::time::Duration::from_secs(1))) + .unwrap() + ); + assert_eq!(db.get_total_sst_files_size_cf("write").unwrap().unwrap(), 0); + assert_eq!(db.get_total_sst_files_size_cf("lock").unwrap().unwrap(), 0); + assert!(db.get_total_sst_files_size_cf("default").unwrap().unwrap() > 0); + } } diff --git a/proxy_components/engine_tiflash/src/properties.rs b/proxy_components/engine_tiflash/src/properties.rs index b92fe57c30c..1739fb1036e 100644 --- a/proxy_components/engine_tiflash/src/properties.rs +++ b/proxy_components/engine_tiflash/src/properties.rs @@ -8,7 +8,8 @@ use std::{ u64, }; -use engine_traits::{MvccProperties, Range}; +use api_version::{ApiV2, KeyMode, KvFormat}; +use engine_traits::{raw_ttl::ttl_current_ts, MvccProperties, Range, RangeStats}; use rocksdb::{ DBEntryType, TablePropertiesCollector, TablePropertiesCollectorFactory, TitanBlobIndex, UserCollectedProperties, @@ -130,12 +131,6 @@ impl<'a> DecodeProperties for UserCollectedPropertiesDecoder<'a> { } } -#[derive(Debug, Clone, PartialEq, Copy)] -pub enum RangeOffsetKind { - Size, - Keys, 
-}
-
 #[derive(Debug, Default, Clone, Copy)]
 pub struct RangeOffsets {
     pub size: u64,
@@ -205,7 +200,6 @@ impl RangeProperties {
     }

     /// Returns `size` and `keys`.
-    #[allow(clippy::redundant_closure)]
     pub fn get_approximate_distance_in_range(&self, start: &[u8], end: &[u8]) -> (u64, u64) {
         assert!(start <= end);
         if start == end {
@@ -387,7 +381,8 @@ impl TablePropertiesCollectorFactory for RangeProperti
     }
 }

-/// Can only be used for write CF.
+/// Can be used for the write CF in the TiDB & TxnKV scenarios, or for the
+/// default CF in the RawKV scenario.
 pub struct MvccPropertiesCollector {
     props: MvccProperties,
     last_row: Vec<u8>,
@@ -395,10 +390,12 @@ pub struct MvccPropertiesCollector {
     row_versions: u64,
     cur_index_handle: IndexHandle,
     row_index_handles: IndexHandles,
+    key_mode: KeyMode, // Use KeyMode::Txn for both TiDB & TxnKV, KeyMode::Raw for RawKV.
+    current_ts: u64,
 }

 impl MvccPropertiesCollector {
-    fn new() -> MvccPropertiesCollector {
+    fn new(key_mode: KeyMode) -> MvccPropertiesCollector {
         MvccPropertiesCollector {
             props: MvccProperties::new(),
             last_row: Vec::new(),
@@ -406,6 +403,8 @@ impl MvccPropertiesCollector {
             row_versions: 0,
             cur_index_handle: IndexHandle::default(),
             row_index_handles: IndexHandles::new(),
+            key_mode,
+            current_ts: ttl_current_ts(),
         }
     }
 }
@@ -415,7 +414,10 @@ impl TablePropertiesCollector for MvccPropertiesCollector {
         // TsFilter filters sst based on max_ts and min_ts during iterating.
         // To prevent seeing outdated (GC) records, we should consider
         // RocksDB delete entry type.
-        if entry_type != DBEntryType::Put && entry_type != DBEntryType::Delete {
+        if entry_type != DBEntryType::Put
+            && entry_type != DBEntryType::Delete
+            && entry_type != DBEntryType::BlobIndex
+        {
             return;
         }
@@ -453,18 +455,43 @@ impl TablePropertiesCollector for MvccPropertiesCollector {
             self.props.max_row_versions = self.row_versions;
         }

-        let write_type = match Write::parse_type(value) {
-            Ok(v) => v,
-            Err(_) => {
-                self.num_errors += 1;
-                return;
+        if entry_type != DBEntryType::BlobIndex {
+            if self.key_mode == KeyMode::Raw {
+                let decode_raw_value = ApiV2::decode_raw_value(value);
+                match decode_raw_value {
+                    Ok(raw_value) => {
+                        if raw_value.is_valid(self.current_ts) {
+                            self.props.num_puts += 1;
+                        } else {
+                            self.props.num_deletes += 1;
+                        }
+                        if let Some(expire_ts) = raw_value.expire_ts {
+                            self.props.ttl.add(expire_ts);
+                        }
+                    }
+                    Err(_) => {
+                        self.num_errors += 1;
+                    }
+                }
+            } else {
+                let write_type = match Write::parse_type(value) {
+                    Ok(v) => v,
+                    Err(_) => {
+                        self.num_errors += 1;
+                        return;
+                    }
+                };
+
+                match write_type {
+                    WriteType::Put => self.props.num_puts += 1,
+                    WriteType::Delete => self.props.num_deletes += 1,
+                    _ => {}
+                }
             }
-        };
-
-        match write_type {
-            WriteType::Put => self.props.num_puts += 1,
-            WriteType::Delete => self.props.num_deletes += 1,
-            _ => {}
+        } else {
+            // NOTE: if Titan is enabled, the entry will always be treated as a PUT.
+            // Be careful if you try to enable Titan on CF_WRITE.
+            self.props.num_puts += 1;
         }

         // Add new row.
@@ -494,22 +521,33 @@ impl TablePropertiesCollector for MvccPropertiesCollector {
     }
 }

-/// Can only be used for write CF.
+/// Can be used for the write CF of TiDB/TxnKV, or the default CF of RawKV.
 #[derive(Default)]
 pub struct MvccPropertiesCollectorFactory {}

 impl TablePropertiesCollectorFactory<MvccPropertiesCollector> for MvccPropertiesCollectorFactory {
     fn create_table_properties_collector(&mut self, _: u32) -> MvccPropertiesCollector {
-        MvccPropertiesCollector::new()
+        MvccPropertiesCollector::new(KeyMode::Txn)
+    }
+}
+
+#[derive(Default)]
+pub struct RawMvccPropertiesCollectorFactory {}
+
+impl TablePropertiesCollectorFactory<MvccPropertiesCollector>
+    for RawMvccPropertiesCollectorFactory
+{
+    fn create_table_properties_collector(&mut self, _: u32) -> MvccPropertiesCollector {
+        MvccPropertiesCollector::new(KeyMode::Raw)
     }
 }

-pub fn get_range_entries_and_versions(
+pub fn get_range_stats(
     engine: &crate::RocksEngine,
     cf: &str,
     start: &[u8],
     end: &[u8],
-) -> Option<(u64, u64)> {
+) -> Option<RangeStats> {
     let range = Range::new(start, end);
     let collection = match engine.get_properties_of_tables_in_range(cf, &[range]) {
         Ok(v) => v,
@@ -531,12 +569,17 @@ pub fn get_range_entries_and_versions(
         num_entries += v.num_entries();
         props.add(&mvcc);
     }
-
-    Some((num_entries, props.num_versions))
+    Some(RangeStats {
+        num_entries,
+        num_versions: props.num_versions,
+        num_rows: props.num_rows,
+        num_deletes: props.num_deletes,
+    })
 }

 #[cfg(test)]
 mod tests {
+    use api_version::RawValue;
     use engine_traits::{MiscExt, SyncMutable, CF_WRITE, LARGE_CFS};
     use rand::Rng;
     use tempfile::Builder;
@@ -746,10 +789,9 @@ mod tests {
         let start_keys = keys::data_key(&[]);
         let end_keys = keys::data_end_key(&[]);

-        let (entries, versions) =
-            get_range_entries_and_versions(&db, CF_WRITE, &start_keys, &end_keys).unwrap();
-        assert_eq!(entries, (cases.len() * 2) as u64);
-        assert_eq!(versions, cases.len() as u64);
+        let range_stats = get_range_stats(&db, CF_WRITE, &start_keys, &end_keys).unwrap();
+        assert_eq!(range_stats.num_entries, (cases.len() * 2) as u64);
+        assert_eq!(range_stats.num_versions, cases.len() as u64);
     }

     #[test]
@@ -765,7 +807,7 @@ mod tests {
             ("ef", 6, WriteType::Put, DBEntryType::Delete),
             ("gh", 7, WriteType::Delete, DBEntryType::Put),
         ];
-        let mut collector = MvccPropertiesCollector::new();
+        let mut collector = MvccPropertiesCollector::new(KeyMode::Txn);
         for &(key, ts, write_type, entry_type) in &cases {
             let ts = ts.into();
             let k = Key::from_raw(key.as_bytes()).append_ts(ts);
@@ -784,6 +826,44 @@ mod tests {
         assert_eq!(props.max_row_versions, 3);
     }

+    #[test]
+    fn test_mvcc_properties_rawkv_mode() {
+        let test_raws = vec![
+            (b"r\0a", 1, false, u64::MAX),
+            (b"r\0a", 5, false, u64::MAX),
+            (b"r\0a", 7, false, u64::MAX),
+            (b"r\0b", 1, false, u64::MAX),
+            (b"r\0b", 1, true, u64::MAX),
+            (b"r\0c", 1, true, 10),
+            (b"r\0d", 1, true, 10),
+        ];
+
+        let mut collector = MvccPropertiesCollector::new(KeyMode::Raw);
+        for &(key, ts, is_delete, expire_ts) in &test_raws {
+            let encode_key = ApiV2::encode_raw_key(key, Some(ts.into()));
+            let k = keys::data_key(encode_key.as_encoded());
+            let v = ApiV2::encode_raw_value(RawValue {
+                user_value: &[0; 10][..],
+                expire_ts: Some(expire_ts),
+                is_delete,
+            });
+            collector.add(&k, &v, DBEntryType::Put, 0, 0);
+        }
+
+        let result = UserProperties(collector.finish());
+
+        let props = RocksMvccProperties::decode(&result).unwrap();
+        assert_eq!(props.min_ts, 1.into());
+        assert_eq!(props.max_ts, 7.into());
+        assert_eq!(props.num_rows, 4);
+        assert_eq!(props.num_deletes, 3);
+        assert_eq!(props.num_puts, 4);
+        assert_eq!(props.num_versions, 7);
+        assert_eq!(props.max_row_versions, 3);
+        assert_eq!(props.ttl.max_expire_ts, Some(u64::MAX));
+        assert_eq!(props.ttl.min_expire_ts, Some(10));
+    }
+
     #[bench]
     fn
bench_mvcc_properties(b: &mut Bencher) { let ts = 1.into(); @@ -797,7 +877,7 @@ mod tests { entries.push((k, w.as_ref().to_bytes())); } - let mut collector = MvccPropertiesCollector::new(); + let mut collector = MvccPropertiesCollector::new(KeyMode::Txn); b.iter(|| { for &(ref k, ref v) in &entries { collector.add(k, v, DBEntryType::Put, 0, 0); diff --git a/proxy_components/engine_tiflash/src/range_properties.rs b/proxy_components/engine_tiflash/src/range_properties.rs index 101a004982a..dfc41db5f6e 100644 --- a/proxy_components/engine_tiflash/src/range_properties.rs +++ b/proxy_components/engine_tiflash/src/range_properties.rs @@ -9,7 +9,7 @@ use tikv_util::{box_err, box_try, debug, info}; use crate::{ engine::RocksEngine, - properties::{get_range_entries_and_versions, RangeProperties}, + properties::{get_range_stats, RangeProperties}, }; impl RangePropertiesExt for RocksEngine { @@ -27,9 +27,8 @@ impl RangePropertiesExt for RocksEngine { let start = &range.start_key; let end = &range.end_key; - let (_, keys) = - get_range_entries_and_versions(self, CF_WRITE, start, end).unwrap_or_default(); - Ok(keys) + let range_stats = get_range_stats(self, CF_WRITE, start, end).unwrap_or_default(); + Ok(range_stats.num_versions) } fn get_range_approximate_keys_cf( diff --git a/proxy_components/engine_tiflash/src/sst_partitioner.rs b/proxy_components/engine_tiflash/src/sst_partitioner.rs index fc1dcd40270..f642a94f28f 100644 --- a/proxy_components/engine_tiflash/src/sst_partitioner.rs +++ b/proxy_components/engine_tiflash/src/sst_partitioner.rs @@ -23,6 +23,8 @@ impl rocksdb::SstPartitionerFactory output_level: context.output_level, smallest_key: context.smallest_key, largest_key: context.largest_key, + next_level_boundaries: context.next_level_boundaries.clone(), + next_level_sizes: context.next_level_sizes.clone(), }; self.0.create_partitioner(&ctx).map(RocksSstPartitioner) } diff --git a/proxy_components/proxy_ffi/Cargo.toml b/proxy_components/proxy_ffi/Cargo.toml index 6b32f62c7f8..6241342f930 100644 --- a/proxy_components/proxy_ffi/Cargo.toml +++ b/proxy_components/proxy_ffi/Cargo.toml @@ -46,10 +46,11 @@ tokio = { version = "1.5", features = ["sync", "rt-multi-thread"] } tokio-timer = { git = "https://github.com/tikv/tokio", branch = "tokio-timer-hotfix" } tracker = { workspace = true, default-features = false } reqwest = { version = "0.11", features = ["blocking"] } -url = "2.4.0" +url = "2" collections = { workspace = true } [dependencies.rocksdb] git = "https://github.com/tikv/rust-rocksdb.git" package = "rocksdb" -features = ["encryption"] \ No newline at end of file +features = ["encryption"] +branch = "tikv-7.5" \ No newline at end of file diff --git a/src/config/OWNERS b/src/config/OWNERS new file mode 100644 index 00000000000..8bddc663cb4 --- /dev/null +++ b/src/config/OWNERS @@ -0,0 +1,7 @@ +# See the OWNERS docs at https://go.k8s.io/owners +options: + no_parent_owners: true +filters: + "(OWNERS|mod\\.rs)$": + approvers: + - sig-critical-approvers-config-src diff --git a/src/config/configurable.rs b/src/config/configurable.rs index 6fe9409c1c0..c92b01cf465 100644 --- a/src/config/configurable.rs +++ b/src/config/configurable.rs @@ -15,6 +15,7 @@ pub trait ConfigurableDb { fn set_rate_bytes_per_sec(&self, rate_bytes_per_sec: i64) -> ConfigRes; fn set_rate_limiter_auto_tuned(&self, auto_tuned: bool) -> ConfigRes; fn set_flush_size(&self, f: usize) -> ConfigRes; + fn set_cf_flush_size(&self, cf: &str, f: usize) -> ConfigRes; fn set_flush_oldest_first(&self, f: bool) -> ConfigRes; fn 
set_shared_block_cache_capacity(&self, capacity: usize) -> ConfigRes; fn set_high_priority_background_threads(&self, n: i32, allow_reduce: bool) -> ConfigRes; @@ -57,6 +58,11 @@ impl ConfigurableDb for RocksEngine { opt.set_flush_size(f).map_err(Box::from) } + fn set_cf_flush_size(&self, cf: &str, f: usize) -> ConfigRes { + let mut cf_option = self.get_options_cf(cf)?; + cf_option.set_flush_size(f).map_err(Box::from) + } + fn set_flush_oldest_first(&self, f: bool) -> ConfigRes { let mut opt = self.get_db_options(); opt.set_flush_oldest_first(f).map_err(Box::from) @@ -171,6 +177,17 @@ impl ConfigurableDb for TabletRegistry { }) } + fn set_cf_flush_size(&self, cf: &str, f: usize) -> ConfigRes { + loop_registry(self, |cache| { + if let Some(latest) = cache.latest() { + latest.set_cf_flush_size(cf, f)?; + Ok(false) + } else { + Ok(true) + } + }) + } + fn set_flush_oldest_first(&self, f: bool) -> ConfigRes { loop_registry(self, |cache| { if let Some(latest) = cache.latest() { diff --git a/src/config/mod.rs b/src/config/mod.rs index 5c7f1424c38..cf57b3960fb 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -31,7 +31,7 @@ use engine_rocks::{ raw::{ BlockBasedOptions, Cache, ChecksumType, CompactionPriority, ConcurrentTaskLimiter, DBCompactionStyle, DBCompressionType, DBRateLimiterMode, DBRecoveryMode, Env, - PrepopulateBlockCache, RateLimiter, WriteBufferManager, + LRUCacheOptions, PrepopulateBlockCache, RateLimiter, WriteBufferManager, }, util::{ FixedPrefixSliceTransform, FixedSuffixSliceTransform, NoopSliceTransform, @@ -82,6 +82,7 @@ use crate::{ server::{ gc_worker::{GcConfig, RawCompactionFilterFactory, WriteCompactionFilterFactory}, lock_manager::Config as PessimisticTxnConfig, + status_server::HEAP_PROFILE_ACTIVE, ttl::TtlCompactionFilterFactory, Config as ServerConfig, CONFIG_ROCKSDB_GAUGE, }, @@ -110,6 +111,7 @@ const RAFT_ENGINE_MEMORY_LIMIT_RATE: f64 = 0.15; const WRITE_BUFFER_MEMORY_LIMIT_RATE: f64 = 0.2; // Too large will increase Raft Engine memory usage. const WRITE_BUFFER_MEMORY_LIMIT_MAX: u64 = ReadableSize::gb(8).0; +const DEFAULT_LOCK_BUFFER_MEMORY_LIMIT: ReadableSize = ReadableSize::mb(128); /// Configs that actually took effect in the last run pub const LAST_CONFIG_FILE: &str = "last_tikv.toml"; @@ -188,7 +190,10 @@ impl TitanCfConfig { let mut opts = RocksTitanDbOptions::new(); opts.set_min_blob_size(self.min_blob_size.0); opts.set_blob_file_compression(self.blob_file_compression.into()); - opts.set_blob_cache(self.blob_cache_size.0 as usize, -1, false, 0.0); + let mut cache_opts = LRUCacheOptions::new(); + cache_opts.set_capacity(self.blob_cache_size.0 as usize); + let cache = Cache::new_lru_cache(cache_opts); + opts.set_blob_cache(&cache); opts.set_min_gc_batch_size(self.min_gc_batch_size.0); opts.set_max_gc_batch_size(self.max_gc_batch_size.0); opts.set_discardable_ratio(self.discardable_ratio); @@ -243,22 +248,30 @@ const RAFTDB_DEFAULT_BACKGROUND_JOB_LIMITS: BackgroundJobLimits = BackgroundJobL // `defaults` serves as an upper bound for returning limits. fn get_background_job_limits_impl( + engine_type: EngineType, cpu_num: u32, defaults: &BackgroundJobLimits, ) -> BackgroundJobLimits { // At the minimum, we should have two background jobs: one for flush and one for // compaction. Otherwise, the number of background jobs should not exceed // cpu_num - 1. 
- let max_background_jobs = cmp::max(2, cmp::min(defaults.max_background_jobs, cpu_num - 1)); + let mut max_background_jobs = cmp::max(2, cmp::min(defaults.max_background_jobs, cpu_num - 1)); // Scale flush threads proportionally to cpu cores. Also make sure the number of // flush threads doesn't exceed total jobs. let max_background_flushes = cmp::min( (max_background_jobs + 3) / 4, defaults.max_background_flushes, ); - // Cap max_sub_compactions to allow at least two compactions. - let max_compactions = max_background_jobs - max_background_flushes; + + // set the default compaction threads differently for v1 and v2: + // v1: cap max_sub_compactions to allow at least two compactions. + // v2: decrease the compaction threads to make the qps more stable. + let max_compactions = match engine_type { + EngineType::RaftKv => max_background_jobs - max_background_flushes, + EngineType::RaftKv2 => (max_background_jobs + 3) / 4, + }; let max_sub_compactions: u32 = (max_compactions - 1).clamp(1, defaults.max_sub_compactions); + max_background_jobs = max_background_flushes + max_compactions; // Maximum background GC threads for Titan let max_titan_background_gc = cmp::min(defaults.max_titan_background_gc, cpu_num); @@ -270,9 +283,12 @@ fn get_background_job_limits_impl( } } -fn get_background_job_limits(defaults: &BackgroundJobLimits) -> BackgroundJobLimits { +fn get_background_job_limits( + engine_type: EngineType, + defaults: &BackgroundJobLimits, +) -> BackgroundJobLimits { let cpu_num = cmp::max(SysQuota::cpu_cores_quota() as u32, 1); - get_background_job_limits_impl(cpu_num, defaults) + get_background_job_limits_impl(engine_type, cpu_num, defaults) } macro_rules! cf_config { @@ -311,6 +327,7 @@ macro_rules! cf_config { #[online_config(skip)] pub compression_per_level: [DBCompressionType; 7], pub write_buffer_size: Option, + pub write_buffer_limit: Option, pub max_write_buffer_number: i32, #[online_config(skip)] pub min_write_buffer_number_to_merge: i32, @@ -643,6 +660,7 @@ macro_rules! build_cf_opt { $cf_name, provider.clone(), $opt.compaction_guard_min_output_file_size.0, + $opt.max_compaction_bytes.0, ) .unwrap(); cf_opts.set_sst_partitioner_factory(factory); @@ -668,6 +686,7 @@ macro_rules! 
build_cf_opt { pub struct CfResources { pub cache: Cache, pub compaction_thread_limiters: HashMap<&'static str, ConcurrentTaskLimiter>, + pub write_buffer_managers: HashMap<&'static str, Arc>, } cf_config!(DefaultCfConfig); @@ -734,6 +753,7 @@ impl Default for DefaultCfConfig { ttl: None, periodic_compaction_seconds: None, titan: TitanCfConfig::default(), + write_buffer_limit: None, } } } @@ -832,6 +852,9 @@ impl DefaultCfConfig { } } cf_opts.set_titan_cf_options(&self.titan.build_opts()); + if let Some(write_buffer_manager) = shared.write_buffer_managers.get(CF_DEFAULT) { + cf_opts.set_write_buffer_manager(write_buffer_manager); + } cf_opts } } @@ -906,6 +929,7 @@ impl Default for WriteCfConfig { ttl: None, periodic_compaction_seconds: None, titan, + write_buffer_limit: None, } } } @@ -962,6 +986,9 @@ impl WriteCfConfig { .unwrap(); } cf_opts.set_titan_cf_options(&self.titan.build_opts()); + if let Some(write_buffer_manager) = shared.write_buffer_managers.get(CF_WRITE) { + cf_opts.set_write_buffer_manager(write_buffer_manager); + } cf_opts } } @@ -1028,6 +1055,7 @@ impl Default for LockCfConfig { ttl: None, periodic_compaction_seconds: None, titan, + write_buffer_limit: None, } } } @@ -1062,6 +1090,9 @@ impl LockCfConfig { .unwrap(); } cf_opts.set_titan_cf_options(&self.titan.build_opts()); + if let Some(write_buffer_manager) = shared.write_buffer_managers.get(CF_LOCK) { + cf_opts.set_write_buffer_manager(write_buffer_manager); + } cf_opts } } @@ -1127,6 +1158,7 @@ impl Default for RaftCfConfig { ttl: None, periodic_compaction_seconds: None, titan, + write_buffer_limit: None, } } } @@ -1235,10 +1267,10 @@ pub struct DbConfig { #[serde(with = "rocks_config::rate_limiter_mode_serde")] #[online_config(skip)] pub rate_limiter_mode: DBRateLimiterMode, - // deprecated. use rate_limiter_auto_tuned. - #[online_config(skip)] + #[online_config(hidden)] #[doc(hidden)] #[serde(skip_serializing)] + #[deprecated = "The configuration has been removed. Use `rate_limiter_auto_tuned` instead"] pub auto_tuned: Option, pub rate_limiter_auto_tuned: bool, pub bytes_per_sync: ReadableSize, @@ -1264,6 +1296,8 @@ pub struct DbConfig { #[doc(hidden)] #[serde(skip_serializing)] pub write_buffer_flush_oldest_first: bool, + #[online_config(skip)] + pub track_and_verify_wals_in_manifest: bool, // Dangerous option only for programming use. 
     #[online_config(skip)]
     #[serde(skip)]
@@ -1290,20 +1324,16 @@ pub struct DbResources {
 }

 impl Default for DbConfig {
+    #[allow(deprecated)]
     fn default() -> DbConfig {
-        let bg_job_limits = get_background_job_limits(&KVDB_DEFAULT_BACKGROUND_JOB_LIMITS);
-        let titan_config = TitanDbConfig {
-            max_background_gc: bg_job_limits.max_titan_background_gc as i32,
-            ..Default::default()
-        };
         DbConfig {
             wal_recovery_mode: DBRecoveryMode::PointInTime,
             wal_dir: "".to_owned(),
             wal_ttl_seconds: 0,
             wal_size_limit: ReadableSize::kb(0),
             max_total_wal_size: None,
-            max_background_jobs: bg_job_limits.max_background_jobs as i32,
-            max_background_flushes: bg_job_limits.max_background_flushes as i32,
+            max_background_jobs: 0,
+            max_background_flushes: 0,
             max_manifest_file_size: ReadableSize::mb(128),
             create_if_missing: true,
             max_open_files: 40960,
@@ -1322,7 +1352,7 @@ impl Default for DbConfig {
             rate_limiter_auto_tuned: true,
             bytes_per_sync: ReadableSize::mb(1),
             wal_bytes_per_sync: ReadableSize::kb(512),
-            max_sub_compactions: bg_job_limits.max_sub_compactions,
+            max_sub_compactions: 0,
             writable_file_max_buffer_size: ReadableSize::mb(1),
             use_direct_io_for_flush_and_compaction: false,
             enable_pipelined_write: false,
@@ -1332,12 +1362,13 @@ impl Default for DbConfig {
             write_buffer_limit: None,
             write_buffer_stall_ratio: 0.0,
             write_buffer_flush_oldest_first: true,
+            track_and_verify_wals_in_manifest: false,
             paranoid_checks: None,
             defaultcf: DefaultCfConfig::default(),
             writecf: WriteCfConfig::default(),
             lockcf: LockCfConfig::default(),
             raftcf: RaftCfConfig::default(),
-            titan: titan_config,
+            titan: TitanDbConfig::default(),
         }
     }
 }
@@ -1385,21 +1416,48 @@ impl DbConfig {
             // strategy is consistent with single RocksDB.
             self.defaultcf.max_compactions.get_or_insert(1);
             self.writecf.max_compactions.get_or_insert(1);
-            if self.lockcf.write_buffer_size.is_none() {
-                self.lockcf.write_buffer_size = Some(ReadableSize::mb(4));
-            }
+            self.lockcf
+                .write_buffer_size
+                .get_or_insert(ReadableSize::mb(32));
+            self.lockcf
+                .write_buffer_limit
+                .get_or_insert(DEFAULT_LOCK_BUFFER_MEMORY_LIMIT);
             }
         }
+        let bg_job_limits = get_background_job_limits(engine, &KVDB_DEFAULT_BACKGROUND_JOB_LIMITS);
+        if self.max_background_jobs == 0 {
+            self.max_background_jobs = bg_job_limits.max_background_jobs as i32;
+        }
+        if self.max_background_flushes == 0 {
+            self.max_background_flushes = bg_job_limits.max_background_flushes as i32;
+        }
+        if self.max_sub_compactions == 0 {
+            self.max_sub_compactions = bg_job_limits.max_sub_compactions;
+        }
+        if self.titan.max_background_gc == 0 {
+            self.titan.max_background_gc = bg_job_limits.max_titan_background_gc as i32;
+        }
     }

-    pub fn build_resources(&self, env: Arc<Env>) -> DbResources {
+    pub fn build_resources(&self, env: Arc<Env>, engine: EngineType) -> DbResources {
         let rate_limiter = if self.rate_bytes_per_sec.0 > 0 {
+            // For raft-kv v2, use a longer window to make the compaction IO smoother.
+            let (tune_per_secs, window_size, recent_size) = match engine {
+                // 1s tune duration; the long-term window is 5m, the short-term window is 30s.
+                // These are the default settings.
+                EngineType::RaftKv => (1, 300, 30),
+                // 5s tune duration; the long-term window is 1h, the short-term window is 5m.
+                EngineType::RaftKv2 => (5, 720, 60),
+            };
             Some(Arc::new(RateLimiter::new_writeampbased_with_auto_tuned(
                 self.rate_bytes_per_sec.0 as i64,
                 (self.rate_limiter_refill_period.as_millis() * 1000) as i64,
                 10, // fairness
                 self.rate_limiter_mode,
                 self.rate_limiter_auto_tuned,
+                tune_per_secs,
+                window_size,
+                recent_size,
             )))
         } else {
             None
@@ -1481,6 +1539,7 @@ impl DbConfig {
             // Historical stats are not used.
             opts.set_stats_persist_period_sec(0);
         }
+        opts.set_track_and_verify_wals_in_manifest(self.track_and_verify_wals_in_manifest);
         opts
     }

@@ -1510,9 +1569,29 @@ impl DbConfig {
                 ConcurrentTaskLimiter::new(CF_RAFT, n),
             );
         }
+        let mut write_buffer_managers = HashMap::default();
+        self.lockcf.write_buffer_limit.map(|limit| {
+            write_buffer_managers.insert(
+                CF_LOCK,
+                Arc::new(WriteBufferManager::new(limit.0 as usize, 0f32, true)),
+            )
+        });
+        self.defaultcf.write_buffer_limit.map(|limit| {
+            write_buffer_managers.insert(
+                CF_DEFAULT,
+                Arc::new(WriteBufferManager::new(limit.0 as usize, 0f32, true)),
+            )
+        });
+        self.writecf.write_buffer_limit.map(|limit| {
+            write_buffer_managers.insert(
+                CF_WRITE,
+                Arc::new(WriteBufferManager::new(limit.0 as usize, 0f32, true)),
+            )
+        });
         CfResources {
             cache,
             compaction_thread_limiters,
+            write_buffer_managers,
         }
     }

@@ -1556,6 +1635,9 @@ impl DbConfig {
         self.writecf.validate()?;
         self.raftcf.validate()?;
         self.titan.validate()?;
+        if self.raftcf.write_buffer_limit.is_some() {
+            return Err("raftcf does not support cf based write buffer manager".into());
+        }
         if self.enable_unordered_write {
             if self.titan.enabled {
                 return Err("RocksDB.unordered_write does not support Titan".into());
@@ -1660,6 +1742,7 @@ impl Default for RaftDefaultCfConfig {
             ttl: None,
             periodic_compaction_seconds: None,
             titan: TitanCfConfig::default(),
+            write_buffer_limit: None,
         }
     }
 }
@@ -1752,7 +1835,9 @@ pub struct RaftDbConfig {

 impl Default for RaftDbConfig {
     fn default() -> RaftDbConfig {
-        let bg_job_limits = get_background_job_limits(&RAFTDB_DEFAULT_BACKGROUND_JOB_LIMITS);
+        // raftdb should only be used for raftkv
+        let bg_job_limits =
+            get_background_job_limits(EngineType::RaftKv, &RAFTDB_DEFAULT_BACKGROUND_JOB_LIMITS);
         let titan_config = TitanDbConfig {
             max_background_gc: bg_job_limits.max_titan_background_gc as i32,
             ..Default::default()
@@ -1903,13 +1988,14 @@ pub enum DbType {
 }

 pub struct DbConfigManger<D> {
+    cfg: DbConfig,
     db: D,
     db_type: DbType,
 }

 impl<D: ConfigurableDb> DbConfigManger<D> {
-    pub fn new(db: D, db_type: DbType) -> Self {
-        DbConfigManger { db, db_type }
+    pub fn new(cfg: DbConfig, db: D, db_type: DbType) -> Self {
+        DbConfigManger { cfg, db, db_type }
     }
 }

@@ -1944,10 +2030,31 @@ impl DbConfigManger {
             _ => Err(format!("invalid cf {:?} for db {:?}", cf, self.db_type).into()),
         }
     }
+
+    fn update_background_cfg(
+        &self,
+        max_background_jobs: i32,
+        max_background_flushes: i32,
+    ) -> Result<(), Box<dyn Error>> {
+        assert!(max_background_jobs > 0 && max_background_flushes > 0);
+        let max_background_compacts =
+            std::cmp::max(max_background_jobs - max_background_flushes, 1);
+        self.db
+            .set_db_config(&[("max_background_jobs", &max_background_jobs.to_string())])?;
+        self.db.set_db_config(&[(
+            "max_background_flushes",
+            &max_background_flushes.to_string(),
+        )])?;
+        self.db.set_db_config(&[(
+            "max_background_compactions",
+            &max_background_compacts.to_string(),
+        )])
+    }
 }

 impl<D: ConfigurableDb + Send + Sync> ConfigManager for DbConfigManger<D> {
     fn dispatch(&mut self, change: ConfigChange) -> Result<(), Box<dyn Error>> {
+        self.cfg.update(change.clone())?;
         let change_str = format!("{:?}", change);
         let mut change: Vec<(String, ConfigValue)> = change.into_iter().collect();
         let cf_config = change.drain_filter(|(name, _)| name.ends_with("cf"));
@@ -1966,6 +2073,15 @@ impl ConfigManager for DbConfigManger {
                 cf_change.insert(name, value);
             }
         }
+        if let Some(f) = cf_change.remove("write_buffer_limit") {
+            if cf_name != CF_LOCK {
+                return Err(
+                    "cf write buffer manager is only supported for lock cf now".into()
+                );
+            }
+            let size: ReadableSize = f.into();
+            self.db.set_cf_flush_size(cf_name, size.0 as usize)?;
+        }
         if !cf_change.is_empty() {
             let cf_change = config_value_to_string(cf_change.into_iter().collect());
             let cf_change_slice = config_to_slice(&cf_change);
@@ -1996,7 +2112,8 @@ impl ConfigManager for DbConfigManger {
             .drain_filter(|(name, _)| name == "write_buffer_limit")
             .next()
         {
-            self.db.set_flush_size(size.1.into())?;
+            let size: ReadableSize = size.1.into();
+            self.db.set_flush_size(size.0 as usize)?;
         }

         if let Some(f) = change
@@ -2011,8 +2128,7 @@ impl ConfigManager for DbConfigManger {
             .next()
         {
             let max_background_jobs: i32 = background_jobs_config.1.into();
-            self.db
-                .set_db_config(&[("max_background_jobs", &max_background_jobs.to_string())])?;
+            self.update_background_cfg(max_background_jobs, self.cfg.max_background_flushes)?;
         }

         if let Some(background_subcompactions_config) = change
@@ -2029,10 +2145,7 @@ impl ConfigManager for DbConfigManger {
             .next()
         {
             let max_background_flushes: i32 = background_flushes_config.1.into();
-            self.db.set_db_config(&[(
-                "max_background_flushes",
-                &max_background_flushes.to_string(),
-            )])?;
+            self.update_background_cfg(self.cfg.max_background_jobs, max_background_flushes)?;
         }

         if !change.is_empty() {
@@ -2762,6 +2875,7 @@ pub struct BackupStreamConfig {
     pub initial_scan_pending_memory_quota: ReadableSize,
     #[online_config(skip)]
     pub initial_scan_rate_limit: ReadableSize,
+    pub initial_scan_concurrency: usize,
 }

 impl BackupStreamConfig {
@@ -2789,6 +2903,9 @@ impl BackupStreamConfig {
             )
             .into());
         }
+        if self.initial_scan_concurrency == 0 {
+            return Err("the `initial_scan_concurrency` shouldn't be zero".into());
+        }
         Ok(())
     }
 }
@@ -2816,6 +2933,7 @@ impl Default for BackupStreamConfig {
             file_size_limit,
             initial_scan_pending_memory_quota: ReadableSize(quota_size as _),
             initial_scan_rate_limit: ReadableSize::mb(60),
+            initial_scan_concurrency: 6,
             temp_file_memory_quota: cache_size,
         }
     }
@@ -2831,7 +2949,11 @@ pub struct CdcConfig {
     #[online_config(skip)]
     pub incremental_scan_threads: usize,
     pub incremental_scan_concurrency: usize,
+    /// Limit scan speed based on disk I/O traffic.
     pub incremental_scan_speed_limit: ReadableSize,
+    /// Limit scan speed based on memory access traffic.
+    #[doc(hidden)]
+    pub incremental_fetch_speed_limit: ReadableSize,
     /// `TsFilter` can increase speed and decrease resource usage when
     /// incremental content is much less than total content. However in
     /// other cases, `TsFilter` can make performance worse because it needs
@@ -2852,13 +2974,15 @@ pub struct CdcConfig {
     pub old_value_cache_memory_quota: ReadableSize,

     // Deprecated! preserved for compatibility check.
-    #[online_config(skip)]
+    #[online_config(hidden)]
     #[doc(hidden)]
     #[serde(skip_serializing)]
+    #[deprecated = "The configuration has been removed."]
     pub old_value_cache_size: usize,
 }

 impl Default for CdcConfig {
+    #[allow(deprecated)]
     fn default() -> Self {
         Self {
             min_ts_interval: ReadableDuration::secs(1),
@@ -2870,6 +2994,7 @@ impl Default for CdcConfig {
             // TiCDC requires a SSD, the typical write speed of SSD
             // is more than 500MB/s, so 128MB/s is enough.
             incremental_scan_speed_limit: ReadableSize::mb(128),
+            incremental_fetch_speed_limit: ReadableSize::mb(512),
             incremental_scan_ts_filter_ratio: 0.2,
             tso_worker_threads: 1,
             // 512MB memory for CDC sink.
@@ -2937,6 +3062,8 @@ pub struct ResolvedTsConfig {
     pub advance_ts_interval: ReadableDuration,
     #[online_config(skip)]
     pub scan_lock_pool_size: usize,
+    pub memory_quota: ReadableSize,
+    pub incremental_scan_concurrency: usize,
 }

 impl ResolvedTsConfig {
@@ -2957,6 +3084,8 @@ impl Default for ResolvedTsConfig {
             enable: true,
             advance_ts_interval: ReadableDuration::secs(20),
             scan_lock_pool_size: 2,
+            memory_quota: ReadableSize::mb(256),
+            incremental_scan_concurrency: 6,
         }
     }
 }
@@ -3093,6 +3222,72 @@ impl ConfigManager for LogConfigManager {
     }
 }

+#[derive(Clone, Serialize, Deserialize, PartialEq, Debug, OnlineConfig)]
+#[serde(default)]
+#[serde(rename_all = "kebab-case")]
+pub struct MemoryConfig {
+    // Whether to enable heap profiling, which may add a small performance
+    // overhead (about 2% at the default sample rate).
+    pub enable_heap_profiling: bool,
+
+    // Average interval between allocation samples, as measured in bytes of allocation activity.
+    // Increasing the sampling interval decreases profile fidelity, but also decreases the
+    // computational overhead.
+    // The default sample interval is 512 KB. Only powers of two are accepted; any other value
+    // will be rounded up to the next power of two.
+    pub profiling_sample_per_bytes: ReadableSize,
+}
+
+impl Default for MemoryConfig {
+    fn default() -> Self {
+        Self {
+            enable_heap_profiling: true,
+            profiling_sample_per_bytes: ReadableSize::kb(512),
+        }
+    }
+}
+
+impl MemoryConfig {
+    pub fn init(&self) {
+        if self.enable_heap_profiling {
+            let mut activate = HEAP_PROFILE_ACTIVE.lock().unwrap();
+            if let Err(e) = tikv_alloc::activate_prof() {
+                error!("failed to enable heap profiling"; "err" => ?e);
+                return;
+            }
+            *activate = Some(None);
+            tikv_alloc::set_prof_sample(self.profiling_sample_per_bytes.0).unwrap();
+        }
+    }
+}
+
+pub struct MemoryConfigManager;
+
+impl ConfigManager for MemoryConfigManager {
+    fn dispatch(&mut self, changes: ConfigChange) -> CfgResult<()> {
+        if let Some(ConfigValue::Bool(enable)) = changes.get("enable_heap_profiling") {
+            if *enable {
+                let mut activate = HEAP_PROFILE_ACTIVE.lock().unwrap();
+                // Already enabled by the HTTP API; do nothing.
+                if activate.is_none() {
+                    tikv_alloc::activate_prof()?;
+                    *activate = Some(None);
+                }
+            } else {
+                let mut activate = HEAP_PROFILE_ACTIVE.lock().unwrap();
+                tikv_alloc::deactivate_prof()?;
+                *activate = None;
+            }
+        }
+
+        if let Some(ConfigValue::Size(sample_rate)) = changes.get("profiling_sample_per_bytes") {
+            tikv_alloc::set_prof_sample(*sample_rate).unwrap();
+        }
+        info!("update memory config"; "config" => ?changes);
+        Ok(())
+    }
+}
+
 #[derive(Clone, Serialize, Deserialize, PartialEq, Debug, OnlineConfig)]
 #[serde(default)]
 #[serde(rename_all = "kebab-case")]
@@ -3143,21 +3338,29 @@ pub struct TikvConfig {
     #[online_config(hidden)]
     pub cfg_path: String,

-    // Deprecated! These configuration has been moved to LogConfig.
- // They are preserved for compatibility check. #[doc(hidden)] - #[online_config(skip)] + #[online_config(hidden)] + #[serde(skip_serializing)] + #[deprecated = "The configuration has been moved to log.level."] pub log_level: LogLevel, #[doc(hidden)] - #[online_config(skip)] + #[online_config(hidden)] + #[serde(skip_serializing)] + #[deprecated = "The configuration has been moved to log.file.filename."] pub log_file: String, #[doc(hidden)] - #[online_config(skip)] + #[online_config(hidden)] + #[serde(skip_serializing)] + #[deprecated = "The configuration has been moved to log.format."] pub log_format: LogFormat, - #[online_config(skip)] + #[online_config(hidden)] + #[serde(skip_serializing)] + #[deprecated = "The configuration has been moved to log.file.max_days."] pub log_rotation_timespan: ReadableDuration, #[doc(hidden)] - #[online_config(skip)] + #[online_config(hidden)] + #[serde(skip_serializing)] + #[deprecated = "The configuration has been moved to log.file.max_size."] pub log_rotation_size: ReadableSize, #[online_config(skip)] @@ -3188,6 +3391,9 @@ pub struct TikvConfig { #[online_config(submodule)] pub log: LogConfig, + #[online_config(submodule)] + pub memory: MemoryConfig, + #[online_config(submodule)] pub quota: QuotaConfig, @@ -3265,6 +3471,7 @@ pub struct TikvConfig { } impl Default for TikvConfig { + #[allow(deprecated)] fn default() -> TikvConfig { TikvConfig { cfg_path: "".to_owned(), @@ -3281,6 +3488,7 @@ impl Default for TikvConfig { memory_usage_limit: None, memory_usage_high_water: 0.9, log: LogConfig::default(), + memory: MemoryConfig::default(), quota: QuotaConfig::default(), readpool: ReadPoolConfig::default(), server: ServerConfig::default(), @@ -3633,7 +3841,8 @@ impl TikvConfig { self.raft_engine.validate()?; self.server.validate()?; self.pd.validate()?; - self.coprocessor.validate()?; + self.coprocessor + .validate(self.storage.engine == EngineType::RaftKv2)?; self.raft_store.validate( self.coprocessor.region_split_size(), self.coprocessor.enable_region_bucket(), @@ -3653,11 +3862,17 @@ impl TikvConfig { self.quota.validate()?; self.causal_ts.validate()?; + // Validate feature TTL with Titan configuration. + if self.rocksdb.titan.enabled && self.storage.enable_ttl { + return Err("Titan is unavailable for feature TTL".to_string().into()); + } + Ok(()) } // As the init of `logger` is very early, this adjust needs to be separated and // called immediately after parsing the command line. + #[allow(deprecated)] pub fn logger_compatible_adjust(&mut self) { let default_tikv_cfg = TikvConfig::default(); let default_log_cfg = LogConfig::default(); @@ -3709,6 +3924,7 @@ impl TikvConfig { } } + #[allow(deprecated)] pub fn compatible_adjust(&mut self) { let default_raft_store = RaftstoreConfig::default(); let default_coprocessor = CopConfig::default(); @@ -4316,6 +4532,7 @@ pub enum Module { BackupStream, Quota, Log, + Memory, Unknown(String), } @@ -4344,6 +4561,7 @@ impl From<&str> for Module { "resource_metering" => Module::ResourceMetering, "quota" => Module::Quota, "log" => Module::Log, + "memory" => Module::Memory, n => Module::Unknown(n.to_owned()), } } @@ -4483,7 +4701,10 @@ impl ConfigController { #[cfg(test)] mod tests { - use std::{sync::Arc, time::Duration}; + use std::{ + sync::{mpsc::channel, Arc}, + time::Duration, + }; use api_version::{ApiV1, KvFormat}; use case_macros::*; @@ -4607,6 +4828,7 @@ mod tests { // Check api version. 
{ + tikv_cfg.rocksdb.titan.enabled = false; let cases = [ (ApiVersion::V1, ApiVersion::V1, true), (ApiVersion::V1, ApiVersion::V1ttl, false), @@ -4647,7 +4869,7 @@ mod tests { assert_eq!(last_cfg_metadata.modified().unwrap(), first_modified); // write to file when config is the inequivalent of last one. - cfg.log_level = slog::Level::Warning.into(); + cfg.log.level = slog::Level::Warning.into(); persist_config(&cfg).unwrap(); last_cfg_metadata = last_cfg_path.metadata().unwrap(); assert_ne!(last_cfg_metadata.modified().unwrap(), first_modified); @@ -4763,7 +4985,9 @@ mod tests { fn test_rocks_rate_limit_zero() { let mut tikv_cfg = TikvConfig::default(); tikv_cfg.rocksdb.rate_bytes_per_sec = ReadableSize(0); - let resource = tikv_cfg.rocksdb.build_resources(Arc::new(Env::default())); + let resource = tikv_cfg + .rocksdb + .build_resources(Arc::new(Env::default()), tikv_cfg.storage.engine); tikv_cfg .rocksdb .build_opt(&resource, tikv_cfg.storage.engine); @@ -4927,7 +5151,9 @@ mod tests { Arc, ) { assert_eq!(F::TAG, cfg.storage.api_version()); - let resource = cfg.rocksdb.build_resources(Arc::default()); + let resource = cfg + .rocksdb + .build_resources(Arc::default(), cfg.storage.engine); let engine = RocksDBEngine::new( &cfg.storage.data_dir, Some(cfg.rocksdb.build_opt(&resource, cfg.storage.engine)), @@ -4958,7 +5184,11 @@ mod tests { let cfg_controller = ConfigController::new(cfg); cfg_controller.register( Module::Rocksdb, - Box::new(DbConfigManger::new(engine.clone(), DbType::Kv)), + Box::new(DbConfigManger::new( + cfg_controller.get_current().rocksdb, + engine.clone(), + DbType::Kv, + )), ); let (scheduler, receiver) = dummy_scheduler(); cfg_controller.register( @@ -5022,22 +5252,27 @@ mod tests { assert_eq!(flow_controller.enabled(), true); } - #[test] - fn test_change_resolved_ts_config() { - use crossbeam::channel; + struct MockCfgManager(Box); - pub struct TestConfigManager(channel::Sender); - impl ConfigManager for TestConfigManager { - fn dispatch(&mut self, change: ConfigChange) -> online_config::Result<()> { - self.0.send(change).unwrap(); - Ok(()) - } + impl ConfigManager for MockCfgManager { + fn dispatch(&mut self, change: ConfigChange) -> online_config::Result<()> { + (self.0)(change); + Ok(()) } + } + #[test] + fn test_change_resolved_ts_config() { let (cfg, _dir) = TikvConfig::with_tmp().unwrap(); let cfg_controller = ConfigController::new(cfg); - let (tx, rx) = channel::unbounded(); - cfg_controller.register(Module::ResolvedTs, Box::new(TestConfigManager(tx))); + let (tx, rx) = channel(); + let tx = std::sync::Mutex::new(tx); + cfg_controller.register( + Module::ResolvedTs, + Box::new(MockCfgManager(Box::new(move |c| { + tx.lock().unwrap().send(c).unwrap(); + }))), + ); // Return error if try to update not support config or unknow config cfg_controller @@ -5097,6 +5332,7 @@ mod tests { cfg.rocksdb.defaultcf.block_cache_size = Some(ReadableSize::mb(8)); cfg.rocksdb.rate_bytes_per_sec = ReadableSize::mb(64); cfg.rocksdb.rate_limiter_auto_tuned = false; + cfg.rocksdb.lockcf.write_buffer_limit = Some(ReadableSize::mb(1)); cfg.validate().unwrap(); let (storage, cfg_controller, ..) 
= new_engines::(cfg); let db = storage.get_engine().get_rocksdb(); @@ -5108,6 +5344,7 @@ mod tests { .update_config("rocksdb.max-background-jobs", "8") .unwrap(); assert_eq!(db.get_db_options().get_max_background_jobs(), 8); + assert_eq!(db.get_db_options().get_max_background_compactions(), 6); // update max_background_flushes, set to a bigger value assert_eq!(db.get_db_options().get_max_background_flushes(), 2); @@ -5116,6 +5353,7 @@ mod tests { .update_config("rocksdb.max-background-flushes", "5") .unwrap(); assert_eq!(db.get_db_options().get_max_background_flushes(), 5); + assert_eq!(db.get_db_options().get_max_background_compactions(), 3); // update rate_bytes_per_sec assert_eq!( @@ -5131,6 +5369,40 @@ mod tests { ReadableSize::mb(128).0 as i64 ); + cfg_controller + .update_config("rocksdb.write-buffer-limit", "10MB") + .unwrap(); + let flush_size = db.get_db_options().get_flush_size().unwrap(); + assert_eq!(flush_size, ReadableSize::mb(10).0); + + cfg_controller + .update_config("rocksdb.lockcf.write-buffer-limit", "22MB") + .unwrap(); + let cf_opt = db.get_options_cf("lock").unwrap(); + let flush_size = cf_opt.get_flush_size().unwrap(); + assert_eq!(flush_size, ReadableSize::mb(22).0); + + cfg_controller + .update_config("rocksdb.lockcf.write-buffer-size", "102MB") + .unwrap(); + let cf_opt = db.get_options_cf("lock").unwrap(); + let bsize = cf_opt.get_write_buffer_size(); + assert_eq!(bsize, ReadableSize::mb(102).0); + + cfg_controller + .update_config("rocksdb.writecf.write-buffer-size", "102MB") + .unwrap(); + let cf_opt = db.get_options_cf("write").unwrap(); + let bsize = cf_opt.get_write_buffer_size(); + assert_eq!(bsize, ReadableSize::mb(102).0); + + cfg_controller + .update_config("rocksdb.defaultcf.write-buffer-size", "102MB") + .unwrap(); + let cf_opt = db.get_options_cf("default").unwrap(); + let bsize = cf_opt.get_write_buffer_size(); + assert_eq!(bsize, ReadableSize::mb(102).0); + // update some configs on default cf let cf_opts = db.get_options_cf(CF_DEFAULT).unwrap(); assert_eq!(cf_opts.get_disable_auto_compactions(), false); @@ -5200,7 +5472,7 @@ mod tests { } #[test] - fn test_change_logconfig() { + fn test_change_log_config() { let (cfg, _dir) = TikvConfig::with_tmp().unwrap(); let cfg_controller = ConfigController::new(cfg); @@ -5222,6 +5494,37 @@ mod tests { ); } + #[test] + #[cfg(feature = "mem-profiling")] + fn test_change_memory_config() { + let (cfg, _dir) = TikvConfig::with_tmp().unwrap(); + let cfg_controller = ConfigController::new(cfg); + + cfg_controller.register(Module::Memory, Box::new(MemoryConfigManager)); + cfg_controller + .update_config("memory.enable_heap_profiling", "false") + .unwrap(); + assert_eq!(tikv_alloc::is_profiling_active(), false); + cfg_controller + .update_config("memory.enable_heap_profiling", "true") + .unwrap(); + assert_eq!(tikv_alloc::is_profiling_active(), true); + + cfg_controller + .update_config("memory.profiling_sample_per_bytes", "1MB") + .unwrap(); + assert_eq!( + cfg_controller + .get_current() + .memory + .profiling_sample_per_bytes, + ReadableSize::mb(1), + ); + cfg_controller + .update_config("memory.profiling_sample_per_bytes", "invalid") + .unwrap_err(); + } + #[test] fn test_dispatch_titan_blob_run_mode_config() { let mut cfg = TikvConfig::default(); @@ -5239,7 +5542,28 @@ mod tests { let diff = config_value_to_string(diff.into_iter().collect()); assert_eq!(diff.len(), 1); assert_eq!(diff[0].0.as_str(), "blob_run_mode"); - assert_eq!(diff[0].1.as_str(), "fallback"); + assert_eq!(diff[0].1.as_str(), "kFallback"); + } 
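// [Editor's aside, not part of the diff] A standalone sketch of the arithmetic that the
// `update_background_cfg` helper introduced above appears to apply, and which the two
// `get_max_background_compactions()` assertions above exercise: compaction threads are
// whatever background jobs remain after flushes, floored at 1. The free-function name
// below is hypothetical; the semantics are taken from the hunks above.
fn background_compactions(max_background_jobs: i32, max_background_flushes: i32) -> i32 {
    std::cmp::max(max_background_jobs - max_background_flushes, 1)
}

#[test]
fn sketch_background_compaction_arithmetic() {
    // max-background-jobs = 8 with the default 2 flushes -> 6 compactions,
    // matching the `get_max_background_compactions(), 6` assertion above.
    assert_eq!(background_compactions(8, 2), 6);
    // Raising flushes to 5 leaves 3 compactions, matching the second assertion.
    assert_eq!(background_compactions(8, 5), 3);
}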
+ + #[test] + fn test_update_titan_blob_run_mode_config() { + let mut cfg = TikvConfig::default(); + cfg.rocksdb.titan.enabled = true; + let (_, cfg_controller, ..) = new_engines::(cfg); + for run_mode in [ + "kFallback", + "kNormal", + "kReadOnly", + "fallback", + "normal", + "read-only", + ] { + let change = HashMap::from([( + "rocksdb.defaultcf.titan.blob-run-mode".to_string(), + run_mode.to_string(), + )]); + cfg_controller.update_without_persist(change).unwrap(); + } } #[test] @@ -5416,12 +5740,14 @@ mod tests { let cfg_controller = ConfigController::new(cfg.clone()); let (scheduler, _receiver) = dummy_scheduler(); let version_tracker = Arc::new(VersionTrack::new(cfg.server.clone())); + let cop_manager = MockCfgManager(Box::new(|_| {})); cfg_controller.register( Module::Server, Box::new(ServerConfigManager::new( scheduler, version_tracker.clone(), ResourceQuota::new(None), + Box::new(cop_manager), )), ); @@ -5473,6 +5799,40 @@ mod tests { ); } + #[test] + fn test_change_coprocessor_endpoint_config() { + let (mut cfg, _dir) = TikvConfig::with_tmp().unwrap(); + cfg.validate().unwrap(); + let cfg_controller = ConfigController::new(cfg.clone()); + let (scheduler, _receiver) = dummy_scheduler(); + let version_tracker = Arc::new(VersionTrack::new(cfg.server.clone())); + + let (cop_tx, cop_rx) = channel(); + let cop_tx = std::sync::Mutex::new(cop_tx); + let cop_manager = MockCfgManager(Box::new(move |c| { + cop_tx.lock().unwrap().send(c).unwrap(); + })); + cfg_controller.register( + Module::Server, + Box::new(ServerConfigManager::new( + scheduler, + version_tracker, + ResourceQuota::new(None), + Box::new(cop_manager), + )), + ); + + cfg_controller + .update_config("server.end-point-memory-quota", "32MB") + .unwrap(); + let mut change = cop_rx.try_recv().unwrap(); + let quota = change.remove("end_point_memory_quota").unwrap(); + let cap: ReadableSize = quota.into(); + assert_eq!(cap, ReadableSize::mb(32)); + cfg.server.end_point_memory_quota = ReadableSize::mb(32); + assert_eq_debug(&cfg_controller.get_current(), &cfg); + } + #[test] fn test_compatible_adjust_validate_equal() { // After calling many time of `compatible_adjust` and `validate` should has @@ -5657,6 +6017,21 @@ mod tests { cfg.validate().unwrap_err(); cfg.rocksdb.writecf.format_version = Some(5); cfg.validate().unwrap(); + + let mut valid_cfg = TikvConfig::default(); + valid_cfg.storage.api_version = 2; + valid_cfg.storage.enable_ttl = true; + valid_cfg.rocksdb.titan.enabled = false; + valid_cfg.validate().unwrap(); + + let mut invalid_cfg = TikvConfig::default(); + invalid_cfg.storage.api_version = 2; + invalid_cfg.storage.enable_ttl = true; + invalid_cfg.rocksdb.titan.enabled = true; + assert_eq!( + invalid_cfg.validate().unwrap_err().to_string(), + "Titan is unavailable for feature TTL" + ); } #[test] @@ -5775,61 +6150,95 @@ mod tests { #[test] fn test_background_job_limits() { - // cpu num = 1 + for engine in [EngineType::RaftKv, EngineType::RaftKv2] { + // cpu num = 1 + assert_eq!( + get_background_job_limits_impl( + engine, + 1, // cpu_num + &KVDB_DEFAULT_BACKGROUND_JOB_LIMITS + ), + BackgroundJobLimits { + max_background_jobs: 2, + max_background_flushes: 1, + max_sub_compactions: 1, + max_titan_background_gc: 1, + } + ); + assert_eq!( + get_background_job_limits_impl( + engine, + 1, // cpu_num + &RAFTDB_DEFAULT_BACKGROUND_JOB_LIMITS + ), + BackgroundJobLimits { + max_background_jobs: 2, + max_background_flushes: 1, + max_sub_compactions: 1, + max_titan_background_gc: 1, + } + ); + // cpu num = 2 + assert_eq!( + 
get_background_job_limits_impl( + EngineType::RaftKv, + 2, // cpu_num + &KVDB_DEFAULT_BACKGROUND_JOB_LIMITS + ), + BackgroundJobLimits { + max_background_jobs: 2, + max_background_flushes: 1, + max_sub_compactions: 1, + max_titan_background_gc: 2, + } + ); + assert_eq!( + get_background_job_limits_impl( + EngineType::RaftKv, + 2, // cpu_num + &RAFTDB_DEFAULT_BACKGROUND_JOB_LIMITS + ), + BackgroundJobLimits { + max_background_jobs: 2, + max_background_flushes: 1, + max_sub_compactions: 1, + max_titan_background_gc: 2, + } + ); + } + + // cpu num = 4 assert_eq!( get_background_job_limits_impl( - 1, // cpu_num + EngineType::RaftKv, + 4, // cpu_num &KVDB_DEFAULT_BACKGROUND_JOB_LIMITS ), BackgroundJobLimits { - max_background_jobs: 2, - max_background_flushes: 1, - max_sub_compactions: 1, - max_titan_background_gc: 1, - } - ); - assert_eq!( - get_background_job_limits_impl( - 1, // cpu_num - &RAFTDB_DEFAULT_BACKGROUND_JOB_LIMITS - ), - BackgroundJobLimits { - max_background_jobs: 2, + max_background_jobs: 3, max_background_flushes: 1, max_sub_compactions: 1, - max_titan_background_gc: 1, + max_titan_background_gc: 4, } ); - // cpu num = 2 assert_eq!( get_background_job_limits_impl( - 2, // cpu_num + EngineType::RaftKv2, + 4, // cpu_num &KVDB_DEFAULT_BACKGROUND_JOB_LIMITS ), BackgroundJobLimits { max_background_jobs: 2, max_background_flushes: 1, max_sub_compactions: 1, - max_titan_background_gc: 2, - } - ); - assert_eq!( - get_background_job_limits_impl( - 2, // cpu_num - &RAFTDB_DEFAULT_BACKGROUND_JOB_LIMITS - ), - BackgroundJobLimits { - max_background_jobs: 2, - max_background_flushes: 1, - max_sub_compactions: 1, - max_titan_background_gc: 2, + max_titan_background_gc: 4, } ); - // cpu num = 4 assert_eq!( get_background_job_limits_impl( + EngineType::RaftKv, 4, // cpu_num - &KVDB_DEFAULT_BACKGROUND_JOB_LIMITS + &RAFTDB_DEFAULT_BACKGROUND_JOB_LIMITS ), BackgroundJobLimits { max_background_jobs: 3, @@ -5838,33 +6247,36 @@ mod tests { max_titan_background_gc: 4, } ); + // cpu num = 8 assert_eq!( get_background_job_limits_impl( - 4, // cpu_num - &RAFTDB_DEFAULT_BACKGROUND_JOB_LIMITS + EngineType::RaftKv, + 8, // cpu_num + &KVDB_DEFAULT_BACKGROUND_JOB_LIMITS ), BackgroundJobLimits { - max_background_jobs: 3, - max_background_flushes: 1, - max_sub_compactions: 1, + max_background_jobs: 7, + max_background_flushes: 2, + max_sub_compactions: 3, max_titan_background_gc: 4, } ); - // cpu num = 8 assert_eq!( get_background_job_limits_impl( + EngineType::RaftKv2, 8, // cpu_num &KVDB_DEFAULT_BACKGROUND_JOB_LIMITS ), BackgroundJobLimits { - max_background_jobs: 7, + max_background_jobs: 4, max_background_flushes: 2, - max_sub_compactions: 3, + max_sub_compactions: 1, max_titan_background_gc: 4, } ); assert_eq!( get_background_job_limits_impl( + EngineType::RaftKv, 8, // cpu_num &RAFTDB_DEFAULT_BACKGROUND_JOB_LIMITS ), @@ -5873,6 +6285,7 @@ mod tests { // cpu num = 16 assert_eq!( get_background_job_limits_impl( + EngineType::RaftKv, 16, // cpu_num &KVDB_DEFAULT_BACKGROUND_JOB_LIMITS ), @@ -5880,6 +6293,20 @@ mod tests { ); assert_eq!( get_background_job_limits_impl( + EngineType::RaftKv2, + 16, // cpu_num + &KVDB_DEFAULT_BACKGROUND_JOB_LIMITS + ), + BackgroundJobLimits { + max_background_jobs: 6, + max_background_flushes: 3, + max_sub_compactions: 2, + max_titan_background_gc: 4, + } + ); + assert_eq!( + get_background_job_limits_impl( + EngineType::RaftKv, 16, // cpu_num &RAFTDB_DEFAULT_BACKGROUND_JOB_LIMITS ), @@ -5966,6 +6393,9 @@ mod tests { default_cfg .server 
.optimize_for(default_cfg.coprocessor.region_split_size()); + default_cfg + .raft_store + .optimize_for(default_cfg.storage.engine == EngineType::RaftKv2); default_cfg.security.redact_info_log = Some(false); default_cfg.coprocessor.region_max_size = Some(default_cfg.coprocessor.region_max_size()); default_cfg.coprocessor.region_max_keys = Some(default_cfg.coprocessor.region_max_keys()); @@ -6067,12 +6497,12 @@ mod tests { assert_eq!(default_cfg.coprocessor.region_split_size(), SPLIT_SIZE); assert!(!default_cfg.coprocessor.enable_region_bucket()); - assert_eq!(default_cfg.split.qps_threshold, DEFAULT_QPS_THRESHOLD); + assert_eq!(default_cfg.split.qps_threshold(), DEFAULT_QPS_THRESHOLD); assert_eq!( - default_cfg.split.region_cpu_overload_threshold_ratio, + default_cfg.split.region_cpu_overload_threshold_ratio(), REGION_CPU_OVERLOAD_THRESHOLD_RATIO ); - assert_eq!(default_cfg.split.byte_threshold, DEFAULT_BYTE_THRESHOLD); + assert_eq!(default_cfg.split.byte_threshold(), DEFAULT_BYTE_THRESHOLD); let mut default_cfg = TikvConfig::default(); default_cfg.storage.engine = EngineType::RaftKv2; @@ -6082,15 +6512,15 @@ mod tests { RAFTSTORE_V2_SPLIT_SIZE ); assert_eq!( - default_cfg.split.qps_threshold, + default_cfg.split.qps_threshold(), DEFAULT_BIG_REGION_QPS_THRESHOLD ); assert_eq!( - default_cfg.split.region_cpu_overload_threshold_ratio, + default_cfg.split.region_cpu_overload_threshold_ratio(), BIG_REGION_CPU_OVERLOAD_THRESHOLD_RATIO ); assert_eq!( - default_cfg.split.byte_threshold, + default_cfg.split.byte_threshold(), DEFAULT_BIG_REGION_BYTE_THRESHOLD ); assert!(default_cfg.coprocessor.enable_region_bucket()); @@ -6098,21 +6528,25 @@ mod tests { let mut default_cfg = TikvConfig::default(); default_cfg.coprocessor.region_split_size = Some(ReadableSize::mb(500)); default_cfg.coprocessor.optimize_for(false); - default_cfg.coprocessor.validate().unwrap(); + default_cfg.coprocessor.validate(false).unwrap(); assert_eq!( default_cfg.coprocessor.region_split_size(), ReadableSize::mb(500) ); + assert!(!default_cfg.coprocessor.enable_region_bucket()); + default_cfg.coprocessor.validate(true).unwrap(); assert!(default_cfg.coprocessor.enable_region_bucket()); let mut default_cfg = TikvConfig::default(); default_cfg.coprocessor.region_split_size = Some(ReadableSize::mb(500)); default_cfg.coprocessor.optimize_for(true); - default_cfg.coprocessor.validate().unwrap(); + default_cfg.coprocessor.validate(false).unwrap(); assert_eq!( default_cfg.coprocessor.region_split_size(), ReadableSize::mb(500) ); + assert!(!default_cfg.coprocessor.enable_region_bucket()); + default_cfg.coprocessor.validate(true).unwrap(); assert!(default_cfg.coprocessor.enable_region_bucket()); } @@ -6425,4 +6859,67 @@ mod tests { Some(ReadableSize::gb(1)) ); } + + #[test] + fn test_compact_check_default() { + let content = r#" + [raftstore] + region-compact-check-step = 50 + "#; + let mut cfg: TikvConfig = toml::from_str(content).unwrap(); + cfg.validate().unwrap(); + assert_eq!(cfg.raft_store.region_compact_check_step.unwrap(), 50); + assert_eq!( + cfg.raft_store + .region_compact_redundant_rows_percent + .unwrap(), + 20 + ); + + let content = r#" + [raftstore] + region-compact-check-step = 50 + [storage] + engine = "partitioned-raft-kv" + "#; + let mut cfg: TikvConfig = toml::from_str(content).unwrap(); + cfg.validate().unwrap(); + assert_eq!(cfg.raft_store.region_compact_check_step.unwrap(), 50); + assert_eq!( + cfg.raft_store + .region_compact_redundant_rows_percent + .unwrap(), + 20 + ); + + let content = r#" + [raftstore] + 
region-compact-redundant-rows-percent = 50 + "#; + let mut cfg: TikvConfig = toml::from_str(content).unwrap(); + cfg.validate().unwrap(); + assert_eq!(cfg.raft_store.region_compact_check_step.unwrap(), 100); + assert_eq!( + cfg.raft_store + .region_compact_redundant_rows_percent + .unwrap(), + 50 + ); + + let content = r#" + [raftstore] + region-compact-redundant-rows-percent = 50 + [storage] + engine = "partitioned-raft-kv" + "#; + let mut cfg: TikvConfig = toml::from_str(content).unwrap(); + cfg.validate().unwrap(); + assert_eq!(cfg.raft_store.region_compact_check_step.unwrap(), 5); + assert_eq!( + cfg.raft_store + .region_compact_redundant_rows_percent + .unwrap(), + 50 + ); + } } diff --git a/src/coprocessor/config_manager.rs b/src/coprocessor/config_manager.rs new file mode 100644 index 00000000000..4afd030a692 --- /dev/null +++ b/src/coprocessor/config_manager.rs @@ -0,0 +1,30 @@ +// Copyright 2024 TiKV Project Authors. Licensed under Apache-2.0. + +//! Coprocessor online config manager. + +use std::sync::Arc; + +use online_config::{ConfigChange, ConfigManager, ConfigValue, Result as CfgResult}; +use tikv_util::{config::ReadableSize, memory::MemoryQuota}; + +pub(super) struct CopConfigManager { + memory_quota: Arc, +} + +impl CopConfigManager { + pub fn new(memory_quota: Arc) -> Self { + Self { memory_quota } + } +} + +impl ConfigManager for CopConfigManager { + fn dispatch(&mut self, mut change: ConfigChange) -> CfgResult<()> { + if let Some(quota) = change.remove("end_point_memory_quota") { + if quota != ConfigValue::None { + let cap: ReadableSize = quota.into(); + self.memory_quota.set_capacity(cap.0 as _); + } + } + Ok(()) + } +} diff --git a/src/coprocessor/dag/mod.rs b/src/coprocessor/dag/mod.rs index 31a6df181d5..bd077c5c0ba 100644 --- a/src/coprocessor/dag/mod.rs +++ b/src/coprocessor/dag/mod.rs @@ -143,7 +143,9 @@ fn handle_qe_response( can_be_cached: bool, data_version: Option, ) -> Result { - use tidb_query_common::error::ErrorInner; + use tidb_query_common::error::{ErrorInner, EvaluateError}; + + use crate::coprocessor::Error; match result { Ok((sel_resp, range)) => { @@ -162,6 +164,7 @@ fn handle_qe_response( } Err(err) => match *err.0 { ErrorInner::Storage(err) => Err(err.into()), + ErrorInner::Evaluate(EvaluateError::DeadlineExceeded) => Err(Error::DeadlineExceeded), ErrorInner::Evaluate(err) => { let mut resp = Response::default(); let mut sel_resp = SelectResponse::default(); @@ -179,7 +182,9 @@ fn handle_qe_response( fn handle_qe_stream_response( result: tidb_query_common::Result<(Option<(StreamResponse, IntervalRange)>, bool)>, ) -> Result<(Option, bool)> { - use tidb_query_common::error::ErrorInner; + use tidb_query_common::error::{ErrorInner, EvaluateError}; + + use crate::coprocessor::Error; match result { Ok((Some((s_resp, range)), finished)) => { @@ -192,6 +197,7 @@ fn handle_qe_stream_response( Ok((None, finished)) => Ok((None, finished)), Err(err) => match *err.0 { ErrorInner::Storage(err) => Err(err.into()), + ErrorInner::Evaluate(EvaluateError::DeadlineExceeded) => Err(Error::DeadlineExceeded), ErrorInner::Evaluate(err) => { let mut resp = Response::default(); let mut s_resp = StreamResponse::default(); @@ -203,3 +209,43 @@ fn handle_qe_stream_response( }, } } + +#[cfg(test)] +mod tests { + use anyhow::anyhow; + use protobuf::Message; + use tidb_query_common::error::{Error as CommonError, EvaluateError, StorageError}; + + use super::*; + use crate::coprocessor::Error; + + #[test] + fn test_handle_qe_response() { + // Ok Response + let ok_res = 
Ok((SelectResponse::default(), None));
+        let res = handle_qe_response(ok_res, true, Some(1)).unwrap();
+        assert!(res.can_be_cached);
+        assert_eq!(res.get_cache_last_version(), 1);
+        let mut select_res = SelectResponse::new();
+        Message::merge_from_bytes(&mut select_res, res.get_data()).unwrap();
+        assert!(!select_res.has_error());
+
+        // Storage Error
+        let storage_err = CommonError::from(StorageError(anyhow!("unknown")));
+        let res = handle_qe_response(Err(storage_err), false, None);
+        assert!(matches!(res, Err(Error::Other(_))));
+
+        // Evaluate Error
+        let err = CommonError::from(EvaluateError::DeadlineExceeded);
+        let res = handle_qe_response(Err(err), false, None);
+        assert!(matches!(res, Err(Error::DeadlineExceeded)));
+
+        let err = CommonError::from(EvaluateError::InvalidCharacterString {
+            charset: "test".into(),
+        });
+        let res = handle_qe_response(Err(err), false, None).unwrap();
+        let mut select_res = SelectResponse::new();
+        Message::merge_from_bytes(&mut select_res, res.get_data()).unwrap();
+        assert_eq!(select_res.get_error().get_code(), 1300);
+    }
+}
diff --git a/src/coprocessor/endpoint.rs b/src/coprocessor/endpoint.rs
index 7a12c7493e5..f9a1e28dd19 100644
--- a/src/coprocessor/endpoint.rs
+++ b/src/coprocessor/endpoint.rs
@@ -1,7 +1,8 @@
 // Copyright 2018 TiKV Project Authors. Licensed under Apache-2.0.

 use std::{
-    borrow::Cow, future::Future, iter::FromIterator, marker::PhantomData, sync::Arc, time::Duration,
+    borrow::Cow, future::Future, iter::FromIterator, marker::PhantomData, mem, sync::Arc,
+    time::Duration,
 };

 use ::tracker::{
@@ -11,19 +12,30 @@ use api_version::{dispatch_api_version, KvFormat};
 use async_stream::try_stream;
 use concurrency_manager::ConcurrencyManager;
 use engine_traits::PerfLevel;
-use futures::{channel::mpsc, future::Either, prelude::*};
-use kvproto::{coprocessor as coppb, errorpb, kvrpcpb};
+use futures::{
+    channel::{mpsc, oneshot},
+    future::Either,
+    prelude::*,
+};
+use kvproto::{coprocessor as coppb, errorpb, kvrpcpb, kvrpcpb::CommandPri};
+use online_config::ConfigManager;
 use protobuf::{CodedInputStream, Message};
-use resource_control::{ResourceGroupManager, TaskMetadata};
+use resource_control::{ResourceGroupManager, ResourceLimiter, TaskMetadata};
 use resource_metering::{FutureExt, ResourceTagFactory, StreamExt};
 use tidb_query_common::execute_stats::ExecSummary;
 use tikv_alloc::trace::MemoryTraceGuard;
 use tikv_kv::SnapshotExt;
-use tikv_util::{quota_limiter::QuotaLimiter, time::Instant};
+use tikv_util::{
+    deadline::set_deadline_exceeded_busy_error,
+    memory::{MemoryQuota, OwnedAllocated},
+    quota_limiter::QuotaLimiter,
+    time::Instant,
+};
 use tipb::{AnalyzeReq, AnalyzeType, ChecksumRequest, ChecksumScanOn, DagRequest, ExecType};
 use tokio::sync::Semaphore;
 use txn_types::Lock;

+use super::config_manager::CopConfigManager;
 use crate::{
     coprocessor::{cache::CachedRequestHandler, interceptors::*, metrics::*, tracker::Tracker, *},
     read_pool::ReadPoolHandle,
@@ -49,6 +61,8 @@ pub struct Endpoint {
     /// The concurrency limiter of the coprocessor.
     semaphore: Option<Arc<Semaphore>>,

+    /// The memory quota for coprocessor requests.
+    memory_quota: Arc<MemoryQuota>,

     concurrency_manager: ConcurrencyManager,

@@ -86,18 +100,18 @@ impl Endpoint {
         quota_limiter: Arc<QuotaLimiter>,
         resource_ctl: Option<Arc<ResourceGroupManager>>,
     ) -> Self {
-        // FIXME: When yatp is used, we need to limit coprocessor requests in progress
-        // to avoid using too much memory. However, if there are a number of large
-        // requests, small requests will still be blocked. This needs to be improved.
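// [Editor's aside, not part of the diff] The hunks below replace the removed FIXME with a
// per-endpoint memory quota: an estimate of each request's footprint is charged against
// the quota before the task is spawned and released when the future completes, so an
// oversized backlog is rejected up front instead of queued without bound. A minimal
// self-contained model of that accounting follows; the type and method names here are
// illustrative only and do not reproduce the exact
// `tikv_util::memory::MemoryQuota`/`OwnedAllocated` APIs.
use std::sync::atomic::{AtomicUsize, Ordering};

struct QuotaSketch {
    in_use: AtomicUsize,
    capacity: usize,
}

impl QuotaSketch {
    // Charge `bytes`, rolling back and failing if the capacity would be exceeded.
    fn alloc(&self, bytes: usize) -> Result<(), ()> {
        let old = self.in_use.fetch_add(bytes, Ordering::Relaxed);
        if old + bytes > self.capacity {
            self.in_use.fetch_sub(bytes, Ordering::Relaxed);
            return Err(());
        }
        Ok(())
    }

    // Release a previous charge once the request has finished.
    fn free(&self, bytes: usize) {
        self.in_use.fetch_sub(bytes, Ordering::Relaxed);
    }
}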
let semaphore = match &read_pool { ReadPoolHandle::Yatp { .. } => { Some(Arc::new(Semaphore::new(cfg.end_point_max_concurrency))) } _ => None, }; + let memory_quota = Arc::new(MemoryQuota::new(cfg.end_point_memory_quota.0 as _)); + register_coprocessor_memory_quota_metrics(memory_quota.clone()); Self { read_pool, semaphore, + memory_quota, concurrency_manager, perf_level: cfg.end_point_perf_level, resource_tag_factory, @@ -113,6 +127,10 @@ impl Endpoint { } } + pub fn config_manager(&self) -> Box { + Box::new(CopConfigManager::new(self.memory_quota.clone())) + } + fn check_memory_locks(&self, req_ctx: &ReqContext) -> Result<()> { let start_ts = req_ctx.txn_start_ts; if !req_ctx.context.get_stale_read() { @@ -495,7 +513,7 @@ impl Endpoint { ) -> impl Future>> { let priority = req_ctx.context.get_priority(); let task_id = req_ctx.build_task_id(); - let key_ranges = req_ctx + let key_ranges: Vec<_> = req_ctx .ranges .iter() .map(|key_range| (key_range.get_start().to_vec(), key_range.get_end().to_vec())) @@ -503,6 +521,8 @@ impl Endpoint { let resource_tag = self .resource_tag_factory .new_tag_with_key_ranges(&req_ctx.context, key_ranges); + let mut allocated_bytes = resource_tag.approximate_heap_size(); + let metadata = TaskMetadata::from_ctx(req_ctx.context.get_resource_control_context()); let resource_limiter = self.resource_ctl.as_ref().and_then(|r| { r.get_resource_limiter( @@ -511,23 +531,35 @@ impl Endpoint { .get_resource_control_context() .get_resource_group_name(), req_ctx.context.get_request_source(), + req_ctx + .context + .get_resource_control_context() + .get_override_priority(), ) }); // box the tracker so that moving it is cheap. let tracker = Box::new(Tracker::new(req_ctx, self.slow_log_threshold)); - - let res = self - .read_pool - .spawn_handle( - Self::handle_unary_request_impl(self.semaphore.clone(), tracker, handler_builder) - .in_resource_metering_tag(resource_tag), - priority, - task_id, - metadata, - resource_limiter, - ) - .map_err(|_| Error::MaxPendingTasksExceeded); - async move { res.await? } + allocated_bytes += tracker.approximate_mem_size(); + + let (tx, rx) = oneshot::channel(); + let future = + Self::handle_unary_request_impl(self.semaphore.clone(), tracker, handler_builder) + .in_resource_metering_tag(resource_tag) + .map(|res| { + let _ = tx.send(res); + }); + let res = self.read_pool_spawn_with_memory_quota_check( + allocated_bytes, + future, + priority, + task_id, + metadata, + resource_limiter, + ); + async move { + res?; + rx.map_err(|_| Error::MaxPendingTasksExceeded).await? + } } /// Parses and handles a unary request. 
Returns a future that will never @@ -544,8 +576,9 @@ impl Endpoint { if let Err(busy_err) = self.read_pool.check_busy_threshold(Duration::from_millis( req.get_context().get_busy_threshold_ms() as u64, )) { - let mut resp = coppb::Response::default(); - resp.mut_region_error().set_server_is_busy(busy_err); + let mut pb_error = errorpb::Error::new(); + pb_error.set_server_is_busy(busy_err); + let resp = make_error_response(Error::Region(pb_error)); return Either::Left(async move { resp.into() }); } @@ -756,6 +789,10 @@ impl Endpoint { .get_resource_control_context() .get_resource_group_name(), req_ctx.context.get_request_source(), + req_ctx + .context + .get_resource_control_context() + .get_override_priority(), ) }); let key_ranges = req_ctx @@ -766,24 +803,29 @@ impl Endpoint { let resource_tag = self .resource_tag_factory .new_tag_with_key_ranges(&req_ctx.context, key_ranges); + let mut allocated_bytes = resource_tag.approximate_heap_size(); + let task_id = req_ctx.build_task_id(); let tracker = Box::new(Tracker::new(req_ctx, self.slow_log_threshold)); + allocated_bytes += tracker.approximate_mem_size(); + + let future = + Self::handle_stream_request_impl(self.semaphore.clone(), tracker, handler_builder) + .in_resource_metering_tag(resource_tag) + .then(futures::future::ok::<_, mpsc::SendError>) + .forward(tx) + .unwrap_or_else(|e| { + warn!("coprocessor stream send error"; "error" => %e); + }); - self.read_pool - .spawn( - Self::handle_stream_request_impl(self.semaphore.clone(), tracker, handler_builder) - .in_resource_metering_tag(resource_tag) - .then(futures::future::ok::<_, mpsc::SendError>) - .forward(tx) - .unwrap_or_else(|e| { - warn!("coprocessor stream send error"; "error" => %e); - }), - priority, - task_id, - metadata, - resource_limiter, - ) - .map_err(|_| Error::MaxPendingTasksExceeded)?; + self.read_pool_spawn_with_memory_quota_check( + allocated_bytes, + future, + priority, + task_id, + metadata, + resource_limiter, + )?; Ok(rx) } @@ -808,79 +850,97 @@ impl Endpoint { .or_else(|e| futures::future::ok(make_error_response(e))) // Stream .map(|item: std::result::Result<_, ()>| item.unwrap()) } + + fn read_pool_spawn_with_memory_quota_check( + &self, + mut allocated_bytes: usize, + future: F, + priority: CommandPri, + task_id: u64, + metadata: TaskMetadata<'_>, + resource_limiter: Option>, + ) -> Result<()> + where + F: Future + Send + 'static, + { + allocated_bytes += mem::size_of_val(&future); + let mut owned_quota = OwnedAllocated::new(self.memory_quota.clone()); + owned_quota.alloc(allocated_bytes)?; + let fut = future.map(move |_| { + // Release quota after handle completed. + drop(owned_quota); + }); + self.read_pool + .spawn(fut, priority, task_id, metadata, resource_limiter) + .map_err(|_| Error::MaxPendingTasksExceeded) + } +} + +macro_rules! 
make_error_response_common { + ($resp:expr, $tag:expr, $e:expr) => {{ + match $e { + Error::Region(e) => { + $tag = storage::get_tag_from_header(&e); + $resp.set_region_error(e); + } + Error::Locked(info) => { + $tag = "meet_lock"; + $resp.set_locked(info); + } + Error::DeadlineExceeded => { + $tag = "deadline_exceeded"; + let mut err = errorpb::Error::default(); + set_deadline_exceeded_busy_error(&mut err); + err.set_message($e.to_string()); + $resp.set_region_error(err); + } + Error::MaxPendingTasksExceeded => { + $tag = "max_pending_tasks_exceeded"; + let mut server_is_busy_err = errorpb::ServerIsBusy::default(); + server_is_busy_err.set_reason($e.to_string()); + let mut errorpb = errorpb::Error::default(); + errorpb.set_message($e.to_string()); + errorpb.set_server_is_busy(server_is_busy_err); + $resp.set_region_error(errorpb); + } + Error::MemoryQuotaExceeded => { + $tag = "memory_quota_exceeded"; + let mut server_is_busy_err = errorpb::ServerIsBusy::default(); + server_is_busy_err.set_reason($e.to_string()); + let mut errorpb = errorpb::Error::default(); + errorpb.set_message($e.to_string()); + errorpb.set_server_is_busy(server_is_busy_err); + $resp.set_region_error(errorpb); + } + Error::Other(_) => { + $tag = "other"; + warn!("unexpected other error encountered processing coprocessor task"; + "error" => ?&$e, + ); + $resp.set_other_error($e.to_string()); + } + }; + COPR_REQ_ERROR.with_label_values(&[$tag]).inc(); + }}; } fn make_error_batch_response(batch_resp: &mut coppb::StoreBatchTaskResponse, e: Error) { - warn!( + debug!( "batch cop task error-response"; "err" => %e ); let tag; - match e { - Error::Region(e) => { - tag = storage::get_tag_from_header(&e); - batch_resp.set_region_error(e); - } - Error::Locked(info) => { - tag = "meet_lock"; - batch_resp.set_locked(info); - } - Error::DeadlineExceeded => { - tag = "deadline_exceeded"; - batch_resp.set_other_error(e.to_string()); - } - Error::MaxPendingTasksExceeded => { - tag = "max_pending_tasks_exceeded"; - let mut server_is_busy_err = errorpb::ServerIsBusy::default(); - server_is_busy_err.set_reason(e.to_string()); - let mut errorpb = errorpb::Error::default(); - errorpb.set_message(e.to_string()); - errorpb.set_server_is_busy(server_is_busy_err); - batch_resp.set_region_error(errorpb); - } - Error::Other(_) => { - tag = "other"; - batch_resp.set_other_error(e.to_string()); - } - }; - COPR_REQ_ERROR.with_label_values(&[tag]).inc(); + make_error_response_common!(batch_resp, tag, e); } fn make_error_response(e: Error) -> coppb::Response { - warn!( + debug!( "error-response"; "err" => %e ); - let mut resp = coppb::Response::default(); let tag; - match e { - Error::Region(e) => { - tag = storage::get_tag_from_header(&e); - resp.set_region_error(e); - } - Error::Locked(info) => { - tag = "meet_lock"; - resp.set_locked(info); - } - Error::DeadlineExceeded => { - tag = "deadline_exceeded"; - resp.set_other_error(e.to_string()); - } - Error::MaxPendingTasksExceeded => { - tag = "max_pending_tasks_exceeded"; - let mut server_is_busy_err = errorpb::ServerIsBusy::default(); - server_is_busy_err.set_reason(e.to_string()); - let mut errorpb = errorpb::Error::default(); - errorpb.set_message(e.to_string()); - errorpb.set_server_is_busy(server_is_busy_err); - resp.set_region_error(errorpb); - } - Error::Other(_) => { - tag = "other"; - resp.set_other_error(e.to_string()); - } - }; - COPR_REQ_ERROR.with_label_values(&[tag]).inc(); + let mut resp = coppb::Response::default(); + make_error_response_common!(resp, tag, e); resp } @@ -1945,7 
+2005,11 @@ mod tests { let resp = block_on(copr.handle_unary_request(config, handler_builder)).unwrap(); assert_eq!(resp.get_data().len(), 0); - assert!(!resp.get_other_error().is_empty()); + let region_err = resp.get_region_error(); + assert_eq!( + region_err.get_server_is_busy().reason, + "deadline is exceeded".to_string() + ); } { @@ -1962,7 +2026,11 @@ mod tests { let resp = block_on(copr.handle_unary_request(config, handler_builder)).unwrap(); assert_eq!(resp.get_data().len(), 0); - assert!(!resp.get_other_error().is_empty()); + let region_err = resp.get_region_error(); + assert_eq!( + region_err.get_server_is_busy().reason, + "deadline is exceeded".to_string() + ); } } @@ -2014,4 +2082,74 @@ mod tests { let resp = block_on(copr.parse_and_handle_unary_request(req, None)); assert_eq!(resp.get_locked().get_key(), b"key"); } + + #[test] + fn test_make_error_response() { + let resp = make_error_response(Error::DeadlineExceeded); + let region_err = resp.get_region_error(); + assert_eq!( + region_err.get_server_is_busy().reason, + "deadline is exceeded".to_string() + ); + assert_eq!( + region_err.get_message(), + "Coprocessor task terminated due to exceeding the deadline" + ); + } + + #[test] + fn test_memory_quota() { + let engine = TestEngineBuilder::new().build().unwrap(); + let read_pool = ReadPool::from(build_read_pool_for_test( + &CoprReadPoolConfig::default_for_test(), + engine, + )); + let cm = ConcurrencyManager::new(1.into()); + let copr = Endpoint::::new( + &Config::default(), + read_pool.handle(), + cm, + ResourceTagFactory::new_for_test(), + Arc::new(QuotaLimiter::default()), + None, + ); + + // By default, coprocessor does not return memory quota exceeded error. + { + let handler_builder = Box::new(|_, _: &_| { + Ok(UnaryFixture::new(Ok(coppb::Response::default())).into_boxed()) + }); + + let mut config = ReqContext::default_for_test(); + config.deadline = Deadline::from_now(Duration::from_millis(500)); + + let resp = block_on(copr.handle_unary_request(config, handler_builder)).unwrap(); + assert!(!resp.has_region_error(), "{:?}", resp); + } + + // Trigger memory quota exceeded error. 
+ copr.memory_quota.set_capacity(1); + { + let handler_builder = Box::new(|_, _: &_| { + Ok(UnaryFixture::new(Ok(coppb::Response::default())).into_boxed()) + }); + + let mut config = ReqContext::default_for_test(); + config.deadline = Deadline::from_now(Duration::from_millis(500)); + + let res = block_on(copr.handle_unary_request(config, handler_builder)); + assert!(res.is_err(), "{:?}", res); + let resp = make_error_response(res.unwrap_err()); + assert_eq!(resp.get_data().len(), 0); + let region_err = resp.get_region_error(); + assert!( + region_err + .get_server_is_busy() + .reason + .contains("exceeding memory quota"), + "{:?}", + region_err.get_server_is_busy().reason + ); + } + } } diff --git a/src/coprocessor/error.rs b/src/coprocessor/error.rs index 25751553c28..5c3ce554cc2 100644 --- a/src/coprocessor/error.rs +++ b/src/coprocessor/error.rs @@ -2,6 +2,7 @@ use error_code::{self, ErrorCode, ErrorCodeExt}; use thiserror::Error; +use tikv_util::memory::MemoryQuotaExceeded; use crate::{ storage, @@ -26,6 +27,9 @@ pub enum Error { #[error("Coprocessor task canceled due to exceeding max pending tasks")] MaxPendingTasksExceeded, + #[error("Coprocessor task canceled due to exceeding memory quota")] + MemoryQuotaExceeded, + #[error("{0}")] Other(String), } @@ -117,6 +121,12 @@ impl From for Error { } } +impl From for Error { + fn from(_: MemoryQuotaExceeded) -> Self { + Error::MemoryQuotaExceeded + } +} + pub type Result = std::result::Result; impl ErrorCodeExt for Error { @@ -126,6 +136,7 @@ impl ErrorCodeExt for Error { Error::Locked(_) => error_code::coprocessor::LOCKED, Error::DeadlineExceeded => error_code::coprocessor::DEADLINE_EXCEEDED, Error::MaxPendingTasksExceeded => error_code::coprocessor::MAX_PENDING_TASKS_EXCEEDED, + Error::MemoryQuotaExceeded => error_code::coprocessor::MEMORY_QUOTA_EXCEEDED, Error::Other(_) => error_code::UNKNOWN, } } diff --git a/src/coprocessor/metrics.rs b/src/coprocessor/metrics.rs index 64905b3dfba..d41f3666f2f 100644 --- a/src/coprocessor/metrics.rs +++ b/src/coprocessor/metrics.rs @@ -8,6 +8,7 @@ use pd_client::BucketMeta; use prometheus::*; use prometheus_static_metric::*; use raftstore::store::{util::build_key_range, ReadStats}; +use tikv_util::memory::MemoryQuota; use crate::{ server::metrics::{GcKeysCF, GcKeysDetail}, @@ -208,6 +209,12 @@ impl CopLocalMetrics { pub fn local_read_stats(&self) -> &ReadStats { &self.local_read_stats } + + #[cfg(test)] + pub fn clear(&mut self) { + self.local_read_stats.region_infos.clear(); + self.local_read_stats.region_buckets.clear(); + } } thread_local! 
{ @@ -318,3 +325,37 @@ pub fn tls_collect_query( .add_query_num(region_id, peer, key_range, QueryKind::Coprocessor); }); } + +pub fn register_coprocessor_memory_quota_metrics(source: Arc) { + struct MemoryQuotaCollector { + gauges: IntGaugeVec, + source: Arc, + } + impl prometheus::core::Collector for MemoryQuotaCollector { + fn desc(&self) -> Vec<&prometheus::core::Desc> { + self.gauges.desc() + } + fn collect(&self) -> Vec { + self.gauges + .with_label_values(&["capacity"]) + .set(self.source.capacity() as _); + self.gauges + .with_label_values(&["in_use"]) + .set(self.source.in_use() as _); + self.gauges.collect() + } + } + let gauges = IntGaugeVec::new( + Opts::new( + "tikv_coprocessor_memory_quota", + "Statistics of in_use and capacity of coprocessor memory quota", + ), + &["type"], + ) + .unwrap(); + if let Err(e) = + prometheus::default_registry().register(Box::new(MemoryQuotaCollector { gauges, source })) + { + warn!("register memory quota metrics failed"; "error" => ?e); + } +} diff --git a/src/coprocessor/mod.rs b/src/coprocessor/mod.rs index 140d3c0476e..28f40d8b9e3 100644 --- a/src/coprocessor/mod.rs +++ b/src/coprocessor/mod.rs @@ -23,6 +23,7 @@ mod cache; mod checksum; +mod config_manager; pub mod dag; mod endpoint; mod error; @@ -43,7 +44,7 @@ use metrics::ReqTag; use rand::prelude::*; use tidb_query_common::execute_stats::ExecSummary; use tikv_alloc::{mem_trace, Id, MemoryTrace, MemoryTraceGuard}; -use tikv_util::{deadline::Deadline, time::Duration}; +use tikv_util::{deadline::Deadline, memory::HeapSize, time::Duration}; use txn_types::TsSet; pub use self::{ @@ -147,6 +148,16 @@ pub struct ReqContext { pub allowed_in_flashback: bool, } +impl HeapSize for ReqContext { + fn approximate_heap_size(&self) -> usize { + self.context.approximate_heap_size() + + self.ranges.approximate_heap_size() + + self.peer.as_ref().map_or(0, |p| p.as_bytes().len()) + + self.lower_bound.approximate_heap_size() + + self.upper_bound.approximate_heap_size() + } +} + impl ReqContext { pub fn new( tag: ReqTag, @@ -159,7 +170,11 @@ impl ReqContext { cache_match_version: Option, perf_level: PerfLevel, ) -> Self { - let deadline = Deadline::from_now(max_handle_duration); + let mut deadline_duration = max_handle_duration; + if context.max_execution_duration_ms > 0 { + deadline_duration = Duration::from_millis(context.max_execution_duration_ms); + } + let deadline = Deadline::from_now(deadline_duration); let bypass_locks = TsSet::from_u64s(context.take_resolved_locks()); let access_locks = TsSet::from_u64s(context.take_committed_locks()); let lower_bound = match ranges.first().as_ref() { @@ -235,6 +250,23 @@ lazy_static! 
{ mod tests { use super::*; + fn default_req_ctx_with_ctx_duration( + context: kvrpcpb::Context, + max_handle_duration: Duration, + ) -> ReqContext { + ReqContext::new( + ReqTag::test, + context, + Vec::new(), + max_handle_duration, + None, + None, + TimeStamp::max(), + None, + PerfLevel::EnableCount, + ) + } + #[test] fn test_build_task_id() { let mut ctx = ReqContext::default_for_test(); @@ -246,4 +278,27 @@ mod tests { ctx.context.set_task_id(0); assert_eq!(ctx.build_task_id(), start_ts); } + + #[test] + fn test_deadline_from_req_ctx() { + let ctx = kvrpcpb::Context::default(); + let max_handle_duration = Duration::from_millis(100); + let req_ctx = default_req_ctx_with_ctx_duration(ctx, max_handle_duration); + // sleep at least 100ms + std::thread::sleep(Duration::from_millis(200)); + req_ctx + .deadline + .check() + .expect_err("deadline should exceed"); + + let mut ctx = kvrpcpb::Context::default(); + ctx.max_execution_duration_ms = 100_000; + let req_ctx = default_req_ctx_with_ctx_duration(ctx, max_handle_duration); + // sleep at least 100ms + std::thread::sleep(Duration::from_millis(200)); + req_ctx + .deadline + .check() + .expect("deadline should not exceed"); + } } diff --git a/src/coprocessor/tracker.rs b/src/coprocessor/tracker.rs index 18eaa0b6e98..94d6b56d2ef 100644 --- a/src/coprocessor/tracker.rs +++ b/src/coprocessor/tracker.rs @@ -7,7 +7,10 @@ use engine_traits::{PerfContext, PerfContextExt, PerfContextKind}; use kvproto::{kvrpcpb, kvrpcpb::ScanDetailV2}; use pd_client::BucketMeta; use tikv_kv::Engine; -use tikv_util::time::{self, Duration, Instant}; +use tikv_util::{ + memory::HeapSize, + time::{self, Duration, Instant}, +}; use txn_types::Key; use super::metrics::*; @@ -264,8 +267,11 @@ impl Tracker { .unwrap_or_default() }); + let source_stmt = self.req_ctx.context.get_source_stmt(); with_tls_tracker(|tracker| { info!(#"slow_log", "slow-query"; + "connection_id" => source_stmt.get_connection_id(), + "session_alias" => source_stmt.get_session_alias(), "region_id" => &self.req_ctx.context.get_region_id(), "remote_host" => &self.req_ctx.peer, "total_lifetime" => ?self.req_lifetime, @@ -350,20 +356,24 @@ impl Tracker { false }; - tls_collect_query( - region_id, - peer, - start_key.as_encoded(), - end_key.as_encoded(), - reverse_scan, - ); - tls_collect_read_flow( - self.req_ctx.context.get_region_id(), - Some(start_key.as_encoded()), - Some(end_key.as_encoded()), - &total_storage_stats, - self.buckets.as_ref(), - ); + // only collect metrics for select and index, exclude transient read flow such + // like analyze and checksum. + if self.req_ctx.tag == ReqTag::select || self.req_ctx.tag == ReqTag::index { + tls_collect_query( + region_id, + peer, + start_key.as_encoded(), + end_key.as_encoded(), + reverse_scan, + ); + tls_collect_read_flow( + self.req_ctx.context.get_region_id(), + Some(start_key.as_encoded()), + Some(end_key.as_encoded()), + &total_storage_stats, + self.buckets.as_ref(), + ); + } self.current_stage = TrackerState::Tracked; } @@ -427,6 +437,46 @@ impl Drop for Tracker { if let TrackerState::ItemFinished(_) = self.current_stage { self.on_finish_all_items(); } + + if self.current_stage != TrackerState::AllItemFinished + && self.req_ctx.deadline.check().is_err() + { + // record deadline exceeded error log. 
+ let total_lifetime = self.request_begin_at.saturating_elapsed(); + let source_stmt = self.req_ctx.context.get_source_stmt(); + let first_range = self.req_ctx.ranges.first(); + let some_table_id = first_range.as_ref().map(|range| { + tidb_query_datatype::codec::table::decode_table_id(range.get_start()) + .unwrap_or_default() + }); + warn!("query deadline exceeded"; + "current_stage" => ?self.current_stage, + "connection_id" => source_stmt.get_connection_id(), + "session_alias" => source_stmt.get_session_alias(), + "region_id" => &self.req_ctx.context.get_region_id(), + "remote_host" => &self.req_ctx.peer, + "total_lifetime" => ?total_lifetime, + "wait_time" => ?self.wait_time, + "wait_time.schedule" => ?self.schedule_wait_time, + "wait_time.snapshot" => ?self.snapshot_wait_time, + "handler_build_time" => ?self.handler_build_time, + "total_process_time" => ?self.total_process_time, + "total_suspend_time" => ?self.total_suspend_time, + "txn_start_ts" => self.req_ctx.txn_start_ts, + "table_id" => some_table_id, + "tag" => self.req_ctx.tag.get_str(), + ); + } + } +} + +impl HeapSize for Tracker { + fn approximate_heap_size(&self) -> usize { + self.req_ctx.approximate_heap_size() + + self + .buckets + .as_ref() + .map_or(0, |b| b.approximate_heap_size()) } } @@ -443,69 +493,86 @@ mod tests { #[test] fn test_track() { - let mut context = kvrpcpb::Context::default(); - context.set_region_id(1); - - let mut req_ctx = ReqContext::new( - ReqTag::test, - context, - vec![], - Duration::from_secs(0), - None, - None, - TimeStamp::max(), - None, - PerfLevel::EnableCount, - ); - req_ctx.lower_bound = vec![ - 116, 128, 0, 0, 0, 0, 0, 0, 184, 95, 114, 128, 0, 0, 0, 0, 0, 70, 67, - ]; - req_ctx.upper_bound = vec![ - 116, 128, 0, 0, 0, 0, 0, 0, 184, 95, 114, 128, 0, 0, 0, 0, 0, 70, 167, - ]; - let mut track: Tracker = Tracker::new(req_ctx, Duration::default()); - let mut bucket = BucketMeta::default(); - bucket.region_id = 1; - bucket.version = 1; - bucket.keys = vec![ - vec![ - 116, 128, 0, 0, 0, 0, 0, 0, 255, 179, 95, 114, 128, 0, 0, 0, 0, 255, 0, 175, 155, - 0, 0, 0, 0, 0, 250, - ], - vec![ - 116, 128, 0, 255, 255, 255, 255, 255, 255, 254, 0, 0, 0, 0, 0, 0, 0, 248, - ], - ]; - bucket.sizes = vec![10]; - track.buckets = Some(Arc::new(bucket)); - - let mut stat = Statistics::default(); - stat.write.flow_stats.read_keys = 10; - track.total_storage_stats = stat; - - track.track(); - drop(track); - TLS_COP_METRICS.with(|m| { - assert_eq!( - 10, - m.borrow() - .local_read_stats() - .region_infos - .get(&1) - .unwrap() - .flow - .read_keys - ); - assert_eq!( - vec![10], - m.borrow() - .local_read_stats() - .region_buckets - .get(&1) - .unwrap() - .stats - .read_keys + let check = move |tag: ReqTag, flow: u64| { + let mut context = kvrpcpb::Context::default(); + context.set_region_id(1); + let mut req_ctx = ReqContext::new( + tag, + context, + vec![], + Duration::from_secs(0), + None, + None, + TimeStamp::max(), + None, + PerfLevel::EnableCount, ); - }); + + req_ctx.lower_bound = vec![ + 116, 128, 0, 0, 0, 0, 0, 0, 184, 95, 114, 128, 0, 0, 0, 0, 0, 70, 67, + ]; + req_ctx.upper_bound = vec![ + 116, 128, 0, 0, 0, 0, 0, 0, 184, 95, 114, 128, 0, 0, 0, 0, 0, 70, 167, + ]; + let mut track: Tracker = Tracker::new(req_ctx, Duration::default()); + let mut bucket = BucketMeta::default(); + bucket.region_id = 1; + bucket.version = 1; + bucket.keys = vec![ + vec![ + 116, 128, 0, 0, 0, 0, 0, 0, 255, 179, 95, 114, 128, 0, 0, 0, 0, 255, 0, 175, + 155, 0, 0, 0, 0, 0, 250, + ], + vec![ + 116, 128, 0, 255, 255, 255, 255, 255, 255, 
254, 0, 0, 0, 0, 0, 0, 0, 248, + ], + ]; + bucket.sizes = vec![10]; + track.buckets = Some(Arc::new(bucket)); + + let mut stat = Statistics::default(); + stat.write.flow_stats.read_keys = 10; + track.total_storage_stats = stat; + + track.track(); + drop(track); + TLS_COP_METRICS.with(|m| { + if flow > 0 { + assert_eq!( + flow as usize, + m.borrow() + .local_read_stats() + .region_infos + .get(&1) + .unwrap() + .flow + .read_keys + ); + assert_eq!( + flow, + m.borrow() + .local_read_stats() + .region_buckets + .get(&1) + .unwrap() + .stats + .read_keys[0] + ); + } else { + assert!(m.borrow().local_read_stats().region_infos.get(&1).is_none()); + assert!( + m.borrow() + .local_read_stats() + .region_buckets + .get(&1) + .is_none() + ); + } + + m.borrow_mut().clear(); + }); + }; + check(ReqTag::select, 10); + check(ReqTag::analyze_full_sampling, 0); } } diff --git a/src/coprocessor_v2/OWNERS b/src/coprocessor_v2/OWNERS new file mode 100644 index 00000000000..0ce3dbff3a6 --- /dev/null +++ b/src/coprocessor_v2/OWNERS @@ -0,0 +1,7 @@ +# See the OWNERS docs at https://go.k8s.io/owners +options: + no_parent_owners: true +filters: + "(OWNERS|config\\.rs)$": + approvers: + - sig-critical-approvers-config-src diff --git a/src/import/sst_service.rs b/src/import/sst_service.rs index 0c81873c130..81c947847d4 100644 --- a/src/import/sst_service.rs +++ b/src/import/sst_service.rs @@ -5,7 +5,10 @@ use std::{ convert::identity, future::Future, path::PathBuf, - sync::{Arc, Mutex}, + sync::{ + atomic::{AtomicU64, Ordering}, + Arc, Mutex, + }, time::Duration, }; @@ -20,9 +23,16 @@ use kvproto::{ errorpb, import_sstpb::{ Error as ImportPbError, ImportSst, Range, RawWriteRequest_oneof_chunk as RawChunk, SstMeta, - SwitchMode, WriteRequest_oneof_chunk as Chunk, *, + SuspendImportRpcRequest, SuspendImportRpcResponse, SwitchMode, + WriteRequest_oneof_chunk as Chunk, *, }, kvrpcpb::Context, + metapb::RegionEpoch, +}; +use raftstore::{ + coprocessor::{RegionInfo, RegionInfoProvider}, + store::util::is_epoch_stale, + RegionInfoAccessor, }; use raftstore_v2::StoreMeta; use resource_control::{with_resource_limiter, ResourceGroupManager}; @@ -35,13 +45,16 @@ use tikv_kv::{ }; use tikv_util::{ config::ReadableSize, - future::create_stream_with_buffer, - sys::thread::ThreadBuildWrapper, + future::{create_stream_with_buffer, paired_future_callback}, + sys::{ + disk::{get_disk_status, DiskUsage}, + thread::ThreadBuildWrapper, + }, time::{Instant, Limiter}, HandyRwLock, }; use tokio::{runtime::Runtime, time::sleep}; -use txn_types::{Key, WriteRef, WriteType}; +use txn_types::{Key, TimeStamp, WriteRef, WriteType}; use super::{ make_rpc_error, @@ -49,6 +62,7 @@ use super::{ }; use crate::{ import::duplicate_detect::DuplicateDetector, + send_rpc_response, server::CONFIG_ROCKSDB_GAUGE, storage::{self, errors::extract_region_error_from_error}, }; @@ -80,6 +94,10 @@ const WIRE_EXTRA_BYTES: usize = 12; /// [`raft_writer::ThrottledTlsEngineWriter`]. There aren't too many items held /// in the writer. So we can run the GC less frequently. const WRITER_GC_INTERVAL: Duration = Duration::from_secs(300); +/// The max time of suspending requests. +/// This may save us from some client sending insane value to the server. 
+const SUSPEND_REQUEST_MAX_SECS: u64 = // 6h + 6 * 60 * 60; fn transfer_error(err: storage::Error) -> ImportPbError { let mut e = ImportPbError::default(); @@ -115,12 +133,16 @@ pub struct ImportSstService<E: Engine> { limiter: Limiter, task_slots: Arc<Mutex<HashSet<PathBuf>>>, raft_entry_max_size: ReadableSize, + region_info_accessor: Arc<RegionInfoAccessor>, writer: raft_writer::ThrottledTlsEngineWriter, // it's some iff multi-rocksdb is enabled store_meta: Option<Arc<Mutex<StoreMeta<E::Local>>>>, resource_manager: Option<Arc<ResourceGroupManager>>, + + // While `now` is less than this timestamp (in ms), don't accept any + // requests. + suspend_req_until: Arc<AtomicU64>, } struct RequestCollector { @@ -165,6 +187,9 @@ impl RequestCollector { } fn accept_kv(&mut self, cf: &str, is_delete: bool, k: Vec<u8>, v: Vec<u8>) { + debug!("Accepting KV."; "cf" => %cf, + "key" => %log_wrappers::Value::key(&k), + "value" => %log_wrappers::Value::key(&v)); // Need to skip the empty key/value that could break the transaction or cause // data corruption. see details at https://github.com/pingcap/tiflow/issues/5468. if k.is_empty() || (!is_delete && v.is_empty()) { @@ -303,6 +328,7 @@ impl<E: Engine> ImportSstService<E> { importer: Arc<SstImporter>, store_meta: Option<Arc<Mutex<StoreMeta<E::Local>>>>, resource_manager: Option<Arc<ResourceGroupManager>>, + region_info_accessor: Arc<RegionInfoAccessor>, ) -> Self { let props = tikv_util::thread_group::current_properties(); let eng = Mutex::new(engine.clone()); @@ -350,9 +376,11 @@ limiter: Limiter::new(f64::INFINITY), task_slots: Arc::new(Mutex::new(HashSet::default())), raft_entry_max_size, + region_info_accessor, writer, store_meta, resource_manager, + suspend_req_until: Arc::new(AtomicU64::new(0)), } } @@ -567,7 +595,6 @@ let buff = importer .read_from_kv_file( meta, - rule, ext_storage.clone(), req.get_storage_backend(), &limiter, @@ -579,6 +606,7 @@ meta.get_start_ts(), meta.get_restore_ts(), buff, + rule, |k, v| collector.accept_kv(meta.get_cf(), meta.get_is_delete(), k, v), )? { if let Some(range) = range.as_mut() { @@ -616,6 +644,86 @@ Ok(range) } + + /// Check whether we should suspend the current request. + fn check_suspend(&self) -> Result<()> { + let now = TimeStamp::physical_now(); + let suspend_until = self.suspend_req_until.load(Ordering::SeqCst); + if now < suspend_until { + Err(Error::Suspended { + time_to_lease_expire: Duration::from_millis(suspend_until - now), + }) + } else { + Ok(()) + } + } + + /// Suspend requests for a period. + /// + /// # Returns + /// + /// whether requests were already suspended before this call. + pub fn suspend_requests(&self, for_time: Duration) -> bool { + let now = TimeStamp::physical_now(); + let last_suspend_until = self.suspend_req_until.load(Ordering::SeqCst); + let suspended = now < last_suspend_until; + let suspend_until = TimeStamp::physical_now() + for_time.as_millis() as u64; + self.suspend_req_until + .store(suspend_until, Ordering::SeqCst); + suspended + } + + /// Allow all requests to enter. + /// + /// # Returns + /// + /// whether requests had previously been suspended.
+ pub fn allow_requests(&self) -> bool { + let now = TimeStamp::physical_now(); + let last_suspend_until = self.suspend_req_until.load(Ordering::SeqCst); + let suspended = now < last_suspend_until; + self.suspend_req_until.store(0, Ordering::SeqCst); + suspended + } +} + +fn check_local_region_stale( + region_id: u64, + epoch: &RegionEpoch, + local_region_info: Option<RegionInfo>, +) -> Result<()> { + match local_region_info { + Some(local_region_info) => { + let local_region_epoch = local_region_info.region.region_epoch.unwrap(); + + // when the local region epoch is stale, the client can retry the write later + if is_epoch_stale(&local_region_epoch, epoch) { + return Err(Error::RequestTooNew(format!( + "request region {} is ahead of local region, local epoch {:?}, request epoch {:?}, please retry write later", + region_id, local_region_epoch, epoch + ))); + } + // when the local region epoch is ahead, the client needs to rescan the + // region from PD to get the latest region info later + if is_epoch_stale(epoch, &local_region_epoch) { + return Err(Error::RequestTooOld(format!( + "request region {} is staler than local region, local epoch {:?}, request epoch {:?}", + region_id, local_region_epoch, epoch + ))); + } + + // epochs match, the region info is up to date and the write can proceed + Ok(()) + } + None => { + // when the region is not found, we can't tell whether it's stale or ahead, + // so we just return the safest case + Err(Error::RequestTooOld(format!( + "region {} is not found", + region_id + ))) + } + } } #[macro_export] @@ -629,6 +737,7 @@ macro_rules! impl_write { ) { let import = self.importer.clone(); let tablets = self.tablets.clone(); + let region_info_accessor = self.region_info_accessor.clone(); let (rx, buf_driver) = create_stream_with_buffer(stream, self.cfg.rl().stream_channel_window); let mut rx = rx.map_err(Error::from); @@ -637,12 +746,15 @@ macro_rules! impl_write { let label = stringify!($fn); let resource_manager = self.resource_manager.clone(); let handle_task = async move { - let res = async move { - let first_req = rx.try_next().await?; + let (res, rx) = async move { + let first_req = match rx.try_next().await { + Ok(r) => r, + Err(e) => return (Err(e), Some(rx)), + }; let (meta, resource_limiter) = match first_req { Some(r) => { let limiter = resource_manager.as_ref().and_then(|m| { - m.get_resource_limiter( + m.get_background_resource_limiter( r.get_context() .get_resource_control_context() .get_resource_group_name(), @@ -651,18 +763,49 @@ macro_rules! impl_write { }); match r.chunk { Some($chunk_ty::Meta(m)) => (m, limiter), - _ => return Err(Error::InvalidChunk), + _ => return (Err(Error::InvalidChunk), Some(rx)), } } - _ => return Err(Error::InvalidChunk), + _ => return (Err(Error::InvalidChunk), Some(rx)), }; + // wait for the region epoch on this TiKV to catch up with the epoch + // in the request, which comes from PD and represents the majority + // peers' status.
let region_id = meta.get_region_id(); + let (cb, f) = paired_future_callback(); + if let Err(e) = region_info_accessor + .find_region_by_id(region_id, cb) + .map_err(|e| { + // when region not found, we can't tell whether it's stale or ahead, so + // we just return the safest case + Error::RequestTooOld(format!( + "failed to find region {} err {:?}", + region_id, e + )) + }) + { + return (Err(e), Some(rx)); + }; + let res = match f.await { + Ok(r) => r, + Err(e) => return (Err(From::from(e)), Some(rx)), + }; + if let Err(e) = + check_local_region_stale(region_id, meta.get_region_epoch(), res) + { + return (Err(e), Some(rx)); + }; + let tablet = match tablets.get(region_id) { Some(t) => t, None => { - return Err(Error::Engine( - format!("region {} not found", region_id).into(), - )); + return ( + Err(Error::RequestTooOld(format!( + "region {} not found", + region_id + ))), + Some(rx), + ); } }; @@ -670,10 +813,10 @@ macro_rules! impl_write { Ok(w) => w, Err(e) => { error!("build writer failed {:?}", e); - return Err(Error::InvalidChunk); + return (Err(Error::InvalidChunk), Some(rx)); } }; - let (writer, resource_limiter) = rx + let result = rx .try_fold( (writer, resource_limiter), |(mut writer, limiter), req| async move { @@ -690,7 +833,11 @@ macro_rules! impl_write { .map(|w| (w, limiter)) }, ) - .await?; + .await; + let (writer, resource_limiter) = match result { + Ok(r) => r, + Err(e) => return (Err(e), None), + }; let finish_fn = async { let metas = writer.finish()?; @@ -699,13 +846,18 @@ macro_rules! impl_write { }; let metas: Result<_> = with_resource_limiter(finish_fn, resource_limiter).await; - let metas = metas?; + let metas = match metas { + Ok(r) => r, + Err(e) => return (Err(e), None), + }; let mut resp = $resp_ty::default(); resp.set_metas(metas.into()); - Ok(resp) + (Ok(resp), None) } .await; $crate::send_rpc_response!(res, sink, label, timer); + // don't drop rx before send response + _ = rx; }; self.threads.spawn(buf_driver); @@ -883,6 +1035,10 @@ impl ImportSst for ImportSstService { .observe(start.saturating_elapsed().as_secs_f64()); let mut resp = ApplyResponse::default(); + if get_disk_status(0) != DiskUsage::Normal { + resp.set_error(Error::DiskSpaceNotEnough.into()); + return crate::send_rpc_response!(Ok(resp), sink, label, start); + } match Self::apply_imp(req, importer, applier, limiter, max_raft_size).await { Ok(Some(r)) => resp.set_range(r), @@ -911,7 +1067,7 @@ impl ImportSst for ImportSstService { let tablets = self.tablets.clone(); let start = Instant::now(); let resource_limiter = self.resource_manager.as_ref().and_then(|r| { - r.get_resource_limiter( + r.get_background_resource_limiter( req.get_context() .get_resource_control_context() .get_resource_group_name(), @@ -924,6 +1080,11 @@ impl ImportSst for ImportSstService { sst_importer::metrics::IMPORTER_DOWNLOAD_DURATION .with_label_values(&["queue"]) .observe(start.saturating_elapsed().as_secs_f64()); + if get_disk_status(0) != DiskUsage::Normal { + let mut resp = DownloadResponse::default(); + resp.set_error(Error::DiskSpaceNotEnough.into()); + return crate::send_rpc_response!(Ok(resp), sink, label, timer); + } // FIXME: download() should be an async fn, to allow BR to cancel // a download task. 
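The ingest and multi-ingest handlers in the hunks below bail out early through check_suspend. The whole gate is one shared millisecond timestamp: suspend_requests stores now + duration, check_suspend compares it against now, and allow_requests resets it to zero, so no lock is needed. A self-contained sketch of the same pattern, with SystemTime standing in for TimeStamp::physical_now() and a plain Duration error instead of Error::Suspended (simplifications for illustration only):

```rust
use std::{
    sync::atomic::{AtomicU64, Ordering},
    time::{Duration, SystemTime, UNIX_EPOCH},
};

// Stand-in for TimeStamp::physical_now(): milliseconds since the epoch.
fn physical_now_ms() -> u64 {
    SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .unwrap()
        .as_millis() as u64
}

struct SuspendGate {
    // 0 means "not suspended"; otherwise the deadline in ms.
    suspend_until: AtomicU64,
}

impl SuspendGate {
    /// Mirrors check_suspend: reject while the deadline is in the future.
    fn check(&self) -> Result<(), Duration> {
        let now = physical_now_ms();
        let until = self.suspend_until.load(Ordering::SeqCst);
        if now < until {
            // The caller would turn this into Error::Suspended
            // { time_to_lease_expire }.
            Err(Duration::from_millis(until - now))
        } else {
            Ok(())
        }
    }

    /// Mirrors suspend_requests: returns whether requests were
    /// already suspended before this call.
    fn suspend(&self, for_time: Duration) -> bool {
        let now = physical_now_ms();
        let was_suspended = now < self.suspend_until.load(Ordering::SeqCst);
        self.suspend_until
            .store(now + for_time.as_millis() as u64, Ordering::SeqCst);
        was_suspended
    }

    /// Mirrors allow_requests: lift the suspension immediately.
    fn allow(&self) -> bool {
        let was_suspended = physical_now_ms() < self.suspend_until.load(Ordering::SeqCst);
        self.suspend_until.store(0, Ordering::SeqCst);
        was_suspended
    }
}

fn main() {
    let gate = SuspendGate { suspend_until: AtomicU64::new(0) };
    assert!(gate.check().is_ok());
    assert!(!gate.suspend(Duration::from_secs(1))); // not previously suspended
    assert!(gate.check().is_err()); // rejected while suspended
    assert!(gate.allow()); // was suspended, now lifted
    assert!(gate.check().is_ok());
}
```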
@@ -990,8 +1151,14 @@ impl ImportSst for ImportSstService { ) { let label = "ingest"; let timer = Instant::now_coarse(); - let mut resp = IngestResponse::default(); + + if let Err(err) = self.check_suspend() { + resp.set_error(ImportPbError::from(err).take_store_error()); + ctx.spawn(async move { crate::send_rpc_response!(Ok(resp), sink, label, timer) }); + return; + } + let region_id = req.get_context().get_region_id(); if let Some(errorpb) = self.check_write_stall(region_id) { resp.set_error(errorpb); @@ -1033,8 +1200,13 @@ impl ImportSst for ImportSstService { ) { let label = "multi-ingest"; let timer = Instant::now_coarse(); - let mut resp = IngestResponse::default(); + if let Err(err) = self.check_suspend() { + resp.set_error(ImportPbError::from(err).take_store_error()); + ctx.spawn(async move { crate::send_rpc_response!(Ok(resp), sink, label, timer) }); + return; + } + if let Some(errorpb) = self.check_write_stall(req.get_context().get_region_id()) { resp.set_error(errorpb); ctx.spawn( @@ -1237,6 +1409,37 @@ impl ImportSst for ImportSstService { RawChunk, new_raw_writer ); + + fn suspend_import_rpc( + &mut self, + ctx: RpcContext<'_>, + req: SuspendImportRpcRequest, + sink: UnarySink, + ) { + let label = "suspend_import_rpc"; + let timer = Instant::now_coarse(); + + if req.should_suspend_imports && req.get_duration_in_secs() > SUSPEND_REQUEST_MAX_SECS { + ctx.spawn(async move { + send_rpc_response!(Err(Error::Io( + std::io::Error::new(std::io::ErrorKind::InvalidInput, + format!("you are going to suspend the import RPCs too long. (for {} seconds, max acceptable duration is {} seconds)", + req.get_duration_in_secs(), SUSPEND_REQUEST_MAX_SECS)))), sink, label, timer); + }); + return; + } + + let suspended = if req.should_suspend_imports { + info!("suspend incoming import RPCs."; "for_second" => req.get_duration_in_secs(), "caller" => req.get_caller()); + self.suspend_requests(Duration::from_secs(req.get_duration_in_secs())) + } else { + info!("allow incoming import RPCs."; "caller" => req.get_caller()); + self.allow_requests() + }; + let mut resp = SuspendImportRpcResponse::default(); + resp.set_already_suspended(suspended); + ctx.spawn(async move { send_rpc_response!(Ok(resp), sink, label, timer) }); + } } // add error statistics from pb error response @@ -1296,14 +1499,19 @@ mod test { use engine_traits::{CF_DEFAULT, CF_WRITE}; use kvproto::{ kvrpcpb::Context, - metapb::RegionEpoch, + metapb::{Region, RegionEpoch}, raft_cmdpb::{RaftCmdRequest, Request}, }; - use protobuf::Message; + use protobuf::{Message, SingularPtrField}; + use raft::StateRole::Follower; + use raftstore::RegionInfo; use tikv_kv::{Modify, WriteData}; use txn_types::{Key, TimeStamp, Write, WriteBatchFlags, WriteType}; - use crate::{import::sst_service::RequestCollector, server::raftkv}; + use crate::{ + import::sst_service::{check_local_region_stale, RequestCollector}, + server::raftkv, + }; fn write(key: &[u8], ty: WriteType, commit_ts: u64, start_ts: u64) -> (Vec, Vec) { let k = Key::from_raw(key).append_ts(TimeStamp::new(commit_ts)); @@ -1587,4 +1795,72 @@ mod test { } assert_eq!(total, 100); } + + #[test] + fn test_write_rpc_check_region_epoch() { + let mut req_epoch = RegionEpoch { + conf_ver: 10, + version: 10, + ..Default::default() + }; + // test for region not found + let result = check_local_region_stale(1, &req_epoch, None); + assert!(result.is_err()); + // check error message contains "rescan region later", client will match this + // string pattern + assert!( + result + .unwrap_err() + .to_string() + 
.contains("rescan region later") + ); + + let mut local_region_info = RegionInfo { + region: Region { + id: 1, + region_epoch: SingularPtrField::some(req_epoch.clone()), + ..Default::default() + }, + role: Follower, + buckets: 1, + }; + // test the local region epoch is same as request + let result = check_local_region_stale(1, &req_epoch, Some(local_region_info.clone())); + result.unwrap(); + + // test the local region epoch is ahead of request + local_region_info + .region + .region_epoch + .as_mut() + .unwrap() + .conf_ver = 11; + let result = check_local_region_stale(1, &req_epoch, Some(local_region_info.clone())); + assert!(result.is_err()); + // check error message contains "rescan region later", client will match this + // string pattern + assert!( + result + .unwrap_err() + .to_string() + .contains("rescan region later") + ); + + req_epoch.conf_ver = 11; + let result = check_local_region_stale(1, &req_epoch, Some(local_region_info.clone())); + result.unwrap(); + + // test the local region epoch is staler than request + req_epoch.version = 12; + let result = check_local_region_stale(1, &req_epoch, Some(local_region_info)); + assert!(result.is_err()); + // check error message contains "retry write later", client will match this + // string pattern + assert!( + result + .unwrap_err() + .to_string() + .contains("retry write later") + ); + } } diff --git a/src/read_pool.rs b/src/read_pool.rs index a5898ea4f63..18e9a11cbd5 100644 --- a/src/read_pool.rs +++ b/src/read_pool.rs @@ -12,7 +12,10 @@ use std::{ }; use file_system::{set_io_type, IoType}; -use futures::{channel::oneshot, future::TryFutureExt}; +use futures::{ + channel::oneshot, + future::{FutureExt, TryFutureExt}, +}; use kvproto::{errorpb, kvrpcpb::CommandPri}; use online_config::{ConfigChange, ConfigManager, ConfigValue, Result as CfgResult}; use prometheus::{core::Metric, Histogram, IntCounter, IntGauge}; @@ -171,10 +174,9 @@ impl ReadPoolHandle { TaskCell::new( TrackedFuture::new(with_resource_limiter( ControlledFuture::new( - async move { - f.await; + f.map(move |_| { running_tasks.dec(); - }, + }), resource_ctl.clone(), group_name, ), @@ -184,10 +186,9 @@ impl ReadPoolHandle { ) } else { TaskCell::new( - TrackedFuture::new(async move { - f.await; + TrackedFuture::new(f.map(move |_| { running_tasks.dec(); - }), + })), extras, ) }; @@ -211,10 +212,9 @@ impl ReadPoolHandle { { let (tx, rx) = oneshot::channel::(); let res = self.spawn( - async move { - let res = f.await; + f.map(move |res| { let _ = tx.send(res); - }, + }), priority, task_id, metadata, @@ -312,6 +312,10 @@ impl ReadPoolHandle { let mut busy_err = errorpb::ServerIsBusy::default(); busy_err.set_reason("estimated wait time exceeds threshold".to_owned()); busy_err.estimated_wait_ms = u32::try_from(estimated_wait.as_millis()).unwrap_or(u32::MAX); + warn!("Already many pending tasks in the read queue, task is rejected"; + "busy_threshold" => ?&busy_threshold, + "busy_err" => ?&busy_err, + ); Err(busy_err) } } @@ -429,6 +433,7 @@ pub fn build_yatp_read_pool( engine: E, resource_ctl: Option>, cleanup_method: CleanupMethod, + metric_idx_from_task_meta_fn: Option usize + Send + Sync + 'static>>, ) -> ReadPool { let unified_read_pool_name = get_unified_read_pool_name(); build_yatp_read_pool_with_name( @@ -438,6 +443,7 @@ pub fn build_yatp_read_pool( resource_ctl, cleanup_method, unified_read_pool_name, + metric_idx_from_task_meta_fn, ) } @@ -448,9 +454,10 @@ pub fn build_yatp_read_pool_with_name( resource_ctl: Option>, cleanup_method: CleanupMethod, 
unified_read_pool_name: String, + metric_idx_from_task_meta_fn: Option usize + Send + Sync + 'static>>, ) -> ReadPool { let raftkv = Arc::new(Mutex::new(engine)); - let builder = YatpPoolBuilder::new(ReporterTicker { reporter }) + let mut builder = YatpPoolBuilder::new(ReporterTicker { reporter }) .name_prefix(&unified_read_pool_name) .cleanup_method(cleanup_method) .stack_size(config.stack_size.0 as usize) @@ -473,6 +480,12 @@ pub fn build_yatp_read_pool_with_name( .before_stop(|| unsafe { destroy_tls_engine::(); }); + if let Some(metric_idx_from_task_meta_fn) = metric_idx_from_task_meta_fn { + builder = builder + .enable_task_wait_metrics() + .metric_idx_from_task_meta(metric_idx_from_task_meta_fn); + } + let pool = if let Some(ref r) = resource_ctl { builder.build_priority_pool(r.clone()) } else { @@ -483,8 +496,12 @@ pub fn build_yatp_read_pool_with_name( pool, running_tasks: UNIFIED_READ_POOL_RUNNING_TASKS .with_label_values(&[&unified_read_pool_name]), - running_threads: UNIFIED_READ_POOL_RUNNING_THREADS - .with_label_values(&[&unified_read_pool_name]), + running_threads: { + let running_threads = + UNIFIED_READ_POOL_RUNNING_THREADS.with_label_values(&[&unified_read_pool_name]); + running_threads.set(config.max_thread_count as _); + running_threads + }, max_tasks: config .max_tasks_per_worker .saturating_mul(config.max_thread_count), @@ -796,8 +813,14 @@ mod tests { // max running tasks number should be 2*1 = 2 let engine = TestEngineBuilder::new().build().unwrap(); - let pool = - build_yatp_read_pool(&config, DummyReporter, engine, None, CleanupMethod::InPlace); + let pool = build_yatp_read_pool( + &config, + DummyReporter, + engine, + None, + CleanupMethod::InPlace, + None, + ); let gen_task = || { let (tx, rx) = oneshot::channel::<()>(); @@ -844,8 +867,14 @@ mod tests { // max running tasks number should be 2*1 = 2 let engine = TestEngineBuilder::new().build().unwrap(); - let pool = - build_yatp_read_pool(&config, DummyReporter, engine, None, CleanupMethod::InPlace); + let pool = build_yatp_read_pool( + &config, + DummyReporter, + engine, + None, + CleanupMethod::InPlace, + None, + ); let gen_task = || { let (tx, rx) = oneshot::channel::<()>(); @@ -900,8 +929,14 @@ mod tests { // max running tasks number should be 2*1 = 2 let engine = TestEngineBuilder::new().build().unwrap(); - let pool = - build_yatp_read_pool(&config, DummyReporter, engine, None, CleanupMethod::InPlace); + let pool = build_yatp_read_pool( + &config, + DummyReporter, + engine, + None, + CleanupMethod::InPlace, + None, + ); let gen_task = || { let (tx, rx) = oneshot::channel::<()>(); @@ -1027,6 +1062,7 @@ mod tests { resource_manager, CleanupMethod::InPlace, name.clone(), + None, ); let gen_task = || { diff --git a/src/server/OWNERS b/src/server/OWNERS new file mode 100644 index 00000000000..0ce3dbff3a6 --- /dev/null +++ b/src/server/OWNERS @@ -0,0 +1,7 @@ +# See the OWNERS docs at https://go.k8s.io/owners +options: + no_parent_owners: true +filters: + "(OWNERS|config\\.rs)$": + approvers: + - sig-critical-approvers-config-src diff --git a/src/server/config.rs b/src/server/config.rs index 013d1a66238..4b3d47f7338 100644 --- a/src/server/config.rs +++ b/src/server/config.rs @@ -40,6 +40,30 @@ const DEFAULT_ENDPOINT_REQUEST_MAX_HANDLE_SECS: u64 = 60; // Number of rows in each chunk for streaming coprocessor. const DEFAULT_ENDPOINT_STREAM_BATCH_ROW_LIMIT: usize = 128; +// By default, endpoint memory quota will be set to 12.5% of system memory. 
+// +// TPCC check test shows that: +// * The actual endpoint memory usage is about 3 times to memory quota. +// * Setting memory quota too low can lead to ServerIsBusy errors, which slow +// down performance. +// * With 1000 warehouses and 1000 threads, the peak memory usage of the TPCC +// check is 11.5 GiB, which is too large for common scenario 16GiB memory, +// because default block cache takes about 45% memory (7.2GiB). +// +// The 12.5% default quota is a balance between efficient memory usage and +// maintaining performance under load. +const DEFAULT_ENDPOINT_MEMORY_QUOTA_RATIO: f64 = 0.125; + +lazy_static! { + static ref DEFAULT_ENDPOINT_MEMORY_QUOTA: ReadableSize = { + let total_mem = SysQuota::memory_limit_in_bytes(); + let quota = (total_mem as f64) * DEFAULT_ENDPOINT_MEMORY_QUOTA_RATIO; + // In order to ensure that coprocessor can function properly under low + // memory conditions, we use 500MB as the minimum default value. + ReadableSize(cmp::max(ReadableSize::mb(500).0, quota as _)) + }; +} + // At least 4 long coprocessor requests are allowed to run concurrently. const MIN_ENDPOINT_MAX_CONCURRENCY: usize = 4; @@ -145,6 +169,7 @@ pub struct Config { #[serde(with = "perf_level_serde")] #[online_config(skip)] pub end_point_perf_level: PerfLevel, + pub end_point_memory_quota: ReadableSize, #[serde(alias = "snap-max-write-bytes-per-sec")] pub snap_io_max_bytes_per_sec: ReadableSize, pub snap_max_total_size: ReadableSize, @@ -187,26 +212,27 @@ pub struct Config { #[online_config(skip)] pub labels: HashMap, - // deprecated. use readpool.coprocessor.xx_concurrency. #[doc(hidden)] #[serde(skip_serializing)] - #[online_config(skip)] + #[online_config(hidden)] + #[deprecated = "The configuration has been moved to readpool.coprocessor.*_concurrency."] pub end_point_concurrency: Option, - // deprecated. use readpool.coprocessor.stack_size. #[doc(hidden)] #[serde(skip_serializing)] - #[online_config(skip)] + #[online_config(hidden)] + #[deprecated = "The configuration has been moved to readpool.coprocessor.stack_size."] pub end_point_stack_size: Option, - // deprecated. use readpool.coprocessor.max_tasks_per_worker_xx. #[doc(hidden)] #[serde(skip_serializing)] - #[online_config(skip)] + #[online_config(hidden)] + #[deprecated = "The configuration has been moved to readpool.coprocessor.max_tasks_per_worker_*."] pub end_point_max_tasks: Option, } impl Default for Config { + #[allow(deprecated)] fn default() -> Config { let cpu_num = SysQuota::cpu_cores_quota(); let background_thread_count = if cpu_num > 16.0 { 3 } else { 2 }; @@ -249,6 +275,7 @@ impl Default for Config { end_point_request_max_handle_duration: None, end_point_max_concurrency: cmp::max(cpu_num as usize, MIN_ENDPOINT_MAX_CONCURRENCY), end_point_perf_level: PerfLevel::Uninitialized, + end_point_memory_quota: *DEFAULT_ENDPOINT_MEMORY_QUOTA, snap_io_max_bytes_per_sec: ReadableSize(DEFAULT_SNAP_MAX_BYTES_PER_SEC), snap_max_total_size: ReadableSize(0), stats_concurrency: 1, @@ -357,6 +384,11 @@ impl Config { )); } + if self.end_point_memory_quota == *DEFAULT_ENDPOINT_MEMORY_QUOTA { + info!("using default coprocessor quota"; + "quota" => ?*DEFAULT_ENDPOINT_MEMORY_QUOTA); + } + if self.max_grpc_send_msg_len <= 0 { return Err(box_err!( "server.max-grpc-send-msg-len must be bigger than 0." 
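The default quota arithmetic above is easy to sanity-check in isolation. Below is a standalone sketch of the same computation, with SysQuota::memory_limit_in_bytes() replaced by a plain parameter; the 16 GiB and 2 GiB hosts are illustrative figures, not values from the source:

```rust
const DEFAULT_ENDPOINT_MEMORY_QUOTA_RATIO: f64 = 0.125;
const MIN_QUOTA_BYTES: u64 = 500 * 1024 * 1024; // 500 MiB floor

/// 12.5% of total memory, but never below the 500 MiB minimum.
fn default_endpoint_memory_quota(total_mem_bytes: u64) -> u64 {
    let quota = (total_mem_bytes as f64) * DEFAULT_ENDPOINT_MEMORY_QUOTA_RATIO;
    std::cmp::max(MIN_QUOTA_BYTES, quota as u64)
}

fn main() {
    let gib = 1024 * 1024 * 1024u64;
    // 16 GiB host: 12.5% => 2 GiB quota.
    assert_eq!(default_endpoint_memory_quota(16 * gib), 2 * gib);
    // 2 GiB host: 12.5% would be 256 MiB, so the 500 MiB floor applies.
    assert_eq!(default_endpoint_memory_quota(2 * gib), MIN_QUOTA_BYTES);
}
```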
@@ -435,6 +467,7 @@ pub struct ServerConfigManager { tx: Scheduler, config: Arc>, grpc_mem_quota: ResourceQuota, + copr_config_manager: Box, } unsafe impl Send for ServerConfigManager {} @@ -445,32 +478,38 @@ impl ServerConfigManager { tx: Scheduler, config: Arc>, grpc_mem_quota: ResourceQuota, + copr_config_manager: Box, ) -> ServerConfigManager { ServerConfigManager { tx, config, grpc_mem_quota, + copr_config_manager, } } } impl ConfigManager for ServerConfigManager { fn dispatch(&mut self, c: ConfigChange) -> std::result::Result<(), Box> { - { - let change = c.clone(); - self.config.update(move |cfg| cfg.update(change))?; - if let Some(value) = c.get("grpc_memory_pool_quota") { - let mem_quota: ReadableSize = value.clone().into(); - // the resize is done inplace indeed, but grpc-rs's api need self, so we just - // clone it here, but this no extra side effect here. - self.grpc_mem_quota - .clone() - .resize_memory(mem_quota.0 as usize); - } - if let Err(e) = self.tx.schedule(SnapTask::RefreshConfigEvent) { - error!("server configuration manager schedule refresh snapshot work task failed"; "err"=> ?e); - } + let change = c.clone(); + self.config.update(move |cfg| cfg.update(change))?; + if let Some(value) = c.get("grpc_memory_pool_quota") { + let mem_quota: ReadableSize = value.clone().into(); + // the resize is done inplace indeed, but grpc-rs's api need self, so we just + // clone it here, but this no extra side effect here. + self.grpc_mem_quota + .clone() + .resize_memory(mem_quota.0 as usize); } + if let Err(e) = self.tx.schedule(SnapTask::RefreshConfigEvent) { + error!("server configuration manager schedule refresh snapshot work task failed"; "err"=> ?e); + } + + // Dispatch coprocessor config. + if let Err(e) = self.copr_config_manager.dispatch(c.clone()) { + error!("server configuration manager fails to update coprocessor config"; "err"=> ?e); + } + info!("server configuration changed"; "change" => ?c); Ok(()) } diff --git a/src/server/debug.rs b/src/server/debug.rs index 9e01852455c..70e1df855d5 100644 --- a/src/server/debug.rs +++ b/src/server/debug.rs @@ -1111,9 +1111,11 @@ async fn async_key_range_flashback_to_version ?resp.get_error(), "region_err" => ?resp.get_region_error()); - return Err(Error::FlashbackFailed( - "exec prepare flashback failed.".into(), - )); + return Err(Error::FlashbackFailed(format!( + "exec prepare flashback failed: resp err is: {:?}, region err is: {:?}", + resp.get_error(), + resp.get_region_error() + ))); } } else { let mut req = kvrpcpb::FlashbackToVersionRequest::new(); @@ -1127,9 +1129,11 @@ async fn async_key_range_flashback_to_version ?resp.get_error(), "region_err" => ?resp.get_region_error()); - return Err(Error::FlashbackFailed( - "exec finish flashback failed.".into(), - )); + return Err(Error::FlashbackFailed(format!( + "exec finish flashback failed: resp err is: {:?}, region err is: {:?}", + resp.get_error(), + resp.get_region_error() + ))); } } Ok(()) diff --git a/src/server/debug2.rs b/src/server/debug2.rs index e914b353760..4230828dff1 100644 --- a/src/server/debug2.rs +++ b/src/server/debug2.rs @@ -10,7 +10,7 @@ use engine_traits::{ TabletRegistry, CF_DEFAULT, CF_LOCK, CF_RAFT, CF_WRITE, }; use futures::future::Future; -use keys::{data_key, DATA_MAX_KEY, DATA_PREFIX_KEY}; +use keys::{data_key, enc_end_key, enc_start_key, DATA_MAX_KEY, DATA_PREFIX_KEY}; use kvproto::{ debugpb::Db as DbType, kvrpcpb::MvccInfo, @@ -36,6 +36,34 @@ use crate::{ storage::mvcc::{MvccInfoCollector, MvccInfoScanner}, }; +// `key1` and `key2` should both be 
start_key or end_key. +fn smaller_key<'a>(key1: &'a [u8], key2: &'a [u8], is_end_key: bool) -> &'a [u8] { + if is_end_key && key1.is_empty() { + return key2; + } + if is_end_key && key2.is_empty() { + return key1; + } + if key1 < key2 { + return key1; + } + key2 +} + +// `key1` and `key2` should both be start_key or end_key. +fn larger_key<'a>(key1: &'a [u8], key2: &'a [u8], is_end_key: bool) -> &'a [u8] { + if is_end_key && key1.is_empty() { + return key1; + } + if is_end_key && key2.is_empty() { + return key2; + } + if key1 < key2 { + return key2; + } + key1 +} + // return the region containing the seek_key or the next region if not existed fn seek_region( seek_key: &[u8], @@ -98,11 +126,16 @@ impl MvccInfoIteratorV2 { )?; let tablet = tablet_cache.latest().unwrap(); + let region_start_key = enc_start_key(first_region_state.get_region()); + let region_end_key = enc_end_key(first_region_state.get_region()); + let iter_start = larger_key(start, &region_start_key, false); + let iter_end = smaller_key(end, &region_end_key, true); + assert!(!iter_start.is_empty() && !iter_end.is_empty()); let scanner = Some( MvccInfoScanner::new( |cf, opts| tablet.iterator_opt(cf, opts).map_err(|e| box_err!(e)), - if start.is_empty() { None } else { Some(start) }, - if end.is_empty() { None } else { Some(end) }, + Some(iter_start), + Some(iter_end), MvccInfoCollector::default(), ) .map_err(|e| -> Error { box_err!(e) })?, @@ -171,19 +204,16 @@ impl Iterator for MvccInfoIteratorV2 { ) .unwrap(); let tablet = tablet_cache.latest().unwrap(); + let region_start_key = enc_start_key(&self.cur_region); + let region_end_key = enc_end_key(&self.cur_region); + let iter_start = larger_key(&self.start, &region_start_key, false); + let iter_end = smaller_key(&self.end, &region_end_key, true); + assert!(!iter_start.is_empty() && !iter_end.is_empty()); self.scanner = Some( MvccInfoScanner::new( |cf, opts| tablet.iterator_opt(cf, opts).map_err(|e| box_err!(e)), - if self.start.is_empty() { - None - } else { - Some(self.start.as_bytes()) - }, - if self.end.is_empty() { - None - } else { - Some(self.end.as_bytes()) - }, + Some(iter_start), + Some(iter_end), MvccInfoCollector::default(), ) .unwrap(), @@ -658,19 +688,19 @@ impl Debugger for DebuggerImplV2 { fn region_size<T: AsRef<str>>(&self, region_id: u64, cfs: Vec<T>) -> Result<Vec<(T, usize)>> { match self.raft_engine.get_region_state(region_id, u64::MAX) { Ok(Some(region_state)) => { - if region_state.get_state() != PeerState::Normal { - return Err(Error::NotFound(format!( - "region {:?} has been deleted", - region_id - ))); - } let region = region_state.get_region(); + let state = region_state.get_state(); let start_key = &keys::data_key(region.get_start_key()); let end_key = &keys::data_end_key(region.get_end_key()); let mut sizes = vec![]; let mut tablet_cache = get_tablet_cache(&self.tablet_reg, region.id, Some(region_state))?; - let tablet = tablet_cache.latest().unwrap(); + let Some(tablet) = tablet_cache.latest() else { + return Err(Error::NotFound(format!( + "tablet not found, region_id={:?}, peer_state={:?}", + region_id, state + ))); + }; for cf in cfs { let mut size = 0; box_try!(tablet.scan(cf.as_ref(), start_key, end_key, false, |k, v| { @@ -701,7 +731,7 @@ impl Debugger for DebuggerImplV2 { )); } - let mut region_states = get_all_region_states_with_normal_state(&self.raft_engine); + let mut region_states = get_all_active_region_states(&self.raft_engine); region_states.sort_by(|r1, r2| { r1.get_region() @@ -756,12 +786,21 @@ impl Debugger for DebuggerImplV2 { fn get_all_regions_in_store(&self) -> 
Result> { let mut region_ids = vec![]; + let raft_engine = &self.raft_engine; self.raft_engine .for_each_raft_group::(&mut |region_id| { + let region_state = raft_engine + .get_region_state(region_id, u64::MAX) + .unwrap() + .unwrap(); + if region_state.state == PeerState::Tombstone { + return Ok(()); + } region_ids.push(region_id); Ok(()) }) .unwrap(); + region_ids.sort_unstable(); Ok(region_ids) } @@ -814,21 +853,29 @@ impl Debugger for DebuggerImplV2 { Err(e) => return Err(Error::EngineTrait(e)), }; - if region_state.state != PeerState::Normal { - return Err(Error::NotFound(format!("none region {:?}", region_id))); + let state = region_state.get_state(); + if state == PeerState::Tombstone { + return Err(Error::NotFound(format!( + "region {:?} is tombstone", + region_id + ))); } - let region = region_state.get_region(); - let start = keys::enc_start_key(region); - let end = keys::enc_end_key(region); - - let mut tablet_cache = - get_tablet_cache(&self.tablet_reg, region.id, Some(region_state.clone())).unwrap(); - let tablet = tablet_cache.latest().unwrap(); + let region = region_state.get_region().clone(); + let start = keys::enc_start_key(®ion); + let end = keys::enc_end_key(®ion); + + let mut tablet_cache = get_tablet_cache(&self.tablet_reg, region.id, Some(region_state))?; + let Some(tablet) = tablet_cache.latest() else { + return Err(Error::NotFound(format!( + "tablet not found, region_id={:?}, peer_state={:?}", + region_id, state + ))); + }; let mut res = dump_write_cf_properties(tablet, &start, &end)?; let mut res1 = dump_default_cf_properties(tablet, &start, &end)?; res.append(&mut res1); - let middle_key = match box_try!(get_region_approximate_middle(tablet, region)) { + let middle_key = match box_try!(get_region_approximate_middle(tablet, ®ion)) { Some(data_key) => keys::origin_key(&data_key).to_vec(), None => Vec::new(), }; @@ -1072,9 +1119,7 @@ fn get_tablet_cache( } } -fn get_all_region_states_with_normal_state( - raft_engine: &ER, -) -> Vec { +fn get_all_active_region_states(raft_engine: &ER) -> Vec { let mut region_states = vec![]; raft_engine .for_each_raft_group::(&mut |region_id| { @@ -1082,7 +1127,7 @@ fn get_all_region_states_with_normal_state( .get_region_state(region_id, u64::MAX) .unwrap() .unwrap(); - if region_state.state == PeerState::Normal { + if region_state.state != PeerState::Tombstone { region_states.push(region_state); } Ok(()) @@ -1103,7 +1148,7 @@ fn deivde_regions_for_concurrency( registry: &TabletRegistry, threads: u64, ) -> Result>> { - let region_states = get_all_region_states_with_normal_state(raft_engine); + let region_states = get_all_active_region_states(raft_engine); if threads == 1 { return Ok(vec![ @@ -1154,38 +1199,28 @@ fn deivde_regions_for_concurrency( Ok(regions_groups) } -// `key1` and `key2` should both be start_key or end_key. -fn smaller_key<'a>(key1: &'a [u8], key2: &'a [u8], end_key: bool) -> &'a [u8] { - if end_key && key1.is_empty() { - return key2; - } - if end_key && key2.is_empty() { - return key1; - } - if key1 < key2 { - return key1; - } - key2 -} +#[cfg(any(test, feature = "testexport"))] +pub fn new_debugger(path: &std::path::Path) -> DebuggerImplV2 { + use crate::{config::TikvConfig, server::KvEngineFactoryBuilder}; -// `key1` and `key2` should both be start_key or end_key. 
-fn larger_key<'a>(key1: &'a [u8], key2: &'a [u8], end_key: bool) -> &'a [u8] { - if end_key && key1.is_empty() { - return key1; - } - if end_key && key2.is_empty() { - return key2; - } - if key1 < key2 { - return key2; - } - key1 + let mut cfg = TikvConfig::default(); + cfg.storage.data_dir = path.to_str().unwrap().to_string(); + cfg.raft_store.raftdb_path = cfg.infer_raft_db_path(None).unwrap(); + cfg.raft_engine.mut_config().dir = cfg.infer_raft_engine_path(None).unwrap(); + let cache = cfg.storage.block_cache.build_shared_cache(); + let env = cfg.build_shared_rocks_env(None, None).unwrap(); + + let factory = KvEngineFactoryBuilder::new(env, &cfg, cache, None).build(); + let reg = TabletRegistry::new(Box::new(factory), path).unwrap(); + + let raft_engine = + raft_log_engine::RaftLogEngine::new(cfg.raft_engine.config(), None, None).unwrap(); + + DebuggerImplV2::new(reg, raft_engine, ConfigController::default()) } #[cfg(test)] mod tests { - use std::path::Path; - use collections::HashMap; use engine_traits::{ RaftEngineReadOnly, RaftLogBatch, SyncMutable, ALL_CFS, CF_DEFAULT, CF_LOCK, CF_WRITE, @@ -1196,36 +1231,13 @@ mod tests { raft_serverpb::*, }; use raft::prelude::EntryType; - use raft_log_engine::RaftLogEngine; use raftstore::store::RAFT_INIT_LOG_INDEX; use tikv_util::store::new_peer; use super::*; - use crate::{ - config::TikvConfig, - server::KvEngineFactoryBuilder, - storage::{txn::tests::must_prewrite_put, TestEngineBuilder}, - }; - const INITIAL_TABLET_INDEX: u64 = 5; const INITIAL_APPLY_INDEX: u64 = 5; - fn new_debugger(path: &Path) -> DebuggerImplV2 { - let mut cfg = TikvConfig::default(); - cfg.storage.data_dir = path.to_str().unwrap().to_string(); - cfg.raft_store.raftdb_path = cfg.infer_raft_db_path(None).unwrap(); - cfg.raft_engine.mut_config().dir = cfg.infer_raft_engine_path(None).unwrap(); - let cache = cfg.storage.block_cache.build_shared_cache(); - let env = cfg.build_shared_rocks_env(None, None).unwrap(); - - let factory = KvEngineFactoryBuilder::new(env, &cfg, cache, None).build(); - let reg = TabletRegistry::new(Box::new(factory), path).unwrap(); - - let raft_engine = RaftLogEngine::new(cfg.raft_engine.config(), None, None).unwrap(); - - DebuggerImplV2::new(reg, raft_engine, ConfigController::default()) - } - impl DebuggerImplV2 { fn set_store_id(&self, store_id: u64) { let mut ident = self.get_store_ident().unwrap_or_default(); @@ -1455,126 +1467,10 @@ mod tests { let mut wb = raft_engine.log_batch(10); wb.put_region_state(region_id, 10, &state).unwrap(); raft_engine.consume(&mut wb, true).unwrap(); + debugger.tablet_reg.remove(region_id); debugger.region_size(region_id, cfs.clone()).unwrap_err(); } - // For simplicity, the format of the key is inline with data in - // prepare_data_on_disk - fn extract_key(key: &[u8]) -> &[u8] { - &key[1..4] - } - - // Prepare some data - // Data for each region: - // Region 1: k00 .. k04 - // Region 2: k05 .. k09 - // Region 3: k10 .. k14 - // Region 4: k15 .. k19 - // Region 5: k20 .. k24 - // Region 6: k26 .. 
k28 - fn prepare_data_on_disk(path: &Path) { - let mut cfg = TikvConfig::default(); - cfg.storage.data_dir = path.to_str().unwrap().to_string(); - cfg.raft_store.raftdb_path = cfg.infer_raft_db_path(None).unwrap(); - cfg.raft_engine.mut_config().dir = cfg.infer_raft_engine_path(None).unwrap(); - cfg.gc.enable_compaction_filter = false; - let cache = cfg.storage.block_cache.build_shared_cache(); - let env = cfg.build_shared_rocks_env(None, None).unwrap(); - - let factory = KvEngineFactoryBuilder::new(env, &cfg, cache, None).build(); - let reg = TabletRegistry::new(Box::new(factory), path).unwrap(); - - let raft_engine = RaftLogEngine::new(cfg.raft_engine.config(), None, None).unwrap(); - let mut wb = raft_engine.log_batch(5); - for i in 0..6 { - let mut region = metapb::Region::default(); - let start_key = format!("k{:02}", i * 5); - let end_key = format!("k{:02}", (i + 1) * 5); - region.set_id(i + 1); - region.set_start_key(start_key.into_bytes()); - region.set_end_key(end_key.into_bytes()); - let mut region_state = RegionLocalState::default(); - region_state.set_tablet_index(INITIAL_TABLET_INDEX); - if region.get_id() == 4 { - region_state.set_state(PeerState::Tombstone); - } else if region.get_id() == 6 { - region.set_start_key(b"k26".to_vec()); - region.set_end_key(b"k28".to_vec()); - } - region_state.set_region(region); - - let tablet_path = reg.tablet_path(i + 1, INITIAL_TABLET_INDEX); - // Use tikv_kv::RocksEngine instead of loading tablet from registry in order to - // use prewrite method to prepare mvcc data - let mut engine = TestEngineBuilder::new().path(tablet_path).build().unwrap(); - for i in i * 5..(i + 1) * 5 { - let key = format!("zk{:02}", i); - let val = format!("val{:02}", i); - // Use prewrite only is enough for preparing mvcc data - must_prewrite_put( - &mut engine, - key.as_bytes(), - val.as_bytes(), - key.as_bytes(), - 10, - ); - } - - wb.put_region_state(i + 1, INITIAL_APPLY_INDEX, &region_state) - .unwrap(); - } - raft_engine.consume(&mut wb, true).unwrap(); - } - - #[test] - fn test_scan_mvcc() { - let dir = test_util::temp_dir("test-debugger", false); - prepare_data_on_disk(dir.path()); - let debugger = new_debugger(dir.path()); - // Test scan with bad start, end or limit.
- assert!(debugger.scan_mvcc(b"z", b"", 0).is_err()); - assert!(debugger.scan_mvcc(b"z", b"x", 3).is_err()); - - let verify_scanner = - |range, scanner: &mut dyn Iterator, MvccInfo)>>| { - for i in range { - let key = format!("k{:02}", i).into_bytes(); - assert_eq!(key, extract_key(&scanner.next().unwrap().unwrap().0)); - } - }; - - // full scann - let mut scanner = debugger.scan_mvcc(b"", b"", 100).unwrap(); - verify_scanner(0..15, &mut scanner); - verify_scanner(20..25, &mut scanner); - verify_scanner(26..28, &mut scanner); - assert!(scanner.next().is_none()); - - // Range has more elements than limit - let mut scanner = debugger.scan_mvcc(b"zk01", b"zk09", 5).unwrap(); - verify_scanner(1..6, &mut scanner); - assert!(scanner.next().is_none()); - - // Range has less elements than limit - let mut scanner = debugger.scan_mvcc(b"zk07", b"zk10", 10).unwrap(); - verify_scanner(7..10, &mut scanner); - assert!(scanner.next().is_none()); - - // Start from the key where no region contains it - let mut scanner = debugger.scan_mvcc(b"zk16", b"", 100).unwrap(); - verify_scanner(20..25, &mut scanner); - verify_scanner(26..28, &mut scanner); - assert!(scanner.next().is_none()); - - // Scan a range not existed in the cluster - let mut scanner = debugger.scan_mvcc(b"zk16", b"zk19", 100).unwrap(); - assert!(scanner.next().is_none()); - - // The end key is less than the start_key of the first region - let mut scanner = debugger.scan_mvcc(b"", b"zj", 100).unwrap(); - assert!(scanner.next().is_none()); - } - #[test] fn test_compact() { let dir = test_util::temp_dir("test-debugger", false); @@ -2050,9 +1946,9 @@ mod tests { assert_eq!(region_info_2, region_info_2_before); } - #[test] // It tests that the latest apply state cannot be read as it is invisible // on persisted_applied + #[test] fn test_drop_unapplied_raftlog_2() { let dir = test_util::temp_dir("test-debugger", false); let debugger = new_debugger(dir.path()); @@ -2088,4 +1984,34 @@ mod tests { 80 ); } + + #[test] + fn test_get_all_regions_in_store() { + let dir = test_util::temp_dir("test-debugger", false); + let debugger = new_debugger(dir.path()); + let raft_engine = &debugger.raft_engine; + + init_region_state(raft_engine, 1, &[100, 101], 1); + init_region_state(raft_engine, 3, &[100, 101], 1); + init_region_state(raft_engine, 4, &[100, 101], 1); + + let mut lb = raft_engine.log_batch(3); + + let mut put_tombsotne_region = |region_id: u64| { + let mut region = metapb::Region::default(); + region.set_id(region_id); + let mut region_state = RegionLocalState::default(); + region_state.set_state(PeerState::Tombstone); + region_state.set_region(region.clone()); + lb.put_region_state(region_id, INITIAL_APPLY_INDEX, ®ion_state) + .unwrap(); + raft_engine.consume(&mut lb, true).unwrap(); + }; + + put_tombsotne_region(2); + put_tombsotne_region(5); + + let regions = debugger.get_all_regions_in_store().unwrap(); + assert_eq!(regions, vec![1, 3, 4]); + } } diff --git a/src/server/engine_factory.rs b/src/server/engine_factory.rs index 85de282b137..3593c01ca7f 100644 --- a/src/server/engine_factory.rs +++ b/src/server/engine_factory.rs @@ -56,7 +56,7 @@ impl KvEngineFactoryBuilder { flow_listener: None, sst_recovery_sender: None, encryption_key_manager: key_manager, - db_resources: config.rocksdb.build_resources(env), + db_resources: config.rocksdb.build_resources(env, config.storage.engine), cf_resources: config.rocksdb.build_cf_resources(cache), state_storage: None, lite: false, diff --git a/src/server/gc_worker/OWNERS b/src/server/gc_worker/OWNERS new 
file mode 100644 index 00000000000..0ce3dbff3a6 --- /dev/null +++ b/src/server/gc_worker/OWNERS @@ -0,0 +1,7 @@ +# See the OWNERS docs at https://go.k8s.io/owners +options: + no_parent_owners: true +filters: + "(OWNERS|config\\.rs)$": + approvers: + - sig-critical-approvers-config-src diff --git a/src/server/gc_worker/compaction_filter.rs b/src/server/gc_worker/compaction_filter.rs index 665824a1bac..2bea0cf347b 100644 --- a/src/server/gc_worker/compaction_filter.rs +++ b/src/server/gc_worker/compaction_filter.rs @@ -888,7 +888,7 @@ pub mod test_utils { cfg.ratio_threshold = ratio_threshold; } cfg.enable_compaction_filter = true; - GcWorkerConfigManager(Arc::new(VersionTrack::new(cfg))) + GcWorkerConfigManager(Arc::new(VersionTrack::new(cfg)), None) }; let feature_gate = { let feature_gate = FeatureGate::default(); diff --git a/src/server/gc_worker/config.rs b/src/server/gc_worker/config.rs index 1816dd845e1..809c55e1268 100644 --- a/src/server/gc_worker/config.rs +++ b/src/server/gc_worker/config.rs @@ -3,7 +3,10 @@ use std::sync::Arc; use online_config::{ConfigChange, ConfigManager, OnlineConfig}; -use tikv_util::config::{ReadableSize, VersionTrack}; +use tikv_util::{ + config::{ReadableSize, VersionTrack}, + yatp_pool::FuturePool, +}; const DEFAULT_GC_RATIO_THRESHOLD: f64 = 1.1; pub const DEFAULT_GC_BATCH_KEYS: usize = 512; @@ -22,6 +25,8 @@ pub struct GcConfig { /// greater than 5.0.0. Change `compaction_filter_skip_version_check` /// can enable it by force. pub compaction_filter_skip_version_check: bool, + /// Number of GC worker threads. + pub num_threads: usize, } impl Default for GcConfig { @@ -32,6 +37,7 @@ impl Default for GcConfig { max_write_bytes_per_sec: ReadableSize(DEFAULT_GC_MAX_WRITE_BYTES_PER_SEC), enable_compaction_filter: true, compaction_filter_skip_version_check: false, + num_threads: 1, } } } @@ -41,12 +47,15 @@ impl GcConfig { if self.batch_keys == 0 { return Err("gc.batch_keys should not be 0".into()); } + if self.num_threads == 0 { + return Err("gc.num_threads should not be 0".into()); + } Ok(()) } } #[derive(Clone, Default)] -pub struct GcWorkerConfigManager(pub Arc>); +pub struct GcWorkerConfigManager(pub Arc>, pub Option); impl ConfigManager for GcWorkerConfigManager { fn dispatch( @@ -55,6 +64,16 @@ impl ConfigManager for GcWorkerConfigManager { ) -> std::result::Result<(), Box> { { let change = change.clone(); + if let Some(pool) = self.1.as_ref() { + if let Some(v) = change.get("num_threads") { + let pool_size: usize = v.into(); + pool.scale_pool_size(pool_size); + info!( + "GC worker thread count is changed"; + "new_thread_count" => pool_size, + ); + } + } self.0 .update(move |cfg: &mut GcConfig| cfg.update(change))?; } diff --git a/src/server/gc_worker/gc_manager.rs b/src/server/gc_worker/gc_manager.rs index be18f8216d5..d9c5287b67d 100644 --- a/src/server/gc_worker/gc_manager.rs +++ b/src/server/gc_worker/gc_manager.rs @@ -4,7 +4,7 @@ use std::{ cmp::Ordering, sync::{ atomic::{AtomicU64, Ordering as AtomicOrdering}, - mpsc, Arc, + mpsc, Arc, Condvar, Mutex, }, thread::{self, Builder as ThreadBuilder, JoinHandle}, time::Duration, @@ -20,10 +20,10 @@ use txn_types::{Key, TimeStamp}; use super::{ compaction_filter::is_compaction_filter_allowed, config::GcWorkerConfigManager, - gc_worker::{sync_gc, GcSafePointProvider, GcTask}, + gc_worker::{schedule_gc, GcSafePointProvider, GcTask}, Result, }; -use crate::{server::metrics::*, tikv_util::sys::thread::StdThreadBuildWrapper}; +use crate::{server::metrics::*, storage::Callback,
tikv_util::sys::thread::StdThreadBuildWrapper}; const POLL_SAFE_POINT_INTERVAL_SECS: u64 = 10; @@ -245,6 +245,8 @@ pub(super) struct GcManager GcManager { @@ -254,6 +256,7 @@ impl GcMan worker_scheduler: Scheduler>, cfg_tracker: GcWorkerConfigManager, feature_gate: FeatureGate, + concurrent_tasks: usize, ) -> GcManager { GcManager { cfg, @@ -263,6 +266,7 @@ impl GcMan gc_manager_ctx: GcManagerContext::new(), cfg_tracker, feature_gate, + max_concurrent_tasks: concurrent_tasks, } } @@ -442,13 +446,27 @@ impl GcMan let mut progress = Some(Key::from_encoded(BEGIN_KEY.to_vec())); // Records how many region we have GC-ed. - let mut processed_regions = 0; + let mut scheduled_regions = 0; + let task_controller = Arc::new((Mutex::new(0), Condvar::new())); + // the task_controller is the combination to control the number + // of tasks. The mutex is used for protecting the number of current + // tasks, while the condvar is used for notifying/getting notified when the + // number of current tasks is changed. + let (lock, cvar) = &*task_controller; + let maybe_wait = |max_tasks| { + let mut current_tasks: std::sync::MutexGuard<'_, usize> = lock.lock().unwrap(); + while *current_tasks > max_tasks { + // Wait until the number of current tasks is below the limit + current_tasks = cvar.wait(current_tasks).unwrap(); + } + }; info!("gc_worker: auto gc starts"; "safe_point" => self.curr_safe_point()); // The following loop iterates all regions whose leader is on this TiKV and does // GC on them. At the same time, check whether safe_point is updated // periodically. If it's updated, rewinding will happen. + loop { self.gc_manager_ctx.check_stopped()?; if is_compaction_filter_allowed(&self.cfg_tracker.value(), &self.feature_gate) { @@ -462,9 +480,9 @@ impl GcMan // We have worked to the end and we need to rewind. Restart from beginning. progress = Some(Key::from_encoded(BEGIN_KEY.to_vec())); need_rewind = false; - info!("gc_worker: auto gc rewinds"; "processed_regions" => processed_regions); + info!("gc_worker: auto gc rewinds"; "scheduled_regions" => scheduled_regions); - processed_regions = 0; + scheduled_regions = 0; // Set the metric to zero to show that rewinding has happened. AUTO_GC_PROCESSED_REGIONS_GAUGE_VEC .with_label_values(&[PROCESS_TYPE_GC]) @@ -483,19 +501,40 @@ impl GcMan if finished { // We have worked to the end of the TiKV or our progress has reached `end`, and // we don't need to rewind. In this case, the round of GC has finished. - info!("gc_worker: auto gc finishes"; "processed_regions" => processed_regions); - return Ok(()); + info!("gc_worker: all region tasks are scheduled"; + "processed_regions" => scheduled_regions, + ); + break; } } - assert!(progress.is_some()); // Before doing GC, check whether safe_point is updated periodically to // determine if rewinding is needed.
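The `(Mutex, Condvar)` pair introduced in this hunk is a hand-rolled counting gate: `maybe_wait` blocks the scheduling loop while the in-flight count exceeds the limit, and each completion callback decrements the count and wakes one waiter. A stripped-down sketch of the same pattern, with spawned threads standing in for scheduled GC tasks (all names here are illustrative, not from the patch):

```rust
use std::sync::{Arc, Condvar, Mutex};
use std::thread;

fn main() {
    let max_in_flight = 2usize;
    let gate = Arc::new((Mutex::new(0usize), Condvar::new()));
    let mut handles = Vec::new();

    for task_id in 0..6 {
        let (lock, cvar) = &*gate;
        // Block the scheduler until in-flight tasks drop below the limit.
        let mut in_flight = lock.lock().unwrap();
        while *in_flight >= max_in_flight {
            in_flight = cvar.wait(in_flight).unwrap();
        }
        *in_flight += 1;
        drop(in_flight);

        // The spawned thread plays the role of the completion callback:
        // decrement the counter and wake one waiting scheduler.
        let gate2 = Arc::clone(&gate);
        handles.push(thread::spawn(move || {
            println!("task {task_id} running");
            let (lock, cvar) = &*gate2;
            *lock.lock().unwrap() -= 1;
            cvar.notify_one();
        }));
    }
    for h in handles {
        h.join().unwrap();
    }
}
```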
self.check_if_need_rewind(&progress, &mut need_rewind, &mut end); - progress = self.gc_next_region(progress.unwrap(), &mut processed_regions)?; + let controller: Arc<(Mutex, Condvar)> = Arc::clone(&task_controller); + let cb = Box::new(move |_res| { + let (lock, cvar) = &*controller; + let mut current_tasks = lock.lock().unwrap(); + *current_tasks -= 1; + cvar.notify_one(); + AUTO_GC_PROCESSED_REGIONS_GAUGE_VEC + .with_label_values(&[PROCESS_TYPE_GC]) + .inc(); + }); + maybe_wait(self.max_concurrent_tasks - 1); + let mut current_tasks = lock.lock().unwrap(); + progress = self.async_gc_next_region(progress.unwrap(), cb, &mut current_tasks)?; + scheduled_regions += 1; } + + // wait for all scheduled tasks to finish + self.gc_manager_ctx.check_stopped()?; + maybe_wait(0); + info!("gc_worker: auto gc finishes"; "processed_regions" => scheduled_regions); + + Ok(()) } /// Checks whether we need to rewind in this round of GC. Only used in @@ -536,13 +575,14 @@ impl GcMan } } - /// Does GC on the next region after `from_key`. Returns the end key of the - /// region it processed. If we have processed to the end of all regions, - /// returns `None`. - fn gc_next_region( + /// Does GC on the next region after `from_key` asynchronously. Returns the + /// end key of the region it processed. If we have processed to the end + /// of all regions, returns `None`. + fn async_gc_next_region( &mut self, from_key: Key, - processed_regions: &mut usize, + callback: Callback<()>, + running_tasks: &mut usize, ) -> GcManagerResult> { // Get the information of the next region to do GC. let (region, next_key) = self.get_next_gc_context(from_key); @@ -552,16 +592,16 @@ impl GcMan let hex_end = format!("{:?}", log_wrappers::Value::key(region.get_end_key())); debug!("trying gc"; "region_id" => region.id, "start_key" => &hex_start, "end_key" => &hex_end); - if let Err(e) = sync_gc(&self.worker_scheduler, region, self.curr_safe_point()) { - // Ignore the error and continue, since it's useless to retry this. - // TODO: Find a better way to handle errors. Maybe we should retry.
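Looking back at the `gc.num_threads` option added in config.rs above: the config manager now carries an optional pool handle and resizes the live pool in `dispatch` before recording the new value in the tracked config. A rough sketch of that dispatch shape, with a hypothetical `ScalablePool` trait standing in for yatp's `FuturePool`:

```rust
use std::collections::HashMap;

// Hypothetical stand-in for yatp's FuturePool handle.
trait ScalablePool {
    fn scale_pool_size(&self, size: usize);
}

struct LoggingPool;

impl ScalablePool for LoggingPool {
    fn scale_pool_size(&self, size: usize) {
        println!("pool resized to {size} threads");
    }
}

struct GcConfig {
    num_threads: usize,
}

struct GcWorkerConfigManager<P: ScalablePool> {
    cfg: GcConfig,
    pool: Option<P>,
}

impl<P: ScalablePool> GcWorkerConfigManager<P> {
    // Mirrors the dispatch flow in the patch: resize the live pool first,
    // then persist the new value into the tracked config.
    fn dispatch(&mut self, change: HashMap<String, usize>) {
        if let Some(&n) = change.get("num_threads") {
            if let Some(pool) = self.pool.as_ref() {
                pool.scale_pool_size(n);
            }
            self.cfg.num_threads = n;
        }
    }
}

fn main() {
    let mut mgr = GcWorkerConfigManager {
        cfg: GcConfig { num_threads: 1 },
        pool: Some(LoggingPool),
    };
    mgr.dispatch(HashMap::from([("num_threads".to_owned(), 5)]));
    assert_eq!(mgr.cfg.num_threads, 5);
}
```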
- warn!("failed gc"; "start_key" => &hex_start, "end_key" => &hex_end, "err" => ?e); - } - - *processed_regions += 1; - AUTO_GC_PROCESSED_REGIONS_GAUGE_VEC - .with_label_values(&[PROCESS_TYPE_GC]) - .inc(); + let _ = schedule_gc( + &self.worker_scheduler, + region, + self.curr_safe_point(), + callback, + ) + .map(|_| { + *running_tasks += 1; + Ok::<(), GcManagerError>(()) + }); Ok(next_key) } @@ -710,8 +750,16 @@ mod tests { impl GcManagerTestUtil { pub fn new(regions: BTreeMap, RegionInfo>) -> Self { let (gc_task_sender, gc_task_receiver) = channel(); - let worker = WorkerBuilder::new("test-gc-manager").create(); - let scheduler = worker.start("gc-manager", MockGcRunner { tx: gc_task_sender }); + let worker = WorkerBuilder::new("test-gc-manager") + .thread_count(2) + .create(); + let scheduler = worker.start( + "gc-manager", + MockGcRunner { + tx: gc_task_sender.clone(), + }, + ); + worker.start("gc-manager", MockGcRunner { tx: gc_task_sender }); let (safe_point_sender, safe_point_receiver) = channel(); @@ -731,6 +779,7 @@ mod tests { scheduler, GcWorkerConfigManager::default(), Default::default(), + 2, ); Self { gc_manager: Some(gc_manager), diff --git a/src/server/gc_worker/gc_worker.rs b/src/server/gc_worker/gc_worker.rs index c608470ba87..a0537a478d0 100644 --- a/src/server/gc_worker/gc_worker.rs +++ b/src/server/gc_worker/gc_worker.rs @@ -34,6 +34,7 @@ use tikv_util::{ Either, }; use txn_types::{Key, TimeStamp}; +use yatp::{task::future::TaskCell, Remote}; use super::{ check_need_gc, @@ -178,7 +179,7 @@ where } /// Used to perform GC operations on the engine. -pub struct GcRunner { +pub struct GcRunnerCore { store_id: u64, engine: E, @@ -193,6 +194,26 @@ pub struct GcRunner { stats_map: HashMap, } +impl Clone for GcRunnerCore { + fn clone(&self) -> Self { + GcRunnerCore { + store_id: self.store_id, + engine: self.engine.clone(), + flow_info_sender: self.flow_info_sender.clone(), + limiter: self.limiter.clone(), + cfg: self.cfg.clone(), + cfg_tracker: self.cfg_tracker.clone(), + stats_map: HashMap::default(), + } + } +} + +/// Used to perform GC operations on the engine. 
+pub struct GcRunner { + inner: GcRunnerCore, + pool: Remote, +} + pub const MAX_RAW_WRITE_SIZE: usize = 32 * 1024; pub struct MvccRaw { @@ -282,7 +303,7 @@ fn init_snap_ctx(store_id: u64, region: &Region) -> Context { ctx } -impl GcRunner { +impl GcRunnerCore { pub fn new( store_id: u64, engine: E, @@ -918,18 +939,12 @@ impl GcRunner { error!("failed to flush deletes, will leave garbage"; "err" => ?e); } } -} - -impl Runnable for GcRunner { - type Task = GcTask; #[inline] fn run(&mut self, task: GcTask) { let _io_type_guard = WithIoType::new(IoType::Gc); let enum_label = task.get_enum_label(); - GC_GCTASK_COUNTER_STATIC.get(enum_label).inc(); - let timer = SlowTimer::from_secs(GC_TASK_SLOW_SECONDS); let update_metrics = |is_err| { GC_TASK_DURATION_HISTOGRAM_VEC @@ -941,9 +956,6 @@ impl Runnable for GcRunner { } }; - // Refresh config before handle task - self.refresh_cfg(); - match task { GcTask::Gc { region, @@ -1062,6 +1074,37 @@ impl Runnable for GcRunner { } } +impl GcRunner { + pub fn new( + store_id: u64, + engine: E, + flow_info_sender: Sender, + cfg_tracker: Tracker, + cfg: GcConfig, + pool: Remote, + ) -> Self { + Self { + inner: GcRunnerCore::new(store_id, engine, flow_info_sender, cfg_tracker, cfg), + pool, + } + } +} + +impl Runnable for GcRunner { + type Task = GcTask; + + #[inline] + fn run(&mut self, task: GcTask) { + // Refresh config before handle task + self.inner.refresh_cfg(); + + let mut inner = self.inner.clone(); + self.pool.spawn(async move { + inner.run(task); + }); + } +} + /// When we failed to schedule a `GcTask` to `GcRunner`, use this to handle the /// `ScheduleError`. fn handle_gc_task_schedule_error(e: ScheduleError>) -> Result<()> { @@ -1081,7 +1124,7 @@ fn handle_gc_task_schedule_error(e: ScheduleError>) -> Res } /// Schedules a `GcTask` to the `GcRunner`. 
-fn schedule_gc( +pub fn schedule_gc( scheduler: &Scheduler>, region: Region, safe_point: TimeStamp, @@ -1174,13 +1217,18 @@ impl GcWorker { feature_gate: FeatureGate, region_info_provider: Arc, ) -> Self { - let worker_builder = WorkerBuilder::new("gc-worker").pending_capacity(GC_MAX_PENDING_TASKS); + let worker_builder = WorkerBuilder::new("gc-worker") + .pending_capacity(GC_MAX_PENDING_TASKS) + .thread_count(cfg.num_threads); let worker = worker_builder.create().lazy_build("gc-worker"); let worker_scheduler = worker.scheduler(); GcWorker { engine, flow_info_sender: Some(flow_info_sender), - config_manager: GcWorkerConfigManager(Arc::new(VersionTrack::new(cfg))), + config_manager: GcWorkerConfigManager( + Arc::new(VersionTrack::new(cfg)), + Some(worker.pool()), + ), refs: Arc::new(AtomicUsize::new(1)), worker: Arc::new(Mutex::new(worker)), worker_scheduler, @@ -1219,6 +1267,7 @@ impl GcWorker { self.scheduler(), self.config_manager.clone(), self.feature_gate.clone(), + self.config_manager.value().num_threads, ) .start()?; *handle = Some(new_handle); @@ -1226,14 +1275,20 @@ impl GcWorker { } pub fn start(&mut self, store_id: u64) -> Result<()> { + let mut worker = self.worker.lock().unwrap(); let runner = GcRunner::new( store_id, self.engine.clone(), self.flow_info_sender.take().unwrap(), - self.config_manager.0.clone().tracker("gc-woker".to_owned()), + self.config_manager + .0 + .clone() + .tracker("gc-worker".to_owned()), self.config_manager.value().clone(), + worker.remote(), ); - self.worker.lock().unwrap().start(runner); + worker.start(runner); + Ok(()) } @@ -1296,6 +1351,10 @@ impl GcWorker { pub fn get_config_manager(&self) -> GcWorkerConfigManager { self.config_manager.clone() } + + pub fn get_worker_thread_count(&self) -> usize { + self.worker.lock().unwrap().pool_size() + } } #[cfg(any(test, feature = "testexport"))] @@ -1486,6 +1545,7 @@ mod tests { use engine_traits::Peekable as _; use futures::executor::block_on; use kvproto::{kvrpcpb::ApiVersion, metapb::Peer}; + use online_config::{ConfigChange, ConfigManager, ConfigValue}; use raft::StateRole; use raftstore::coprocessor::{ region_info_accessor::{MockRegionInfoProvider, RegionInfoAccessor}, @@ -1634,10 +1694,12 @@ mod tests { region2.mut_peers().push(new_peer(store_id, 2)); region2.set_start_key(split_key.to_vec()); + let mut gc_config = GcConfig::default(); + gc_config.num_threads = 2; let mut gc_worker = GcWorker::new( engine, tx, - GcConfig::default(), + gc_config, gate, Arc::new(MockRegionInfoProvider::new(vec![region1, region2])), ); @@ -1810,10 +1872,12 @@ mod tests { let mut host = CoprocessorHost::::default(); let ri_provider = RegionInfoAccessor::new(&mut host); + let mut gc_config = GcConfig::default(); + gc_config.num_threads = 2; let mut gc_worker = GcWorker::new( prefixed_engine.clone(), tx, - GcConfig::default(), + gc_config, feature_gate, Arc::new(ri_provider.clone()), ); @@ -1902,13 +1966,13 @@ mod tests { let (tx, _rx) = mpsc::channel(); let cfg = GcConfig::default(); - let mut runner = GcRunner::new( + let mut runner = GcRunnerCore::new( store_id, prefixed_engine.clone(), tx, - GcWorkerConfigManager(Arc::new(VersionTrack::new(cfg.clone()))) + GcWorkerConfigManager(Arc::new(VersionTrack::new(cfg.clone())), None) .0 - .tracker("gc-woker".to_owned()), + .tracker("gc-worker".to_owned()), cfg, ); @@ -1966,13 +2030,13 @@ mod tests { let (tx, _rx) = mpsc::channel(); let cfg = GcConfig::default(); - let mut runner = GcRunner::new( + let mut runner = GcRunnerCore::new( store_id, prefixed_engine.clone(), tx, - 
GcWorkerConfigManager(Arc::new(VersionTrack::new(cfg.clone()))) + GcWorkerConfigManager(Arc::new(VersionTrack::new(cfg.clone())), None) .0 - .tracker("gc-woker".to_owned()), + .tracker("gc-worker".to_owned()), cfg, ); @@ -2067,13 +2131,13 @@ mod tests { let (tx, _rx) = mpsc::channel(); let cfg = GcConfig::default(); - let mut runner = GcRunner::new( + let mut runner = GcRunnerCore::new( 1, prefixed_engine.clone(), tx, - GcWorkerConfigManager(Arc::new(VersionTrack::new(cfg.clone()))) + GcWorkerConfigManager(Arc::new(VersionTrack::new(cfg.clone())), None) .0 - .tracker("gc-woker".to_owned()), + .tracker("gc-worker".to_owned()), cfg, ); @@ -2202,10 +2266,12 @@ mod tests { let mut region = Region::default(); region.mut_peers().push(new_peer(store_id, 1)); + let mut gc_config = GcConfig::default(); + gc_config.num_threads = 2; let mut gc_worker = GcWorker::new( engine.clone(), tx, - GcConfig::default(), + gc_config, gate, Arc::new(MockRegionInfoProvider::new(vec![region.clone()])), ); @@ -2333,7 +2399,7 @@ mod tests { ) -> ( MultiRocksEngine, Arc, - GcRunner, + GcRunnerCore, Vec, mpsc::Receiver, ) { @@ -2386,13 +2452,13 @@ mod tests { ])); let cfg = GcConfig::default(); - let gc_runner = GcRunner::new( + let gc_runner = GcRunnerCore::new( store_id, engine.clone(), tx, - GcWorkerConfigManager(Arc::new(VersionTrack::new(cfg.clone()))) + GcWorkerConfigManager(Arc::new(VersionTrack::new(cfg.clone())), None) .0 - .tracker("gc-woker".to_owned()), + .tracker("gc-worker".to_owned()), cfg, ); @@ -2564,13 +2630,13 @@ mod tests { let ri_provider = Arc::new(MockRegionInfoProvider::new(vec![r1, r2])); let cfg = GcConfig::default(); - let mut gc_runner = GcRunner::new( + let mut gc_runner = GcRunnerCore::new( store_id, engine.clone(), tx, - GcWorkerConfigManager(Arc::new(VersionTrack::new(cfg.clone()))) + GcWorkerConfigManager(Arc::new(VersionTrack::new(cfg.clone())), None) .0 - .tracker("gc-woker".to_owned()), + .tracker("gc-worker".to_owned()), cfg, ); @@ -2756,4 +2822,33 @@ mod tests { test_destroy_range_for_multi_rocksdb_impl(b"k05", b"k195", vec![1, 2]); test_destroy_range_for_multi_rocksdb_impl(b"k099", b"k25", vec![2, 3]); } + + #[test] + fn test_update_gc_thread_count() { + let engine = TestEngineBuilder::new().build().unwrap(); + let (tx, _rx) = mpsc::channel(); + let gate = FeatureGate::default(); + gate.set_version("5.0.0").unwrap(); + let mut gc_config = GcConfig::default(); + gc_config.num_threads = 1; + let gc_worker = GcWorker::new( + engine, + tx, + gc_config, + gate, + Arc::new(MockRegionInfoProvider::new(vec![])), + ); + let mut config_change = ConfigChange::new(); + config_change.insert(String::from("num_threads"), ConfigValue::Usize(5)); + let mut cfg_manager = gc_worker.get_config_manager(); + cfg_manager.dispatch(config_change).unwrap(); + + assert_eq!(gc_worker.get_worker_thread_count(), 5); + + let mut config_change = ConfigChange::new(); + config_change.insert(String::from("num_threads"), ConfigValue::Usize(2)); + cfg_manager.dispatch(config_change).unwrap(); + + assert_eq!(gc_worker.get_worker_thread_count(), 2); + } } diff --git a/src/server/lock_manager/OWNERS b/src/server/lock_manager/OWNERS new file mode 100644 index 00000000000..0ce3dbff3a6 --- /dev/null +++ b/src/server/lock_manager/OWNERS @@ -0,0 +1,7 @@ +# See the OWNERS docs at https://go.k8s.io/owners +options: + no_parent_owners: true +filters: + "(OWNERS|config\\.rs)$": + approvers: + - sig-critical-approvers-config-src diff --git a/src/server/lock_manager/deadlock.rs b/src/server/lock_manager/deadlock.rs index 
9583df80dd6..fd749cc3175 100644 --- a/src/server/lock_manager/deadlock.rs +++ b/src/server/lock_manager/deadlock.rs @@ -1119,7 +1119,7 @@ pub mod tests { use tikv_util::worker::FutureWorker; use super::*; - use crate::server::resolve::Callback; + use crate::server::resolve; #[test] fn test_detect_table() { @@ -1467,15 +1467,6 @@ pub mod tests { impl PdClient for MockPdClient {} - #[derive(Clone)] - pub(crate) struct MockResolver; - - impl StoreAddrResolver for MockResolver { - fn resolve(&self, _store_id: u64, _cb: Callback) -> Result<()> { - Err(Error::Other(box_err!("unimplemented"))) - } - } - fn start_deadlock_detector( host: &mut CoprocessorHost, ) -> (FutureWorker, Scheduler) { @@ -1485,7 +1476,7 @@ pub mod tests { let detector_runner = Detector::new( 1, Arc::new(MockPdClient {}), - MockResolver {}, + resolve::MockStoreAddrResolver::default(), Arc::new(SecurityManager::new(&SecurityConfig::default()).unwrap()), waiter_mgr_scheduler, &Config::default(), diff --git a/src/server/lock_manager/mod.rs b/src/server/lock_manager/mod.rs index 243d533a0e5..c42531ae0fd 100644 --- a/src/server/lock_manager/mod.rs +++ b/src/server/lock_manager/mod.rs @@ -318,7 +318,7 @@ mod tests { use self::{deadlock::tests::*, metrics::*, waiter_manager::tests::*}; use super::*; - use crate::storage::lock_manager::LockDigest; + use crate::{server::resolve::MockStoreAddrResolver, storage::lock_manager::LockDigest}; fn start_lock_manager() -> LockManager { let mut coprocessor_host = CoprocessorHost::::default(); @@ -336,7 +336,7 @@ mod tests { .start( 1, Arc::new(MockPdClient {}), - MockResolver {}, + MockStoreAddrResolver::default(), Arc::new(SecurityManager::new(&SecurityConfig::default()).unwrap()), &cfg, ) diff --git a/src/server/lock_manager/waiter_manager.rs b/src/server/lock_manager/waiter_manager.rs index c0e97e25e3a..f1b3c9b3dbe 100644 --- a/src/server/lock_manager/waiter_manager.rs +++ b/src/server/lock_manager/waiter_manager.rs @@ -554,7 +554,9 @@ impl WaiterManager { continue; } - if let Some((previous_wait_info, diag_ctx)) = previous_wait_info { + if let Some((previous_wait_info, diag_ctx)) = previous_wait_info + && previous_wait_info.allow_lock_with_conflict + { self.detector_scheduler .clean_up_wait_for(event.start_ts, previous_wait_info); self.detector_scheduler @@ -678,6 +680,7 @@ pub mod tests { key: Key::from_raw(b""), lock_digest: LockDigest { ts: lock_ts, hash }, lock_info: Default::default(), + allow_lock_with_conflict: false, }, cancel_callback: Box::new(|_| ()), diag_ctx: DiagnosticContext::default(), @@ -798,6 +801,7 @@ pub mod tests { key: Key::from_raw(&raw_key), lock_digest: lock, lock_info: info.clone(), + allow_lock_with_conflict: false, }, cb, Instant::now() + Duration::from_millis(3000), @@ -1202,6 +1206,7 @@ pub mod tests { key: key.to_raw().unwrap(), ..Default::default() }, + allow_lock_with_conflict: false, }, }; scheduler.update_wait_for(vec![event]); diff --git a/src/server/metrics.rs b/src/server/metrics.rs index 2745be59a71..e3c9029c12f 100644 --- a/src/server/metrics.rs +++ b/src/server/metrics.rs @@ -86,6 +86,7 @@ make_auto_flush_static_metric! { failed, success, tombstone, + not_found, } pub label_enum ReplicaReadLockCheckResult { @@ -208,10 +209,11 @@ lazy_static! { &["type"] ) .unwrap(); + // TODO: deprecate the "name" label in v8.0. 
pub static ref GRPC_RESOURCE_GROUP_COUNTER_VEC: IntCounterVec = register_int_counter_vec!( "tikv_grpc_resource_group_total", "Total number of handle grpc message for each resource group", - &["name"] + &["name", "resource_group"] ) .unwrap(); pub static ref GRPC_PROXY_MSG_COUNTER_VEC: IntCounterVec = register_int_counter_vec!( diff --git a/src/server/node.rs b/src/server/node.rs index 228f679ed14..4e7ca0fcfe6 100644 --- a/src/server/node.rs +++ b/src/server/node.rs @@ -20,8 +20,8 @@ use raftstore::{ store::{ self, fsm::{store::StoreMeta, ApplyRouter, RaftBatchSystem, RaftRouter}, - initial_region, AutoSplitController, Config as StoreConfig, GlobalReplicationState, PdTask, - RefreshConfigTask, SnapManager, SplitCheckTask, Transport, + initial_region, AutoSplitController, Config as StoreConfig, DiskCheckRunner, + GlobalReplicationState, PdTask, RefreshConfigTask, SnapManager, SplitCheckTask, Transport, }, }; use resource_metering::CollectorRegHandle; @@ -173,6 +173,7 @@ where concurrency_manager: ConcurrencyManager, collector_reg_handle: CollectorRegHandle, causal_ts_provider: Option>, // used for rawkv apiv2 + disk_check_runner: DiskCheckRunner, grpc_service_mgr: GrpcServiceManager, safe_point: Arc, ) -> Result<()> @@ -212,6 +213,7 @@ where concurrency_manager, collector_reg_handle, causal_ts_provider, + disk_check_runner, grpc_service_mgr, safe_point, )?; @@ -461,6 +463,7 @@ where concurrency_manager: ConcurrencyManager, collector_reg_handle: CollectorRegHandle, causal_ts_provider: Option>, // used for rawkv apiv2 + disk_check_runner: DiskCheckRunner, grpc_service_mgr: GrpcServiceManager, safe_point: Arc, ) -> Result<()> @@ -496,6 +499,7 @@ where collector_reg_handle, self.health_service.clone(), causal_ts_provider, + disk_check_runner, grpc_service_mgr, safe_point, )?; diff --git a/src/server/raft_client.rs b/src/server/raft_client.rs index f30e5b36045..b120011c490 100644 --- a/src/server/raft_client.rs +++ b/src/server/raft_client.rs @@ -46,8 +46,11 @@ use tikv_util::{ use yatp::{task::future::TaskCell, ThreadPool}; use crate::server::{ - self, load_statistics::ThreadLoadPool, metrics::*, snap::Task as SnapTask, Config, - StoreAddrResolver, + load_statistics::ThreadLoadPool, + metrics::*, + resolve::{Error as ResolveError, Result as ResolveResult}, + snap::Task as SnapTask, + Config, StoreAddrResolver, }; pub struct MetadataSourceStoreId {} @@ -642,7 +645,7 @@ where S: StoreAddrResolver, R: RaftExtension + Unpin + 'static, { - fn resolve(&self) -> impl Future> { + fn resolve(&self) -> impl Future> { let (tx, rx) = oneshot::channel(); let store_id = self.store_id; let res = self.builder.resolver.resolve( @@ -673,7 +676,7 @@ where res?; match rx.await { Ok(a) => a, - Err(_) => Err(server::Error::Other( + Err(_) => Err(ResolveError::Other( "failed to receive resolve result".into(), )), } @@ -824,8 +827,7 @@ async fn start( RESOLVE_STORE_COUNTER.with_label_values(&["failed"]).inc(); back_end.clear_pending_message("resolve"); error_unknown!(?e; "resolve store address failed"; "store_id" => back_end.store_id,); - // TOMBSTONE - if format!("{}", e).contains("has been removed") { + if let ResolveError::StoreTombstone(_) = e { let mut pool = pool.lock().unwrap(); if let Some(s) = pool.connections.remove(&(back_end.store_id, conn_id)) { s.set_conn_state(ConnState::Disconnected); @@ -940,7 +942,7 @@ struct CachedQueue { /// ```text /// for m in msgs { /// if !raft_client.send(m) { -/// // handle error. +/// // handle error. 
/// } /// } /// raft_client.flush(); diff --git a/src/server/raftkv/mod.rs b/src/server/raftkv/mod.rs index 2074d469310..9e6cc711558 100644 --- a/src/server/raftkv/mod.rs +++ b/src/server/raftkv/mod.rs @@ -45,7 +45,7 @@ use raftstore::{ router::{LocalReadRouter, RaftStoreRouter}, store::{ self, util::encode_start_ts_into_flag_data, Callback as StoreCallback, RaftCmdExtraOpts, - ReadCallback, ReadIndexContext, ReadResponse, RegionSnapshot, StoreMsg, WriteResponse, + ReadIndexContext, ReadResponse, RegionSnapshot, StoreMsg, WriteResponse, }, }; use thiserror::Error; @@ -55,7 +55,7 @@ use tikv_util::{ future::{paired_future_callback, paired_must_called_future_callback}, time::Instant, }; -use tracker::GLOBAL_TRACKERS; +use tracker::{get_tls_tracker_token, GLOBAL_TRACKERS}; use txn_types::{Key, TimeStamp, TxnExtra, TxnExtraScheduler, WriteBatchFlags}; use super::metrics::*; @@ -548,6 +548,10 @@ where }); let mut res = match on_write_result::(resp) { Ok(CmdRes::Resp(_)) => { + ASYNC_REQUESTS_COUNTER_VEC.write.success.inc(); + ASYNC_REQUESTS_DURATIONS_VEC + .write + .observe(begin_instant.saturating_elapsed_secs()); fail_point!("raftkv_async_write_finish"); Ok(()) } @@ -581,18 +585,9 @@ where tx.notify(res); } rx.inspect(move |ev| { - let WriteEvent::Finished(res) = ev else { return }; - match res { - Ok(()) => { - ASYNC_REQUESTS_COUNTER_VEC.write.success.inc(); - ASYNC_REQUESTS_DURATIONS_VEC - .write - .observe(begin_instant.saturating_elapsed_secs()); - } - Err(e) => { - let status_kind = get_status_kind_from_engine_error(e); - ASYNC_REQUESTS_COUNTER_VEC.write.get(status_kind).inc(); - } + if let WriteEvent::Finished(Err(e)) = ev { + let status_kind = get_status_kind_from_engine_error(e); + ASYNC_REQUESTS_COUNTER_VEC.write.get(status_kind).inc(); } }) } @@ -639,10 +634,39 @@ where let mut cmd = RaftCmdRequest::default(); cmd.set_header(header); cmd.set_requests(vec![req].into()); + let tracker = get_tls_tracker_token(); let store_cb = StoreCallback::read(Box::new(move |resp| { - cb(on_read_result(resp).map_err(Error::into)); + let res = on_read_result(resp).map_err(Error::into); + if res.is_ok() { + let elapse = begin_instant.saturating_elapsed_secs(); + GLOBAL_TRACKERS.with_tracker(tracker, |tracker| { + if tracker.metrics.read_index_propose_wait_nanos > 0 { + ASYNC_REQUESTS_DURATIONS_VEC + .snapshot_read_index_propose_wait + .observe( + tracker.metrics.read_index_propose_wait_nanos as f64 + / 1_000_000_000.0, + ); + // snapshot may be handled by lease read in raftstore + if tracker.metrics.read_index_confirm_wait_nanos > 0 { + ASYNC_REQUESTS_DURATIONS_VEC + .snapshot_read_index_confirm + .observe( + tracker.metrics.read_index_confirm_wait_nanos as f64 + / 1_000_000_000.0, + ); + } + } else if tracker.metrics.local_read { + ASYNC_REQUESTS_DURATIONS_VEC + .snapshot_local_read + .observe(elapse); + } + }); + ASYNC_REQUESTS_DURATIONS_VEC.snapshot.observe(elapse); + ASYNC_REQUESTS_COUNTER_VEC.snapshot.success.inc(); + } + cb(res); })); - let tracker = store_cb.read_tracker().unwrap(); if res.is_ok() { res = self @@ -673,35 +697,7 @@ where }; Err(e) } - Ok(CmdRes::Snap(s)) => { - let elapse = begin_instant.saturating_elapsed_secs(); - GLOBAL_TRACKERS.with_tracker(tracker, |tracker| { - if tracker.metrics.read_index_propose_wait_nanos > 0 { - ASYNC_REQUESTS_DURATIONS_VEC - .snapshot_read_index_propose_wait - .observe( - tracker.metrics.read_index_propose_wait_nanos as f64 - / 1_000_000_000.0, - ); - // snapshot may be hanlded by lease read in raftstore - if 
tracker.metrics.read_index_confirm_wait_nanos > 0 { - ASYNC_REQUESTS_DURATIONS_VEC - .snapshot_read_index_confirm - .observe( - tracker.metrics.read_index_confirm_wait_nanos as f64 - / 1_000_000_000.0, - ); - } - } else if tracker.metrics.local_read { - ASYNC_REQUESTS_DURATIONS_VEC - .snapshot_local_read - .observe(elapse); - } - }); - ASYNC_REQUESTS_DURATIONS_VEC.snapshot.observe(elapse); - ASYNC_REQUESTS_COUNTER_VEC.snapshot.success.inc(); - Ok(s) - } + Ok(CmdRes::Snap(s)) => Ok(s), Err(e) => { let status_kind = get_status_kind_from_engine_error(&e); ASYNC_REQUESTS_COUNTER_VEC.snapshot.get(status_kind).inc(); diff --git a/src/server/raftkv/raft_extension.rs b/src/server/raftkv/raft_extension.rs index d3178842489..733d60c838c 100644 --- a/src/server/raftkv/raft_extension.rs +++ b/src/server/raftkv/raft_extension.rs @@ -121,6 +121,7 @@ where split_keys, callback: raftstore::store::Callback::write(cb), source: source.into(), + share_source_region_size: false, }; let res = self.router.send_casual_msg(region_id, req); Box::pin(async move { diff --git a/src/server/raftkv2/mod.rs b/src/server/raftkv2/mod.rs index dacc90a91f0..64c5a1a3ca1 100644 --- a/src/server/raftkv2/mod.rs +++ b/src/server/raftkv2/mod.rs @@ -19,7 +19,13 @@ use kvproto::{ }; pub use node::NodeV2; pub use raft_extension::Extension; -use raftstore::store::{util::encode_start_ts_into_flag_data, RegionSnapshot}; +use raftstore::{ + store::{ + cmd_resp, msg::ErrorCallback, util::encode_start_ts_into_flag_data, RaftCmdExtraOpts, + RegionSnapshot, + }, + Error, +}; use raftstore_v2::{ router::{ message::SimpleWrite, CmdResChannelBuilder, CmdResEvent, CmdResStream, PeerMsg, RaftRouter, @@ -28,6 +34,7 @@ use raftstore_v2::{ }; use tikv_kv::{Modify, WriteEvent}; use tikv_util::time::Instant; +use tracker::{get_tls_tracker_token, GLOBAL_TRACKERS}; use txn_types::{TxnExtra, TxnExtraScheduler, WriteBatchFlags}; use super::{ @@ -172,7 +179,7 @@ impl tikv_kv::Engine for RaftKv2 { .set_key_ranges(mem::take(&mut ctx.key_ranges).into()); } ASYNC_REQUESTS_COUNTER_VEC.snapshot.all.inc(); - let begin_instant = Instant::now_coarse(); + let begin_instant = Instant::now(); let mut header = new_request_header(ctx.pb_ctx); let mut flags = 0; @@ -200,9 +207,36 @@ impl tikv_kv::Engine for RaftKv2 { let res = f.await; match res { Ok(snap) => { - ASYNC_REQUESTS_DURATIONS_VEC - .snapshot - .observe(begin_instant.saturating_elapsed_secs()); + let elapse = begin_instant.saturating_elapsed_secs(); + let tracker = get_tls_tracker_token(); + GLOBAL_TRACKERS.with_tracker(tracker, |tracker| { + if tracker.metrics.read_index_propose_wait_nanos > 0 { + ASYNC_REQUESTS_DURATIONS_VEC + .snapshot_read_index_propose_wait + .observe( + tracker.metrics.read_index_propose_wait_nanos as f64 + / 1_000_000_000.0, + ); + // snapshot may be handled by lease read in raftstore + if tracker.metrics.read_index_confirm_wait_nanos > 0 { + ASYNC_REQUESTS_DURATIONS_VEC + .snapshot_read_index_confirm + .observe( + tracker.metrics.read_index_confirm_wait_nanos as f64 + / 1_000_000_000.0, + ); + } + } else if tracker.metrics.local_read { + ASYNC_REQUESTS_DURATIONS_VEC + .snapshot_local_read + .observe(elapse); + } + }); + // The observed snapshot duration is larger than the actual + // snapshot duration, because it includes the waiting time + // of this future. + // TODO: Fix the inaccuracy, see #17581. 
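Both engine paths now classify snapshot latency from tracker metrics: a non-zero read-index propose wait marks a read-index snapshot (with an optional confirm wait), otherwise the local-read flag marks a lease read. A condensed sketch of that classification, with a plain struct standing in for the global tracker and the same nanosecond-to-second conversion the patch uses:

```rust
#[derive(Default)]
struct TrackerMetrics {
    read_index_propose_wait_nanos: u64,
    read_index_confirm_wait_nanos: u64,
    local_read: bool,
}

// Decide which duration histogram an observed snapshot belongs to.
fn classify_snapshot(m: &TrackerMetrics) -> &'static str {
    if m.read_index_propose_wait_nanos > 0 {
        // A proposed read index may still skip the confirm phase when the
        // snapshot is served by lease read in raftstore.
        if m.read_index_confirm_wait_nanos > 0 {
            "snapshot_read_index_confirm"
        } else {
            "snapshot_read_index_propose_wait"
        }
    } else if m.local_read {
        "snapshot_local_read"
    } else {
        "snapshot"
    }
}

fn main() {
    let m = TrackerMetrics {
        read_index_propose_wait_nanos: 1_500_000,
        ..Default::default()
    };
    // The patch converts tracker nanos to seconds before observing:
    let secs = m.read_index_propose_wait_nanos as f64 / 1_000_000_000.0;
    assert_eq!(classify_snapshot(&m), "snapshot_read_index_propose_wait");
    assert!(secs > 0.0);
}
```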
+ ASYNC_REQUESTS_DURATIONS_VEC.snapshot.observe(elapse); ASYNC_REQUESTS_COUNTER_VEC.snapshot.success.inc(); Ok(snap) } @@ -241,6 +275,17 @@ impl tikv_kv::Engine for RaftKv2 { let region_id = ctx.region_id; ASYNC_REQUESTS_COUNTER_VEC.write.all.inc(); + + let inject_region_not_found = (|| { + // If rid is some, only the specified region reports error. + // If rid is None, all regions report error. + fail_point!("raftkv_early_error_report", |rid| -> bool { + rid.and_then(|rid| rid.parse().ok()) + .map_or(true, |rid: u64| rid == region_id) + }); + false + })(); + let begin_instant = Instant::now_coarse(); let mut header = Box::new(new_request_header(ctx)); let mut flags = 0; @@ -264,45 +309,48 @@ impl tikv_kv::Engine for RaftKv2 { if WriteEvent::subscribed_committed(subscribed) { builder.subscribe_committed(); } - if let Some(cb) = on_applied { - builder.before_set(move |resp| { - let mut res = if !resp.get_header().has_error() { - Ok(()) - } else { - Err(tikv_kv::Error::from(resp.get_header().get_error().clone())) - }; + builder.before_set(move |resp| { + let mut res = if !resp.get_header().has_error() { + ASYNC_REQUESTS_COUNTER_VEC.write.success.inc(); + ASYNC_REQUESTS_DURATIONS_VEC + .write + .observe(begin_instant.saturating_elapsed_secs()); + Ok(()) + } else { + Err(tikv_kv::Error::from(resp.get_header().get_error().clone())) + }; + if let Some(cb) = on_applied { cb(&mut res); - }); - } - let (ch, sub) = builder.build(); - let msg = PeerMsg::SimpleWrite(SimpleWrite { - header, - data, - ch, - send_time: Instant::now_coarse(), + } }); - let res = self - .router - .store_router() - .check_send(region_id, msg) - .map_err(tikv_kv::Error::from); + let (ch, sub) = builder.build(); + let res = if inject_region_not_found { + ch.report_error(cmd_resp::new_error(Error::RegionNotFound(region_id))); + Err(tikv_kv::Error::from(Error::RegionNotFound(region_id))) + } else { + let msg = PeerMsg::SimpleWrite(SimpleWrite { + header, + data, + ch, + send_time: Instant::now_coarse(), + extra_opts: RaftCmdExtraOpts { + deadline: batch.deadline, + disk_full_opt: batch.disk_full_opt, + }, + }); + self.router + .store_router() + .check_send(region_id, msg) + .map_err(tikv_kv::Error::from) + }; (Transform { resp: CmdResStream::new(sub), early_err: res.err(), }) .inspect(move |ev| { - let WriteEvent::Finished(res) = ev else { return }; - match res { - Ok(()) => { - ASYNC_REQUESTS_COUNTER_VEC.write.success.inc(); - ASYNC_REQUESTS_DURATIONS_VEC - .write - .observe(begin_instant.saturating_elapsed_secs()); - } - Err(e) => { - let status_kind = get_status_kind_from_engine_error(e); - ASYNC_REQUESTS_COUNTER_VEC.write.get(status_kind).inc(); - } + if let WriteEvent::Finished(Err(e)) = ev { + let status_kind = get_status_kind_from_engine_error(e); + ASYNC_REQUESTS_COUNTER_VEC.write.get(status_kind).inc(); } }) } diff --git a/src/server/raftkv2/raft_extension.rs b/src/server/raftkv2/raft_extension.rs index f2f433999b9..8b15c73fb65 100644 --- a/src/server/raftkv2/raft_extension.rs +++ b/src/server/raftkv2/raft_extension.rs @@ -49,6 +49,11 @@ impl tikv_kv::RaftExtension for Extension .send_control(StoreMsg::StoreUnreachable { to_store_id }); } + fn report_store_maybe_tombstone(&self, store_id: u64) { + self.router + .broadcast_normal(|| PeerMsg::StoreMaybeTombstone { store_id }); + } + fn report_snapshot_status( &self, region_id: u64, @@ -71,7 +76,7 @@ impl tikv_kv::RaftExtension for Extension split_keys: Vec>, source: String, ) -> futures::future::BoxFuture<'static, tikv_kv::Result>> { - let (msg, sub) = 
PeerMsg::request_split(region_epoch, split_keys, source); + let (msg, sub) = PeerMsg::request_split(region_epoch, split_keys, source, true); let res = self.router.check_send(region_id, msg); Box::pin(async move { res?; diff --git a/src/server/resolve.rs b/src/server/resolve.rs index c831ff28d17..013511183e2 100644 --- a/src/server/resolve.rs +++ b/src/server/resolve.rs @@ -1,6 +1,7 @@ // Copyright 2016 TiKV Project Authors. Licensed under Apache-2.0. use std::{ + error::Error as StdError, fmt::{self, Display, Formatter}, sync::{Arc, Mutex}, }; @@ -9,16 +10,28 @@ use collections::HashMap; use kvproto::replication_modepb::ReplicationMode; use pd_client::{take_peer_address, PdClient}; use raftstore::store::GlobalReplicationState; +use thiserror::Error; use tikv_kv::RaftExtension; use tikv_util::{ + info, time::Instant, worker::{Runnable, Scheduler, Worker}, }; -use super::{metrics::*, Result}; +use super::metrics::*; const STORE_ADDRESS_REFRESH_SECONDS: u64 = 60; +#[derive(Debug, Error)] +pub enum Error { + #[error("{0:?}")] + Other(#[from] Box), + #[error("store {0} has been removed")] + StoreTombstone(u64), +} + +pub type Result = std::result::Result; + pub type Callback = Box) + Send>; pub fn store_address_refresh_interval_secs() -> u64 { @@ -95,9 +108,21 @@ where // it explicitly. Err(pd_client::Error::StoreTombstone(_)) => { RESOLVE_STORE_COUNTER_STATIC.tombstone.inc(); - return Err(box_err!("store {} has been removed", store_id)); + self.router.report_store_maybe_tombstone(store_id); + return Err(Error::StoreTombstone(store_id)); + } + Err(e) => { + // Tombstone store may be removed manually or automatically + // after 30 days of deletion. PD returns + // "invalid store ID %d, not found" for such store id. + // See https://github.com/tikv/pd/blob/v7.3.0/server/grpc_service.go#L777-L780 + if format!("{:?}", e).contains("not found") { + RESOLVE_STORE_COUNTER_STATIC.not_found.inc(); + info!("resolve store not found"; "store_id" => store_id); + self.router.report_store_maybe_tombstone(store_id); + } + return Err(box_err!(e)); } - Err(e) => return Err(box_err!(e)), }; let mut group_id = None; let mut state = self.state.lock().unwrap(); @@ -181,6 +206,25 @@ impl StoreAddrResolver for PdStoreAddrResolver { } } +#[derive(Clone)] +pub struct MockStoreAddrResolver { + pub resolve_fn: Arc Result<()> + Send + Sync>, +} + +impl StoreAddrResolver for MockStoreAddrResolver { + fn resolve(&self, store_id: u64, cb: Callback) -> Result<()> { + (self.resolve_fn)(store_id, cb) + } +} + +impl Default for MockStoreAddrResolver { + fn default() -> MockStoreAddrResolver { + MockStoreAddrResolver { + resolve_fn: Arc::new(|_, _| unimplemented!()), + } + } +} + #[cfg(test)] mod tests { use std::{net::SocketAddr, ops::Sub, str::FromStr, sync::Arc, thread, time::Duration}; diff --git a/src/server/server.rs b/src/server/server.rs index 948930ae7ae..9f62fe583cf 100644 --- a/src/server/server.rs +++ b/src/server/server.rs @@ -101,6 +101,8 @@ where .http2_max_ping_strikes(i32::MAX) // For pings without data from clients. 
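The resolve.rs change above replaces stringly-typed tombstone detection with a dedicated error enum, so raft_client can match on `ResolveError::StoreTombstone(_)` instead of searching the formatted message. A minimal dependency-free sketch of the idea (the patch itself derives these impls with thiserror):

```rust
use std::fmt;

#[derive(Debug)]
enum ResolveError {
    // The patch derives Display and Error with thiserror; spelled out
    // here to keep the sketch dependency-free.
    Other(String),
    StoreTombstone(u64),
}

impl fmt::Display for ResolveError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            ResolveError::Other(msg) => write!(f, "{msg}"),
            ResolveError::StoreTombstone(id) => write!(f, "store {id} has been removed"),
        }
    }
}

fn handle_resolve_failure(e: &ResolveError) {
    // Before the patch this was a substring search over the formatted
    // error; a typed variant makes the intent explicit and refactor-safe.
    match e {
        ResolveError::StoreTombstone(store_id) => {
            println!("evicting connection to tombstone store {store_id}");
        }
        ResolveError::Other(msg) => println!("resolve failed: {msg}"),
    }
}

fn main() {
    handle_resolve_failure(&ResolveError::StoreTombstone(42));
    handle_resolve_failure(&ResolveError::Other("timeout".to_owned()));
}
```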
.keepalive_time(self.cfg.value().grpc_keepalive_time.into()) .keepalive_timeout(self.cfg.value().grpc_keepalive_timeout.into()) + .default_compression_algorithm(self.cfg.value().grpc_compression_algorithm()) + .default_gzip_compression_level(self.cfg.value().grpc_gzip_compression_level) .build_args(); let sb = ServerBuilder::new(Arc::clone(&env)) @@ -437,6 +439,7 @@ pub mod test_router { use engine_rocks::{RocksEngine, RocksSnapshot}; use kvproto::raft_serverpb::RaftMessage; use raftstore::{router::RaftStoreRouter, store::*, Result as RaftStoreResult}; + use tikv_util::time::Instant as TiInstant; use super::*; @@ -471,14 +474,18 @@ pub mod test_router { cmd: RaftCommand, ) -> std::result::Result<(), crossbeam::channel::TrySendError>> { - let _ = self.tx.send(Either::Left(PeerMsg::RaftCommand(cmd))); + let _ = self + .tx + .send(Either::Left(PeerMsg::RaftCommand(Box::new(cmd)))); Ok(()) } } impl CasualRouter for TestRaftStoreRouter { fn send(&self, _: u64, msg: CasualMessage) -> RaftStoreResult<()> { - let _ = self.tx.send(Either::Left(PeerMsg::CasualMessage(msg))); + let _ = self + .tx + .send(Either::Left(PeerMsg::CasualMessage(Box::new(msg)))); Ok(()) } } @@ -496,12 +503,10 @@ pub mod test_router { impl RaftStoreRouter for TestRaftStoreRouter { fn send_raft_msg(&self, msg: RaftMessage) -> RaftStoreResult<()> { - let _ = self - .tx - .send(Either::Left(PeerMsg::RaftMessage(InspectedRaftMessage { - heap_size: 0, - msg, - }))); + let _ = self.tx.send(Either::Left(PeerMsg::RaftMessage( + Box::new(InspectedRaftMessage { heap_size: 0, msg }), + Some(TiInstant::now()), + ))); Ok(()) } @@ -533,8 +538,8 @@ mod tests { use super::{ super::{ - resolve::{Callback as ResolveCallback, StoreAddrResolver}, - Config, Result, + resolve::{self, Callback as ResolveCallback, StoreAddrResolver}, + Config, }, *, }; @@ -552,7 +557,7 @@ mod tests { } impl StoreAddrResolver for MockResolver { - fn resolve(&self, _: u64, cb: ResolveCallback) -> Result<()> { + fn resolve(&self, _: u64, cb: ResolveCallback) -> resolve::Result<()> { if self.quick_fail.load(Ordering::SeqCst) { return Err(box_err!("quick fail")); } diff --git a/src/server/service/kv.rs b/src/server/service/kv.rs index 5a4327ba46e..a528e40e484 100644 --- a/src/server/service/kv.rs +++ b/src/server/service/kv.rs @@ -202,7 +202,7 @@ macro_rules! 
handle_request { resource_manager.consume_penalty(resource_control_ctx); } GRPC_RESOURCE_GROUP_COUNTER_VEC - .with_label_values(&[resource_control_ctx.get_resource_group_name()]) + .with_label_values(&[resource_control_ctx.get_resource_group_name(), resource_control_ctx.get_resource_group_name()]) .inc(); let resp = $future_name(&self.storage, req); let task = async move { @@ -484,7 +484,10 @@ impl Tikv for Service { resource_manager.consume_penalty(resource_control_ctx); } GRPC_RESOURCE_GROUP_COUNTER_VEC - .with_label_values(&[resource_control_ctx.get_resource_group_name()]) + .with_label_values(&[ + resource_control_ctx.get_resource_group_name(), + resource_control_ctx.get_resource_group_name(), + ]) .inc(); let begin_instant = Instant::now(); @@ -522,7 +525,10 @@ impl Tikv for Service { resource_manager.consume_penalty(resource_control_ctx); } GRPC_RESOURCE_GROUP_COUNTER_VEC - .with_label_values(&[resource_control_ctx.get_resource_group_name()]) + .with_label_values(&[ + resource_control_ctx.get_resource_group_name(), + resource_control_ctx.get_resource_group_name(), + ]) .inc(); let begin_instant = Instant::now(); @@ -611,7 +617,10 @@ impl Tikv for Service { resource_manager.consume_penalty(resource_control_ctx); } GRPC_RESOURCE_GROUP_COUNTER_VEC - .with_label_values(&[resource_control_ctx.get_resource_group_name()]) + .with_label_values(&[ + resource_control_ctx.get_resource_group_name(), + resource_control_ctx.get_resource_group_name(), + ]) .inc(); let mut stream = self @@ -1155,7 +1164,7 @@ fn handle_batch_commands_request( resource_manager.consume_penalty(resource_control_ctx); } GRPC_RESOURCE_GROUP_COUNTER_VEC - .with_label_values(&[resource_control_ctx.get_resource_group_name()]) + .with_label_values(&[resource_control_ctx.get_resource_group_name(), resource_control_ctx.get_resource_group_name()]) .inc(); if batcher.as_mut().map_or(false, |req_batch| { req_batch.can_batch_get(&req) @@ -1176,8 +1185,8 @@ fn handle_batch_commands_request( resource_manager.consume_penalty(resource_control_ctx); } GRPC_RESOURCE_GROUP_COUNTER_VEC - .with_label_values(&[resource_control_ctx.get_resource_group_name()]) - .inc(); + .with_label_values(&[resource_control_ctx.get_resource_group_name(), resource_control_ctx.get_resource_group_name()]) + .inc(); if batcher.as_mut().map_or(false, |req_batch| { req_batch.can_batch_raw_get(&req) }) { @@ -1191,16 +1200,16 @@ fn handle_batch_commands_request( response_batch_commands_request(id, resp, tx.clone(), begin_instant, GrpcTypeKind::raw_get, source); } }, - Some(batch_commands_request::request::Cmd::Coprocessor(mut req)) => { + Some(batch_commands_request::request::Cmd::Coprocessor(req)) => { let resource_control_ctx = req.get_context().get_resource_control_context(); if let Some(resource_manager) = resource_manager { resource_manager.consume_penalty(resource_control_ctx); } GRPC_RESOURCE_GROUP_COUNTER_VEC - .with_label_values(&[resource_control_ctx.get_resource_group_name()]) + .with_label_values(&[resource_control_ctx.get_resource_group_name(), resource_control_ctx.get_resource_group_name()]) .inc(); let begin_instant = Instant::now(); - let source = req.mut_context().take_request_source(); + let source = req.get_context().get_request_source().to_owned(); let resp = future_copr(copr, Some(peer.to_string()), req) .map_ok(|resp| { resp.map(oneof!(batch_commands_response::response::Cmd::Coprocessor)) @@ -1225,16 +1234,16 @@ fn handle_batch_commands_request( String::default(), ); } - $(Some(batch_commands_request::request::Cmd::$cmd(mut req)) => { + 
$(Some(batch_commands_request::request::Cmd::$cmd(req)) => { let resource_control_ctx = req.get_context().get_resource_control_context(); if let Some(resource_manager) = resource_manager { resource_manager.consume_penalty(resource_control_ctx); } GRPC_RESOURCE_GROUP_COUNTER_VEC - .with_label_values(&[resource_control_ctx.get_resource_group_name()]) + .with_label_values(&[resource_control_ctx.get_resource_group_name(), resource_control_ctx.get_resource_group_name()]) .inc(); let begin_instant = Instant::now(); - let source = req.mut_context().take_request_source(); + let source = req.get_context().get_request_source().to_owned(); let resp = $future_fn($($arg,)* req) .map_ok(oneof!(batch_commands_response::response::Cmd::$cmd)) .map_err(|_| GRPC_MSG_FAIL_COUNTER.$metric_name.inc()); diff --git a/src/server/snap.rs b/src/server/snap.rs index 34b32848ad3..4b7540f7fec 100644 --- a/src/server/snap.rs +++ b/src/server/snap.rs @@ -8,12 +8,14 @@ use std::{ atomic::{AtomicUsize, Ordering}, Arc, }, - time::Duration, + time::{Duration, Instant as StdInstant}, }; use file_system::{IoType, WithIoType}; use futures::{ - future::{Future, TryFutureExt}, + compat::Future01CompatExt, + future::{select, Either, Future, TryFutureExt}, + pin_mut, sink::SinkExt, stream::{Stream, StreamExt, TryStreamExt}, task::{Context, Poll}, @@ -36,8 +38,10 @@ use raftstore::store::{SnapEntry, SnapKey, SnapManager, Snapshot}; use security::SecurityManager; use tikv_kv::RaftExtension; use tikv_util::{ - config::{Tracker, VersionTrack}, + box_err, + config::{Tracker, VersionTrack, MIB}, time::{Instant, UnixSecs}, + timer::GLOBAL_TIMER_HANDLE, worker::Runnable, DeferContext, }; @@ -50,6 +54,25 @@ pub type Callback = Box) + Send>; pub const DEFAULT_POOL_SIZE: usize = 4; +// The default duration before a snapshot sending task is canceled. +const SNAP_SEND_TIMEOUT_DURATION: Duration = Duration::from_secs(600); +// The minimum expected send speed for a snapshot. It is used to avoid +// timing out too early when the snapshot size is large.
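These snap.rs constants feed a size-aware timeout: the deadline for one snapshot send is the larger of the fixed default and the time a minimum-speed sender would need for the payload, so large snapshots are not cancelled prematurely. The arithmetic, stripped of the test-only fail point that follows:

```rust
use std::time::Duration;

const MIB: u64 = 1024 * 1024;
const SNAP_SEND_TIMEOUT_DURATION: Duration = Duration::from_secs(600);
// Below this speed the transfer is assumed to be stuck.
const MIN_SNAP_SEND_SPEED: u64 = MIB;

fn get_snap_timeout(size: u64) -> Duration {
    // Time a sender running at exactly the minimum speed would need.
    let max_expected_dur = Duration::from_secs(size / MIN_SNAP_SEND_SPEED);
    std::cmp::max(SNAP_SEND_TIMEOUT_DURATION, max_expected_dur)
}

fn main() {
    // A 100 MiB snapshot still gets the 600 s default...
    assert_eq!(get_snap_timeout(100 * MIB), Duration::from_secs(600));
    // ...while a 1 GiB snapshot gets 1024 s, scaled by its size.
    assert_eq!(get_snap_timeout(1024 * MIB), Duration::from_secs(1024));
}
```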
+const MIN_SNAP_SEND_SPEED: u64 = MIB; + +#[inline] +fn get_snap_timeout(size: u64) -> Duration { + let timeout = (|| { + fail_point!("snap_send_duration_timeout", |t| -> Duration { + let t = t.unwrap().parse::(); + Duration::from_millis(t.unwrap()) + }); + SNAP_SEND_TIMEOUT_DURATION + })(); + let max_expected_dur = Duration::from_secs(size / MIN_SNAP_SEND_SPEED); + std::cmp::max(timeout, max_expected_dur) +} + /// A task for either receiving Snapshot or sending Snapshot pub enum Task { Recv { @@ -191,10 +214,36 @@ pub fn send_snap( let (sink, receiver) = client.snapshot()?; let send_task = async move { - let mut sink = sink.sink_map_err(Error::from); - sink.send_all(&mut chunks).await?; - sink.close().await?; - let recv_result = receiver.map_err(Error::from).await; + let send_and_recv = async { + let mut sink = sink.sink_map_err(Error::from); + + #[cfg(feature = "failpoints")] + { + let should_delay = (|| { + fail::fail_point!("snap_send_timer_delay", |_| { true }); + false + })(); + if should_delay { + _ = GLOBAL_TIMER_HANDLE + .delay(StdInstant::now() + Duration::from_secs(1)) + .compat() + .await; + } + } + sink.send_all(&mut chunks).await?; + sink.close().await?; + Ok(receiver.map_err(Error::from).await) + }; + let wait_timeout = GLOBAL_TIMER_HANDLE + .delay(StdInstant::now() + get_snap_timeout(total_size)) + .compat(); + let recv_result = { + pin_mut!(send_and_recv, wait_timeout); + match select(send_and_recv, wait_timeout).await { + Either::Left((r, _)) => r, + Either::Right((..)) => Err(Error::Other(box_err!("send snapshot timeout"))), + } + }; send_timer.observe_duration(); drop(deregister); drop(client); diff --git a/src/server/status_server/jeprof.in b/src/server/status_server/jeprof.in new file mode 100644 index 00000000000..cadf15d7d8e --- /dev/null +++ b/src/server/status_server/jeprof.in @@ -0,0 +1,5727 @@ +#! /usr/bin/env perl + +# Copyright (c) 1998-2007, Google Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following disclaimer +# in the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Google Inc. nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +# --- +# Program for printing the profile generated by common/profiler.cc, +# or by the heap profiler (common/debugallocation.cc) +# +# The profile contains a sequence of entries of the form: +#       <count> <stack trace> +# This program parses the profile, and generates user-readable +# output. +# +# Examples: +# +# % tools/jeprof "program" "profile" +# Enters "interactive" mode +# +# % tools/jeprof --text "program" "profile" +# Generates one line per procedure +# +# % tools/jeprof --gv "program" "profile" +# Generates annotated call-graph and displays via "gv" +# +# % tools/jeprof --gv --focus=Mutex "program" "profile" +# Restrict to code paths that involve an entry that matches "Mutex" +# +# % tools/jeprof --gv --focus=Mutex --ignore=string "program" "profile" +# Restrict to code paths that involve an entry that matches "Mutex" +# and does not match "string" +# +# % tools/jeprof --list=IBF_CheckDocid "program" "profile" +# Generates disassembly listing of all routines with at least one +# sample that match the --list=<regexp> pattern. The listing is +# annotated with the flat and cumulative sample counts at each line. +# +# % tools/jeprof --disasm=IBF_CheckDocid "program" "profile" +# Generates disassembly listing of all routines with at least one +# sample that match the --disasm=<regexp> pattern. The listing is +# annotated with the flat and cumulative sample counts at each PC value. +# +# TODO: Use color to indicate files? + +use strict; +use warnings; +use Getopt::Long; +use Cwd; + +my $JEPROF_VERSION = "unknown"; +my $PPROF_VERSION = "2.0"; + +# These are the object tools we use which can come from a +# user-specified location using --tools, from the JEPROF_TOOLS +# environment variable, or from the environment. +my %obj_tool_map = ( + "objdump" => "objdump", + "nm" => "nm", + "addr2line" => "addr2line", + "c++filt" => "c++filt", + ## ConfigureObjTools may add architecture-specific entries: + #"nm_pdb" => "nm-pdb", # for reading windows (PDB-format) executables + #"addr2line_pdb" => "addr2line-pdb", # ditto + #"otool" => "otool", # equivalent of objdump on OS X +); +# NOTE: these are lists, so you can put in commandline flags if you want. +my @DOT = ("dot"); # leave non-absolute, since it may be in /usr/local +my @GV = ("gv"); +my @EVINCE = ("evince"); # could also be xpdf or perhaps acroread +my @KCACHEGRIND = ("kcachegrind"); +my @PS2PDF = ("ps2pdf"); +# These are used for dynamic profiles +my @URL_FETCHER = ("curl", "-s", "--fail"); + +# These are the web pages that servers need to support for dynamic profiles +my $HEAP_PAGE = "/pprof/heap"; +my $PROFILE_PAGE = "/pprof/profile"; # must support cgi-param "?seconds=#" +my $PMUPROFILE_PAGE = "/pprof/pmuprofile(?:\\?.*)?"; # must support cgi-param + # ?seconds=#&event=x&period=n +my $GROWTH_PAGE = "/pprof/growth"; +my $CONTENTION_PAGE = "/pprof/contention"; +my $WALL_PAGE = "/pprof/wall(?:\\?.*)?"; # accepts options like namefilter +my $FILTEREDPROFILE_PAGE = "/pprof/filteredprofile(?:\\?.*)?"; +my $CENSUSPROFILE_PAGE = "/pprof/censusprofile(?:\\?.*)?"; # must support cgi-param + # "?seconds=#", + # "?tags_regexp=#" and + # "?type=#". +my $SYMBOL_PAGE = "/pprof/symbol"; # must support symbol lookup via POST +my $PROGRAM_NAME_PAGE = "/pprof/cmdline"; + +# These are the web pages that can be named on the command line. +# All the alternatives must begin with /. +my $PROFILES = "($HEAP_PAGE|$PROFILE_PAGE|$PMUPROFILE_PAGE|" . + "$GROWTH_PAGE|$CONTENTION_PAGE|$WALL_PAGE|" .
+ "$FILTEREDPROFILE_PAGE|$CENSUSPROFILE_PAGE)"; + +# default binary name +my $UNKNOWN_BINARY = "(unknown)"; + +# There is a pervasive dependency on the length (in hex characters, +# i.e., nibbles) of an address, distinguishing between 32-bit and +# 64-bit profiles. To err on the safe side, default to 64-bit here: +my $address_length = 16; + +my $dev_null = "/dev/null"; +if (! -e $dev_null && $^O =~ /MSWin/) { # $^O is the OS perl was built for + $dev_null = "nul"; +} + +# A list of paths to search for shared object files +my @prefix_list = (); + +# Special routine name that should not have any symbols. +# Used as separator to parse "addr2line -i" output. +my $sep_symbol = '_fini'; +my $sep_address = undef; + +##### Argument parsing ##### + +sub usage_string { + return <<EOF; +Usage: +jeprof [options] <program> <profiles> + <profiles> is a space separated list of profile names. +jeprof [options] <symbolized-profiles> + <symbolized-profiles> is a list of profile files where each file contains + the necessary symbol mappings as well as profile data (likely generated + with --raw). +jeprof [options] <profile> + <profile> is a remote form. Symbols are obtained from host:port$SYMBOL_PAGE + + Each name can be: + /path/to/profile - a path to a profile file + host:port[/<service>] - a location of a service to get profile from + + The /<service> can be $HEAP_PAGE, $PROFILE_PAGE, /pprof/pmuprofile, + $GROWTH_PAGE, $CONTENTION_PAGE, /pprof/wall, + $CENSUSPROFILE_PAGE, or /pprof/filteredprofile. + For instance: + jeprof http://myserver.com:80$HEAP_PAGE + If /<service> is omitted, the service defaults to $PROFILE_PAGE (cpu profiling). +jeprof --symbols <program> + Maps addresses to symbol names. In this mode, stdin should be a + list of library mappings, in the same format as is found in the heap- + and cpu-profile files (this loosely matches that of /proc/self/maps + on linux), followed by a list of hex addresses to map, one per line. + + For more help with querying remote servers, including how to add the + necessary server-side support code, see this filename (or one like it): + + /usr/doc/gperftools-$PPROF_VERSION/pprof_remote_servers.html + +Options: + --cum Sort by cumulative data + --base=<base> Subtract <base> from <profile> before display + --interactive Run in interactive mode (interactive "help" gives help) [default] + --seconds=<n> Length of time for dynamic profiles [default=30 secs] + --add_lib=<file> Read additional symbols and line info from the given library + --lib_prefix=<dir> Comma separated list of library path prefixes + +Reporting Granularity: + --addresses Report at address level + --lines Report at source line level + --functions Report at function level [default] + --files Report at source file level + +Output type: + --text Generate text report + --callgrind Generate callgrind format to stdout + --gv Generate Postscript and display + --evince Generate PDF and display + --web Generate SVG and display + --list=<regexp> Generate source listing of matching routines + --disasm=<regexp> Generate disassembly of matching routines + --symbols Print demangled symbol names found at given addresses + --dot Generate DOT file to stdout + --ps Generate Postscript to stdout + --pdf Generate PDF to stdout + --svg Generate SVG to stdout + --gif Generate GIF to stdout + --raw Generate symbolized jeprof data (useful with remote fetch) + --collapsed Generate collapsed stacks for building flame graphs + (see http://www.brendangregg.com/flamegraphs.html) + +Heap-Profile Options: + --inuse_space Display in-use (mega)bytes [default] + --inuse_objects Display in-use objects + --alloc_space Display allocated (mega)bytes + --alloc_objects Display allocated objects + --show_bytes Display space in bytes + --drop_negative Ignore negative differences + +Contention-profile options: + --total_delay Display total delay at each region [default] + --contentions Display number of delays at each region + --mean_delay Display mean delay at each region + +Call-graph Options: + --nodecount=<n> Show at most so many nodes [default=80] + --nodefraction=<f> Hide nodes below <f>*total [default=.005] + --edgefraction=<f> Hide edges below <f>*total [default=.001] + --maxdegree=<n> Max incoming/outgoing edges per node [default=8] + --focus=<regexp> Focus on backtraces with nodes matching <regexp> + --thread=<n> Show profile for thread <n> + --ignore=<regexp> Ignore backtraces with nodes matching <regexp> + --scale=<n> Set GV scaling [default=0] + --heapcheck Make nodes with non-0 object counts + (i.e. direct leak generators) more visible + --retain=<regexp> Retain only nodes that match <regexp> + --exclude=<regexp> Exclude all nodes that match <regexp> + +Miscellaneous: + --tools=<prefix or binary:fullpath>[,...] \$PATH for object tool pathnames + --test Run unit tests + --help This message + --version Version information + --debug-syms-by-id (Linux only) Find debug symbol files by build ID as well as by name + +Environment Variables: + JEPROF_TMPDIR Profiles directory. Defaults to \$HOME/jeprof + JEPROF_TOOLS Prefix for object tools pathnames + +Examples: + +jeprof /bin/ls ls.prof + Enters "interactive" mode +jeprof --text /bin/ls ls.prof + Outputs one line per procedure +jeprof --web /bin/ls ls.prof + Displays annotated call-graph in web browser +jeprof --gv /bin/ls ls.prof + Displays annotated call-graph via 'gv' +jeprof --gv --focus=Mutex /bin/ls ls.prof + Restricts to code paths including a .*Mutex.* entry +jeprof --gv --focus=Mutex --ignore=string /bin/ls ls.prof + Code paths including Mutex but not string +jeprof --list=getdir /bin/ls ls.prof + (Per-line) annotated source listing for getdir() +jeprof --disasm=getdir /bin/ls ls.prof + (Per-PC) annotated disassembly for getdir() + +jeprof http://localhost:1234/ + Enters "interactive" mode +jeprof --text localhost:1234 + Outputs one line per procedure for localhost:1234 +jeprof --raw localhost:1234 > ./local.raw +jeprof --text ./local.raw + Fetches a remote profile for later analysis and then + analyzes it in text mode. +EOF +} + +sub version_string { + return < \$main::opt_help, + "version!" => \$main::opt_version, + "cum!"
=> \$main::opt_cum, + "base=s" => \$main::opt_base, + "seconds=i" => \$main::opt_seconds, + "add_lib=s" => \$main::opt_lib, + "lib_prefix=s" => \$main::opt_lib_prefix, + "functions!" => \$main::opt_functions, + "lines!" => \$main::opt_lines, + "addresses!" => \$main::opt_addresses, + "files!" => \$main::opt_files, + "text!" => \$main::opt_text, + "callgrind!" => \$main::opt_callgrind, + "list=s" => \$main::opt_list, + "disasm=s" => \$main::opt_disasm, + "symbols!" => \$main::opt_symbols, + "gv!" => \$main::opt_gv, + "evince!" => \$main::opt_evince, + "web!" => \$main::opt_web, + "dot!" => \$main::opt_dot, + "ps!" => \$main::opt_ps, + "pdf!" => \$main::opt_pdf, + "svg!" => \$main::opt_svg, + "gif!" => \$main::opt_gif, + "raw!" => \$main::opt_raw, + "collapsed!" => \$main::opt_collapsed, + "interactive!" => \$main::opt_interactive, + "nodecount=i" => \$main::opt_nodecount, + "nodefraction=f" => \$main::opt_nodefraction, + "edgefraction=f" => \$main::opt_edgefraction, + "maxdegree=i" => \$main::opt_maxdegree, + "focus=s" => \$main::opt_focus, + "thread=s" => \$main::opt_thread, + "ignore=s" => \$main::opt_ignore, + "scale=i" => \$main::opt_scale, + "heapcheck" => \$main::opt_heapcheck, + "retain=s" => \$main::opt_retain, + "exclude=s" => \$main::opt_exclude, + "inuse_space!" => \$main::opt_inuse_space, + "inuse_objects!" => \$main::opt_inuse_objects, + "alloc_space!" => \$main::opt_alloc_space, + "alloc_objects!" => \$main::opt_alloc_objects, + "show_bytes!" => \$main::opt_show_bytes, + "drop_negative!" => \$main::opt_drop_negative, + "total_delay!" => \$main::opt_total_delay, + "contentions!" => \$main::opt_contentions, + "mean_delay!" => \$main::opt_mean_delay, + "tools=s" => \$main::opt_tools, + "test!" => \$main::opt_test, + "debug!" => \$main::opt_debug, + "debug-syms-by-id!" => \$main::opt_debug_syms_by_id, + # Undocumented flags used only by unittests: + "test_stride=i" => \$main::opt_test_stride, + ) || usage("Invalid option(s)"); + + # Deal with the standard --help and --version + if ($main::opt_help) { + print usage_string(); + exit(0); + } + + if ($main::opt_version) { + print version_string(); + exit(0); + } + + # Disassembly/listing/symbols mode requires address-level info + if ($main::opt_disasm || $main::opt_list || $main::opt_symbols) { + $main::opt_functions = 0; + $main::opt_lines = 0; + $main::opt_addresses = 1; + $main::opt_files = 0; + } + + # Check heap-profiling flags + if ($main::opt_inuse_space + + $main::opt_inuse_objects + + $main::opt_alloc_space + + $main::opt_alloc_objects > 1) { + usage("Specify at most one of --inuse/--alloc options"); + } + + # Check output granularities + my $grains = + $main::opt_functions + + $main::opt_lines + + $main::opt_addresses + + $main::opt_files + + 0; + if ($grains > 1) { + usage("Only specify one output granularity option"); + } + if ($grains == 0) { + $main::opt_functions = 1; + } + + # Check output modes + my $modes = + $main::opt_text + + $main::opt_callgrind + + ($main::opt_list eq '' ? 0 : 1) + + ($main::opt_disasm eq '' ? 0 : 1) + + ($main::opt_symbols == 0 ?
0 : 1) + + $main::opt_gv + + $main::opt_evince + + $main::opt_web + + $main::opt_dot + + $main::opt_ps + + $main::opt_pdf + + $main::opt_svg + + $main::opt_gif + + $main::opt_raw + + $main::opt_collapsed + + $main::opt_interactive + + 0; + if ($modes > 1) { + usage("Only specify one output mode"); + } + if ($modes == 0) { + if (-t STDOUT) { # If STDOUT is a tty, activate interactive mode + $main::opt_interactive = 1; + } else { + $main::opt_text = 1; + } + } + + if ($main::opt_test) { + RunUnitTests(); + # Should not return + exit(1); + } + + # Binary name and profile arguments list + $main::prog = ""; + @main::pfile_args = (); + + # Remote profiling without a binary (using $SYMBOL_PAGE instead) + if (@ARGV > 0) { + if (IsProfileURL($ARGV[0])) { + $main::use_symbol_page = 1; + } elsif (IsSymbolizedProfileFile($ARGV[0])) { + $main::use_symbolized_profile = 1; + $main::prog = $UNKNOWN_BINARY; # will be set later from the profile file + } + } + + if ($main::use_symbol_page || $main::use_symbolized_profile) { + # We don't need a binary! + my %disabled = ('--lines' => $main::opt_lines, + '--disasm' => $main::opt_disasm); + for my $option (keys %disabled) { + usage("$option cannot be used without a binary") if $disabled{$option}; + } + # Set $main::prog later... + scalar(@ARGV) || usage("Did not specify profile file"); + } elsif ($main::opt_symbols) { + # --symbols needs a binary-name (to run nm on, etc) but not profiles + $main::prog = shift(@ARGV) || usage("Did not specify program"); + } else { + $main::prog = shift(@ARGV) || usage("Did not specify program"); + scalar(@ARGV) || usage("Did not specify profile file"); + } + + # Parse profile file/location arguments + foreach my $farg (@ARGV) { + if ($farg =~ m/(.*)\@([0-9]+)(|\/.*)$/ ) { + my $machine = $1; + my $num_machines = $2; + my $path = $3; + for (my $i = 0; $i < $num_machines; $i++) { + unshift(@main::pfile_args, "$i.$machine$path"); + } + } else { + unshift(@main::pfile_args, $farg); + } + } + + if ($main::use_symbol_page) { + unless (IsProfileURL($main::pfile_args[0])) { + error("The first profile should be a remote form to use $SYMBOL_PAGE\n"); + } + CheckSymbolPage(); + $main::prog = FetchProgramName(); + } elsif (!$main::use_symbolized_profile) { # may not need objtools! + ConfigureObjTools($main::prog) + } + + # Break the opt_lib_prefix into the prefix_list array + @prefix_list = split (',', $main::opt_lib_prefix); + + # Remove trailing / from the prefixes, in the list to prevent + # searching things like /my/path//lib/mylib.so + foreach (@prefix_list) { + s|/+$||; + } + + # Flag to prevent us from trying over and over to use + # elfutils if it's not installed (used only with + # --debug-syms-by-id option). + $main::gave_up_on_elfutils = 0; +} + +sub FilterAndPrint { + my ($profile, $symbols, $libs, $thread) = @_; + + # Get total data in profile + my $total = TotalProfile($profile); + + # Remove uninteresting stack items + $profile = RemoveUninterestingFrames($symbols, $profile); + + # Focus? + if ($main::opt_focus ne '') { + $profile = FocusProfile($symbols, $profile, $main::opt_focus); + } + + # Ignore? + if ($main::opt_ignore ne '') { + $profile = IgnoreProfile($symbols, $profile, $main::opt_ignore); + } + + my $calls = ExtractCalls($symbols, $profile); + + # Reduce profiles to required output granularity, and also clean + # each stack trace so a given entry exists at most once.
+ my $reduced = ReduceProfile($symbols, $profile); + + # Get derived profiles + my $flat = FlatProfile($reduced); + my $cumulative = CumulativeProfile($reduced); + + # Print + if (!$main::opt_interactive) { + if ($main::opt_disasm) { + PrintDisassembly($libs, $flat, $cumulative, $main::opt_disasm); + } elsif ($main::opt_list) { + PrintListing($total, $libs, $flat, $cumulative, $main::opt_list, 0); + } elsif ($main::opt_text) { + # Make sure the output is empty when we have nothing to report + # (only matters when --heapcheck is given but we must be + # compatible with old branches that did not pass --heapcheck always): + if ($total != 0) { + printf("Total%s: %s %s\n", + (defined($thread) ? " (t$thread)" : ""), + Unparse($total), Units()); + } + PrintText($symbols, $flat, $cumulative, -1); + } elsif ($main::opt_raw) { + PrintSymbolizedProfile($symbols, $profile, $main::prog); + } elsif ($main::opt_collapsed) { + PrintCollapsedStacks($symbols, $profile); + } elsif ($main::opt_callgrind) { + PrintCallgrind($calls); + } else { + if (PrintDot($main::prog, $symbols, $profile, $flat, $cumulative, $total)) { + if ($main::opt_gv) { + RunGV(TempName($main::next_tmpfile, "ps"), ""); + } elsif ($main::opt_evince) { + RunEvince(TempName($main::next_tmpfile, "pdf"), ""); + } elsif ($main::opt_web) { + my $tmp = TempName($main::next_tmpfile, "svg"); + RunWeb($tmp); + # The command we run might hand the file name off + # to an already running browser instance and then exit. + # Normally, we'd remove $tmp on exit (right now), + # but fork a child to remove $tmp a little later, so that the + # browser has time to load it first. + delete $main::tempnames{$tmp}; + if (fork() == 0) { + sleep 5; + unlink($tmp); + exit(0); + } + } + } else { + cleanup(); + exit(1); + } + } + } else { + InteractiveMode($profile, $symbols, $libs, $total); + } +} + +sub Main() { + Init(); + $main::collected_profile = undef; + @main::profile_files = (); + $main::op_time = time(); + + # Printing symbols is special and requires a lot less info than most. + if ($main::opt_symbols) { + PrintSymbols(*STDIN); # Get /proc/maps and symbols output from stdin + return; + } + + # Fetch all profile data + FetchDynamicProfiles(); + + # this will hold symbols that we read from the profile files + my $symbol_map = {}; + + # Read one profile, pick the last item on the list + my $data = ReadProfile($main::prog, pop(@main::profile_files)); + my $profile = $data->{profile}; + my $pcs = $data->{pcs}; + my $libs = $data->{libs}; # Info about main program and shared libraries + $symbol_map = MergeSymbols($symbol_map, $data->{symbols}); + + # Add additional profiles, if available.
+ if (scalar(@main::profile_files) > 0) { + foreach my $pname (@main::profile_files) { + my $data2 = ReadProfile($main::prog, $pname); + $profile = AddProfile($profile, $data2->{profile}); + $pcs = AddPcs($pcs, $data2->{pcs}); + $symbol_map = MergeSymbols($symbol_map, $data2->{symbols}); + } + } + + # Subtract base from profile, if specified + if ($main::opt_base ne '') { + my $base = ReadProfile($main::prog, $main::opt_base); + $profile = SubtractProfile($profile, $base->{profile}); + $pcs = AddPcs($pcs, $base->{pcs}); + $symbol_map = MergeSymbols($symbol_map, $base->{symbols}); + } + + # Collect symbols + my $symbols; + if ($main::use_symbolized_profile) { + $symbols = FetchSymbols($pcs, $symbol_map); + } elsif ($main::use_symbol_page) { + $symbols = FetchSymbols($pcs); + } else { + # TODO(csilvers): $libs uses the /proc/self/maps data from profile1, + # which may differ from the data from subsequent profiles, especially + # if they were run on different machines. Use appropriate libs for + # each pc somehow. + $symbols = ExtractSymbols($libs, $pcs); + } + + if (!defined($main::opt_thread)) { + FilterAndPrint($profile, $symbols, $libs); + } + if (defined($data->{threads})) { + foreach my $thread (sort { $a <=> $b } keys(%{$data->{threads}})) { + if (defined($main::opt_thread) && + ($main::opt_thread eq '*' || $main::opt_thread == $thread)) { + my $thread_profile = $data->{threads}{$thread}; + FilterAndPrint($thread_profile, $symbols, $libs, $thread); + } + } + } + + cleanup(); + exit(0); +} + +##### Entry Point ##### + +Main(); + +# Temporary code to detect if we're running on a Goobuntu system. +# These systems don't have the right stuff installed for the special +# Readline libraries to work, so as a temporary workaround, we default +# to using the normal stdio code, rather than the fancier readline-based +# code +sub ReadlineMightFail { + if (-e '/lib/libtermcap.so.2') { + return 0; # libtermcap exists, so readline should be okay + } else { + return 1; + } +} + +sub RunGV { + my $fname = shift; + my $bg = shift; # "" or " &" if we should run in background + if (!system(ShellEscape(@GV, "--version") . " >$dev_null 2>&1")) { + # Options using double dash are supported by this gv version. + # Also, turn on noantialias to better handle bug in gv for + # postscript files with large dimensions. + # TODO: Maybe we should not pass the --noantialias flag + # if the gv version is known to work properly without the flag. + system(ShellEscape(@GV, "--scale=$main::opt_scale", "--noantialias", $fname) + . $bg); + } else { + # Old gv version - only supports options that use single dash. + print STDERR ShellEscape(@GV, "-scale", $main::opt_scale) . "\n"; + system(ShellEscape(@GV, "-scale", "$main::opt_scale", $fname) . $bg); + } +} + +sub RunEvince { + my $fname = shift; + my $bg = shift; # "" or " &" if we should run in background + system(ShellEscape(@EVINCE, $fname) . $bg); +} + +sub RunWeb { + my $fname = shift; + print STDERR "Loading web page file:///$fname\n"; + + if (`uname` =~ /Darwin/) { + # OS X: open will use standard preference for SVG files. + system("/usr/bin/open", $fname); + return; + } + + # Some kind of Unix; try generic symlinks, then specific browsers. + # (Stop once we find one.) + # Works best if the browser is already running. 
+ my @alt = ( + "/etc/alternatives/gnome-www-browser", + "/etc/alternatives/x-www-browser", + "google-chrome", + "firefox", + ); + foreach my $b (@alt) { + if (system($b, $fname) == 0) { + return; + } + } + + print STDERR "Could not load web browser.\n"; +} + +sub RunKcachegrind { + my $fname = shift; + my $bg = shift; # "" or " &" if we should run in background + print STDERR "Starting '@KCACHEGRIND " . $fname . $bg . "'\n"; + system(ShellEscape(@KCACHEGRIND, $fname) . $bg); +} + + +##### Interactive helper routines ##### + +sub InteractiveMode { + $| = 1; # Make output unbuffered for interactive mode + my ($orig_profile, $symbols, $libs, $total) = @_; + + print STDERR "Welcome to jeprof! For help, type 'help'.\n"; + + # Use ReadLine if it's installed and input comes from a console. + if ( -t STDIN && + !ReadlineMightFail() && + defined(eval {require Term::ReadLine}) ) { + my $term = new Term::ReadLine 'jeprof'; + while ( defined ($_ = $term->readline('(jeprof) '))) { + $term->addhistory($_) if /\S/; + if (!InteractiveCommand($orig_profile, $symbols, $libs, $total, $_)) { + last; # exit when we get an interactive command to quit + } + } + } else { # don't have readline + while (1) { + print STDERR "(jeprof) "; + $_ = <STDIN>; + last if ! defined $_ ; + s/\r//g; # turn windows-looking lines into unix-looking lines + + # Save some flags that might be reset by InteractiveCommand() + my $save_opt_lines = $main::opt_lines; + + if (!InteractiveCommand($orig_profile, $symbols, $libs, $total, $_)) { + last; # exit when we get an interactive command to quit + } + + # Restore flags + $main::opt_lines = $save_opt_lines; + } + } +} + +# Takes two args: orig profile, and command to run. +# Returns 1 if we should keep going, or 0 if we were asked to quit +sub InteractiveCommand { + my($orig_profile, $symbols, $libs, $total, $command) = @_; + $_ = $command; # just to make future m//'s easier + if (!defined($_)) { + print STDERR "\n"; + return 0; + } + if (m/^\s*quit/) { + return 0; + } + if (m/^\s*help/) { + InteractiveHelpMessage(); + return 1; + } + # Clear all the mode options -- mode is controlled by "$command" + $main::opt_text = 0; + $main::opt_callgrind = 0; + $main::opt_disasm = 0; + $main::opt_list = 0; + $main::opt_gv = 0; + $main::opt_evince = 0; + $main::opt_cum = 0; + + if (m/^\s*(text|top)(\d*)\s*(.*)/) { + $main::opt_text = 1; + + my $line_limit = ($2 ne "") ?
int($2) : 10; + + my $routine; + my $ignore; + ($routine, $ignore) = ParseInteractiveArgs($3); + + my $profile = ProcessProfile($total, $orig_profile, $symbols, "", $ignore); + my $reduced = ReduceProfile($symbols, $profile); + + # Get derived profiles + my $flat = FlatProfile($reduced); + my $cumulative = CumulativeProfile($reduced); + + PrintText($symbols, $flat, $cumulative, $line_limit); + return 1; + } + if (m/^\s*callgrind\s*([^ \n]*)/) { + $main::opt_callgrind = 1; + + # Get derived profiles + my $calls = ExtractCalls($symbols, $orig_profile); + my $filename = $1; + if ( $1 eq '' ) { + $filename = TempName($main::next_tmpfile, "callgrind"); + } + PrintCallgrind($calls, $filename); + if ( $1 eq '' ) { + RunKcachegrind($filename, " & "); + $main::next_tmpfile++; + } + + return 1; + } + if (m/^\s*(web)?list\s*(.+)/) { + my $html = (defined($1) && ($1 eq "web")); + $main::opt_list = 1; + + my $routine; + my $ignore; + ($routine, $ignore) = ParseInteractiveArgs($2); + + my $profile = ProcessProfile($total, $orig_profile, $symbols, "", $ignore); + my $reduced = ReduceProfile($symbols, $profile); + + # Get derived profiles + my $flat = FlatProfile($reduced); + my $cumulative = CumulativeProfile($reduced); + + PrintListing($total, $libs, $flat, $cumulative, $routine, $html); + return 1; + } + if (m/^\s*disasm\s*(.+)/) { + $main::opt_disasm = 1; + + my $routine; + my $ignore; + ($routine, $ignore) = ParseInteractiveArgs($1); + + # Process current profile to account for various settings + my $profile = ProcessProfile($total, $orig_profile, $symbols, "", $ignore); + my $reduced = ReduceProfile($symbols, $profile); + + # Get derived profiles + my $flat = FlatProfile($reduced); + my $cumulative = CumulativeProfile($reduced); + + PrintDisassembly($libs, $flat, $cumulative, $routine); + return 1; + } + if (m/^\s*(gv|web|evince)\s*(.*)/) { + $main::opt_gv = 0; + $main::opt_evince = 0; + $main::opt_web = 0; + if ($1 eq "gv") { + $main::opt_gv = 1; + } elsif ($1 eq "evince") { + $main::opt_evince = 1; + } elsif ($1 eq "web") { + $main::opt_web = 1; + } + + my $focus; + my $ignore; + ($focus, $ignore) = ParseInteractiveArgs($2); + + # Process current profile to account for various settings + my $profile = ProcessProfile($total, $orig_profile, $symbols, + $focus, $ignore); + my $reduced = ReduceProfile($symbols, $profile); + + # Get derived profiles + my $flat = FlatProfile($reduced); + my $cumulative = CumulativeProfile($reduced); + + if (PrintDot($main::prog, $symbols, $profile, $flat, $cumulative, $total)) { + if ($main::opt_gv) { + RunGV(TempName($main::next_tmpfile, "ps"), " &"); + } elsif ($main::opt_evince) { + RunEvince(TempName($main::next_tmpfile, "pdf"), " &"); + } elsif ($main::opt_web) { + RunWeb(TempName($main::next_tmpfile, "svg")); + } + $main::next_tmpfile++; + } + return 1; + } + if (m/^\s*$/) { + return 1; + } + print STDERR "Unknown command: try 'help'.\n"; + return 1; +} + + +sub ProcessProfile { + my $total_count = shift; + my $orig_profile = shift; + my $symbols = shift; + my $focus = shift; + my $ignore = shift; + + # Process current profile to account for various settings + my $profile = $orig_profile; + printf("Total: %s %s\n", Unparse($total_count), Units()); + if ($focus ne '') { + $profile = FocusProfile($symbols, $profile, $focus); + my $focus_count = TotalProfile($profile); + printf("After focusing on '%s': %s %s of %s (%0.1f%%)\n", + $focus, + Unparse($focus_count), Units(), + Unparse($total_count), ($focus_count*100.0) / $total_count); + } + if ($ignore ne '') { + 
$profile = IgnoreProfile($symbols, $profile, $ignore); + my $ignore_count = TotalProfile($profile); + printf("After ignoring '%s': %s %s of %s (%0.1f%%)\n", + $ignore, + Unparse($ignore_count), Units(), + Unparse($total_count), + ($ignore_count*100.0) / $total_count); + } + + return $profile; +} + +sub InteractiveHelpMessage { + print STDERR <{$k}; + my @addrs = split(/\n/, $k); + if ($#addrs >= 0) { + my $depth = $#addrs + 1; + # int(foo / 2**32) is the only reliable way to get rid of bottom + # 32 bits on both 32- and 64-bit systems. + print pack('L*', $count & 0xFFFFFFFF, int($count / 2**32)); + print pack('L*', $depth & 0xFFFFFFFF, int($depth / 2**32)); + + foreach my $full_addr (@addrs) { + my $addr = $full_addr; + $addr =~ s/0x0*//; # strip off leading 0x, zeroes + if (length($addr) > 16) { + print STDERR "Invalid address in profile: $full_addr\n"; + next; + } + my $low_addr = substr($addr, -8); # get last 8 hex chars + my $high_addr = substr($addr, -16, 8); # get up to 8 more hex chars + print pack('L*', hex('0x' . $low_addr), hex('0x' . $high_addr)); + } + } + } +} + +# Print symbols and profile data +sub PrintSymbolizedProfile { + my $symbols = shift; + my $profile = shift; + my $prog = shift; + + $SYMBOL_PAGE =~ m,[^/]+$,; # matches everything after the last slash + my $symbol_marker = $&; + + print '--- ', $symbol_marker, "\n"; + if (defined($prog)) { + print 'binary=', $prog, "\n"; + } + while (my ($pc, $name) = each(%{$symbols})) { + my $sep = ' '; + print '0x', $pc; + # We have a list of function names, which include the inlined + # calls. They are separated (and terminated) by --, which is + # illegal in function names. + for (my $j = 2; $j <= $#{$name}; $j += 3) { + print $sep, $name->[$j]; + $sep = '--'; + } + print "\n"; + } + print '---', "\n"; + + my $profile_marker; + if ($main::profile_type eq 'heap') { + $HEAP_PAGE =~ m,[^/]+$,; # matches everything after the last slash + $profile_marker = $&; + } elsif ($main::profile_type eq 'growth') { + $GROWTH_PAGE =~ m,[^/]+$,; # matches everything after the last slash + $profile_marker = $&; + } elsif ($main::profile_type eq 'contention') { + $CONTENTION_PAGE =~ m,[^/]+$,; # matches everything after the last slash + $profile_marker = $&; + } else { # elsif ($main::profile_type eq 'cpu') + $PROFILE_PAGE =~ m,[^/]+$,; # matches everything after the last slash + $profile_marker = $&; + } + + print '--- ', $profile_marker, "\n"; + if (defined($main::collected_profile)) { + # if used with remote fetch, simply dump the collected profile to output. + open(SRC, "<$main::collected_profile"); + while (<SRC>) { + print $_; + } + close(SRC); + } else { + # --raw/http: For everything to work correctly for non-remote profiles, we + # would need to extend PrintProfileData() to handle all possible profile + # types, re-enable the code that is currently disabled in ReadCPUProfile() + # and FixCallerAddresses(), and remove the remote profile dumping code in + # the block above. + die "--raw/http: jeprof can only dump remote profiles for --raw\n"; + # dump a cpu-format profile to standard out + PrintProfileData($profile); + } +} + +# Print text output +sub PrintText { + my $symbols = shift; + my $flat = shift; + my $cumulative = shift; + my $line_limit = shift; + + my $total = TotalProfile($flat); + + # Which profile to sort by? + my $s = $main::opt_cum ?
$cumulative : $flat; + + my $running_sum = 0; + my $lines = 0; + foreach my $k (sort { GetEntry($s, $b) <=> GetEntry($s, $a) || $a cmp $b } + keys(%{$cumulative})) { + my $f = GetEntry($flat, $k); + my $c = GetEntry($cumulative, $k); + $running_sum += $f; + + my $sym = $k; + if (exists($symbols->{$k})) { + $sym = $symbols->{$k}->[0] . " " . $symbols->{$k}->[1]; + if ($main::opt_addresses) { + $sym = $k . " " . $sym; + } + } + + if ($f != 0 || $c != 0) { + printf("%8s %6s %6s %8s %6s %s\n", + Unparse($f), + Percent($f, $total), + Percent($running_sum, $total), + Unparse($c), + Percent($c, $total), + $sym); + } + $lines++; + last if ($line_limit >= 0 && $lines >= $line_limit); + } +} + +# Callgrind format has a compression for repeated function and file +# names. You show the name the first time, and just use its number +# subsequently. This can cut down the file to about a third or a +# quarter of its uncompressed size. $key and $val are the key/value +# pair that would normally be printed by callgrind; $map is a map from +# value to number. +sub CompressedCGName { + my($key, $val, $map) = @_; + my $idx = $map->{$val}; + # For very short keys, providing an index hurts rather than helps. + if (length($val) <= 3) { + return "$key=$val\n"; + } elsif (defined($idx)) { + return "$key=($idx)\n"; + } else { + # scalar(keys $map) gives the number of items in the map. + $idx = scalar(keys(%{$map})) + 1; + $map->{$val} = $idx; + return "$key=($idx) $val\n"; + } +} + +# Print the call graph in a way that's suitable for callgrind. +sub PrintCallgrind { + my $calls = shift; + my $filename; + my %filename_to_index_map; + my %fnname_to_index_map; + + if ($main::opt_interactive) { + $filename = shift; + print STDERR "Writing callgrind file to '$filename'.\n" + } else { + $filename = "&STDOUT"; + } + open(CG, ">$filename"); + printf CG ("events: Hits\n\n"); + foreach my $call ( map { $_->[0] } + sort { $a->[1] cmp $b->[1] || + $a->[2] <=> $b->[2] } + map { /([^:]+):(\d+):([^ ]+)( -> ([^:]+):(\d+):(.+))?/; + [$_, $1, $2] } + keys %$calls ) { + my $count = int($calls->{$call}); + $call =~ /([^:]+):(\d+):([^ ]+)( -> ([^:]+):(\d+):(.+))?/; + my ( $caller_file, $caller_line, $caller_function, + $callee_file, $callee_line, $callee_function ) = + ( $1, $2, $3, $5, $6, $7 ); + + # TODO(csilvers): for better compression, collect all the + # caller/callee_files and functions first, before printing + # anything, and only compress those referenced more than once.
+ printf CG CompressedCGName("fl", $caller_file, \%filename_to_index_map); + printf CG CompressedCGName("fn", $caller_function, \%fnname_to_index_map); + if (defined $6) { + printf CG CompressedCGName("cfl", $callee_file, \%filename_to_index_map); + printf CG CompressedCGName("cfn", $callee_function, \%fnname_to_index_map); + printf CG ("calls=$count $callee_line\n"); + } + printf CG ("$caller_line $count\n\n"); + } +} + +# Print disassembly for all routines that match $main::opt_disasm +sub PrintDisassembly { + my $libs = shift; + my $flat = shift; + my $cumulative = shift; + my $disasm_opts = shift; + + my $total = TotalProfile($flat); + + foreach my $lib (@{$libs}) { + my $symbol_table = GetProcedureBoundaries($lib->[0], $disasm_opts); + my $offset = AddressSub($lib->[1], $lib->[3]); + foreach my $routine (sort ByName keys(%{$symbol_table})) { + my $start_addr = $symbol_table->{$routine}->[0]; + my $end_addr = $symbol_table->{$routine}->[1]; + # See if there are any samples in this routine + my $length = hex(AddressSub($end_addr, $start_addr)); + my $addr = AddressAdd($start_addr, $offset); + for (my $i = 0; $i < $length; $i++) { + if (defined($cumulative->{$addr})) { + PrintDisassembledFunction($lib->[0], $offset, + $routine, $flat, $cumulative, + $start_addr, $end_addr, $total); + last; + } + $addr = AddressInc($addr); + } + } + } +} + +# Return reference to array of tuples of the form: +# [start_address, filename, linenumber, instruction, limit_address] +# E.g., +# ["0x806c43d", "/foo/bar.cc", 131, "ret", "0x806c440"] +sub Disassemble { + my $prog = shift; + my $offset = shift; + my $start_addr = shift; + my $end_addr = shift; + + my $objdump = $obj_tool_map{"objdump"}; + my $cmd = ShellEscape($objdump, "-C", "-d", "-l", "--no-show-raw-insn", + "--start-address=0x$start_addr", + "--stop-address=0x$end_addr", $prog); + open(OBJDUMP, "$cmd |") || error("$cmd: $!\n"); + my @result = (); + my $filename = ""; + my $linenumber = -1; + my $last = ["", "", "", ""]; + while (<OBJDUMP>) { + s/\r//g; # turn windows-looking lines into unix-looking lines + chop; + if (m|\s*([^:\s]+):(\d+)\s*$|) { + # Location line of the form: + # <filename>:<linenumber> + $filename = $1; + $linenumber = $2; + } elsif (m/^ +([0-9a-f]+):\s*(.*)/) { + # Disassembly line -- zero-extend address to full length + my $addr = HexExtend($1); + my $k = AddressAdd($addr, $offset); + $last->[4] = $k; # Store ending address for previous instruction + $last = [$k, $filename, $linenumber, $2, $end_addr]; + push(@result, $last); + } + } + close(OBJDUMP); + return @result; +} + +# The input file should contain lines of the form /proc/maps-like +# output (same format as expected from the profiles) or that looks +# like hex addresses (like "0xDEADBEEF"). We will parse all +# /proc/maps output, and for all the hex addresses, we will output +# "short" symbol names, one per line, in the same order as the input. +sub PrintSymbols { + my $maps_and_symbols_file = shift; + + # ParseLibraries expects pcs to be in a set. Fine by us...
+ my @pclist = (); # pcs in sorted order + my $pcs = {}; + my $map = ""; + foreach my $line (<$maps_and_symbols_file>) { + $line =~ s/\r//g; # turn windows-looking lines into unix-looking lines + if ($line =~ /\b(0x[0-9a-f]+)\b/i) { + push(@pclist, HexExtend($1)); + $pcs->{$pclist[-1]} = 1; + } else { + $map .= $line; + } + } + + my $libs = ParseLibraries($main::prog, $map, $pcs); + my $symbols = ExtractSymbols($libs, $pcs); + + foreach my $pc (@pclist) { + # ->[0] is the shortname, ->[2] is the full name + print(($symbols->{$pc}->[0] || "??") . "\n"); + } +} + + +# For sorting functions by name +sub ByName { + return ShortFunctionName($a) cmp ShortFunctionName($b); +} + +# Print source-listing for all routines that match $list_opts +sub PrintListing { + my $total = shift; + my $libs = shift; + my $flat = shift; + my $cumulative = shift; + my $list_opts = shift; + my $html = shift; + + my $output = \*STDOUT; + my $fname = ""; + + if ($html) { + # Arrange to write the output to a temporary file + $fname = TempName($main::next_tmpfile, "html"); + $main::next_tmpfile++; + if (!open(TEMP, ">$fname")) { + print STDERR "$fname: $!\n"; + return; + } + $output = \*TEMP; + print $output HtmlListingHeader(); + printf $output ("<div class=\"legend\">%s<br>Total: %s %s</div>\n", + $main::prog, Unparse($total), Units()); + } + + my $listed = 0; + foreach my $lib (@{$libs}) { + my $symbol_table = GetProcedureBoundaries($lib->[0], $list_opts); + my $offset = AddressSub($lib->[1], $lib->[3]); + foreach my $routine (sort ByName keys(%{$symbol_table})) { + # Print if there are any samples in this routine + my $start_addr = $symbol_table->{$routine}->[0]; + my $end_addr = $symbol_table->{$routine}->[1]; + my $length = hex(AddressSub($end_addr, $start_addr)); + my $addr = AddressAdd($start_addr, $offset); + for (my $i = 0; $i < $length; $i++) { + if (defined($cumulative->{$addr})) { + $listed += PrintSource( + $lib->[0], $offset, + $routine, $flat, $cumulative, + $start_addr, $end_addr, + $html, + $output); + last; + } + $addr = AddressInc($addr); + } + } + } + + if ($html) { + if ($listed > 0) { + print $output HtmlListingFooter(); + close($output); + RunWeb($fname); + } else { + close($output); + unlink($fname); + } + } +} + +sub HtmlListingHeader { + return <<'EOF'; +<!DOCTYPE html> +<html> +<head> +<title>Pprof listing</title> +<style type="text/css"> +body { + font-family: sans-serif; +} +h1 { + font-size: 1.5em; + margin-bottom: 4px; +} +.legend { + font-size: 1.25em; +} +.line { + color: #aaaaaa; +} +.nop { + color: #aaaaaa; +} +.unimportant { + color: #cccccc; +} +.disasmloc { + color: #000000; +} +.deadsrc { + cursor: pointer; +} +.deadsrc:hover { + background-color: #eeeeee; +} +.livesrc { + color: #0000ff; + cursor: pointer; +} +.livesrc:hover { + background-color: #eeeeee; +} +.asm { + color: #008800; + display: none; +} +</style> +<script type="text/javascript"> +function pprof_toggle_asm(e) { + var target; + if (!e) e = window.event; + if (e.target) target = e.target; + else if (e.srcElement) target = e.srcElement; + + if (target) { + var asm = target.nextSibling; + if (asm && asm.className == "asm") { + asm.style.display = (asm.style.display == "block" ? "" : "block"); + e.preventDefault(); + return false; + } + } +} +</script> +</head> +<body> +EOF +} + +sub HtmlListingFooter { + return <<'EOF'; +</body> +</html> +EOF +} + +sub HtmlEscape { + my $text = shift; + $text =~ s/&/&amp;/g; + $text =~ s/</&lt;/g; + $text =~ s/>/&gt;/g; + return $text; +} + +# Returns the indentation of the line, if it has any non-whitespace +# characters. Otherwise, returns -1. +sub Indentation { + my $line = shift; + if (m/^(\s*)\S/) { + return length($1); + } else { + return -1; + } +} + +# If the symbol table contains inlining info, Disassemble() may tag an +# instruction with a location inside an inlined function. But for +# source listings, we prefer to use the location in the function we +# are listing. So use MapToSymbols() to fetch full location +# information for each instruction and then pick out the first +# location from a location list (location list contains callers before +# callees in case of inlining).
+# +# After this routine has run, each entry in $instructions contains: +# [0] start address +# [1] filename for function we are listing +# [2] line number for function we are listing +# [3] disassembly +# [4] limit address +# [5] most specific filename (may be different from [1] due to inlining) +# [6] most specific line number (may be different from [2] due to inlining) +sub GetTopLevelLineNumbers { + my ($lib, $offset, $instructions) = @_; + my $pcs = []; + for (my $i = 0; $i <= $#{$instructions}; $i++) { + push(@{$pcs}, $instructions->[$i]->[0]); + } + my $symbols = {}; + MapToSymbols($lib, $offset, $pcs, $symbols); + for (my $i = 0; $i <= $#{$instructions}; $i++) { + my $e = $instructions->[$i]; + push(@{$e}, $e->[1]); + push(@{$e}, $e->[2]); + my $addr = $e->[0]; + my $sym = $symbols->{$addr}; + if (defined($sym)) { + if ($#{$sym} >= 2 && $sym->[1] =~ m/^(.*):(\d+)$/) { + $e->[1] = $1; # File name + $e->[2] = $2; # Line number + } + } + } +} + +# Print source-listing for one routine +sub PrintSource { + my $prog = shift; + my $offset = shift; + my $routine = shift; + my $flat = shift; + my $cumulative = shift; + my $start_addr = shift; + my $end_addr = shift; + my $html = shift; + my $output = shift; + + # Disassemble all instructions (just to get line numbers) + my @instructions = Disassemble($prog, $offset, $start_addr, $end_addr); + GetTopLevelLineNumbers($prog, $offset, \@instructions); + + # Hack 1: assume that the first source file encountered in the + # disassembly contains the routine + my $filename = undef; + for (my $i = 0; $i <= $#instructions; $i++) { + if ($instructions[$i]->[2] >= 0) { + $filename = $instructions[$i]->[1]; + last; + } + } + if (!defined($filename)) { + print STDERR "no filename found in $routine\n"; + return 0; + } + + # Hack 2: assume that the largest line number from $filename is the + # end of the procedure. This is typically safe since if P1 contains + # an inlined call to P2, then P2 usually occurs earlier in the + # source file. If this does not work, we might have to compute a + # density profile or just print all regions we find. + my $lastline = 0; + for (my $i = 0; $i <= $#instructions; $i++) { + my $f = $instructions[$i]->[1]; + my $l = $instructions[$i]->[2]; + if (($f eq $filename) && ($l > $lastline)) { + $lastline = $l; + } + } + + # Hack 3: assume the first source location from "filename" is the start of + # the source code. + my $firstline = 1; + for (my $i = 0; $i <= $#instructions; $i++) { + if ($instructions[$i]->[1] eq $filename) { + $firstline = $instructions[$i]->[2]; + last; + } + } + + # Hack 4: Extend last line forward until its indentation is less than + # the indentation we saw on $firstline + my $oldlastline = $lastline; + { + if (!open(FILE, "<$filename")) { + print STDERR "$filename: $!\n"; + return 0; + } + my $l = 0; + my $first_indentation = -1; + while (<FILE>) { + s/\r//g; # turn windows-looking lines into unix-looking lines + $l++; + my $indent = Indentation($_); + if ($l >= $firstline) { + if ($first_indentation < 0 && $indent >= 0) { + $first_indentation = $indent; + last if ($first_indentation == 0); + } + } + if ($l >= $lastline && $indent >= 0) { + if ($indent >= $first_indentation) { + $lastline = $l+1; + } else { + last; + } + } + } + close(FILE); + } + + # Assign all samples to the range $firstline,$lastline, + # Hack 4: If an instruction does not occur in the range, its samples + # are moved to the next instruction that occurs in the range.
+ my $samples1 = {}; # Map from line number to flat count + my $samples2 = {}; # Map from line number to cumulative count + my $running1 = 0; # Unassigned flat counts + my $running2 = 0; # Unassigned cumulative counts + my $total1 = 0; # Total flat counts + my $total2 = 0; # Total cumulative counts + my %disasm = (); # Map from line number to disassembly + my $running_disasm = ""; # Unassigned disassembly + my $skip_marker = "---\n"; + if ($html) { + $skip_marker = ""; + for (my $l = $firstline; $l <= $lastline; $l++) { + $disasm{$l} = ""; + } + } + my $last_dis_filename = ''; + my $last_dis_linenum = -1; + my $last_touched_line = -1; # To detect gaps in disassembly for a line + foreach my $e (@instructions) { + # Add up counts for all address that fall inside this instruction + my $c1 = 0; + my $c2 = 0; + for (my $a = $e->[0]; $a lt $e->[4]; $a = AddressInc($a)) { + $c1 += GetEntry($flat, $a); + $c2 += GetEntry($cumulative, $a); + } + + if ($html) { + my $dis = sprintf(" %6s %6s \t\t%8s: %s ", + HtmlPrintNumber($c1), + HtmlPrintNumber($c2), + UnparseAddress($offset, $e->[0]), + CleanDisassembly($e->[3])); + + # Append the most specific source line associated with this instruction + if (length($dis) < 80) { $dis .= (' ' x (80 - length($dis))) }; + $dis = HtmlEscape($dis); + my $f = $e->[5]; + my $l = $e->[6]; + if ($f ne $last_dis_filename) { + $dis .= sprintf("<span class=disasmloc>%s:%d</span>", + HtmlEscape(CleanFileName($f)), $l); + } elsif ($l ne $last_dis_linenum) { + # De-emphasize the unchanged file name portion + $dis .= sprintf("<span class=unimportant>%s</span>" . + "<span class=disasmloc>:%d</span>", + HtmlEscape(CleanFileName($f)), $l); + } else { + # De-emphasize the entire location + $dis .= sprintf("<span class=unimportant>%s:%d</span>", + HtmlEscape(CleanFileName($f)), $l); + } + $last_dis_filename = $f; + $last_dis_linenum = $l; + $running_disasm .= $dis; + $running_disasm .= "\n"; + } + + $running1 += $c1; + $running2 += $c2; + $total1 += $c1; + $total2 += $c2; + my $file = $e->[1]; + my $line = $e->[2]; + if (($file eq $filename) && + ($line >= $firstline) && + ($line <= $lastline)) { + # Assign all accumulated samples to this line + AddEntry($samples1, $line, $running1); + AddEntry($samples2, $line, $running2); + $running1 = 0; + $running2 = 0; + if ($html) { + if ($line != $last_touched_line && $disasm{$line} ne '') { + $disasm{$line} .= "\n"; + } + $disasm{$line} .= $running_disasm; + $running_disasm = ''; + $last_touched_line = $line; + } + } + } + + # Assign any leftover samples to $lastline + AddEntry($samples1, $lastline, $running1); + AddEntry($samples2, $lastline, $running2); + if ($html) { + if ($lastline != $last_touched_line && $disasm{$lastline} ne '') { + $disasm{$lastline} .= "\n"; + } + $disasm{$lastline} .= $running_disasm; + } + + if ($html) { + printf $output ( + "<h1>%s</h1>%s\n<pre onClick=\"pprof_toggle_asm()\">\n" .
+      "Total:%6s %6s (flat / cumulative %s)\n",
+      HtmlEscape(ShortFunctionName($routine)),
+      HtmlEscape(CleanFileName($filename)),
+      Unparse($total1),
+      Unparse($total2),
+      Units());
+  } else {
+    printf $output (
+      "ROUTINE ====================== %s in %s\n" .
+      "%6s %6s Total %s (flat / cumulative)\n",
+      ShortFunctionName($routine),
+      CleanFileName($filename),
+      Unparse($total1),
+      Unparse($total2),
+      Units());
+  }
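The sample-assignment loop earlier in PrintSource rolls counts forward: an instruction whose location falls outside the [firstline, lastline] window donates its counts to the next in-window instruction, and whatever is left at the end is credited to the last line, so no samples are silently dropped. A Rust restatement of that bookkeeping (names and types are illustrative, not from the script):

```rust
use std::collections::HashMap;

/// `instrs` holds (source line, flat count) per disassembled
/// instruction, in address order.
fn assign_samples(instrs: &[(u32, u64)], first_line: u32, last_line: u32) -> HashMap<u32, u64> {
    let mut per_line: HashMap<u32, u64> = HashMap::new();
    let mut pending: u64 = 0; // counts not yet attributed to an in-window line
    for &(line, count) in instrs {
        pending += count;
        if (first_line..=last_line).contains(&line) {
            // Credit everything accumulated so far to this line.
            *per_line.entry(line).or_insert(0) += pending;
            pending = 0;
        }
    }
    // Leftover samples go to the last line, as in the script.
    *per_line.entry(last_line).or_insert(0) += pending;
    per_line
}
```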
+  if (!open(FILE, "<$filename")) {
+    print STDERR "$filename: $!\n";
+    return 0;
+  }
+  my $l = 0;
+  while (<FILE>) {
+    s/\r//g;         # turn windows-looking lines into unix-looking lines
+    $l++;
+    if ($l >= $firstline - 5 &&
+        (($l <= $oldlastline + 5) || ($l <= $lastline))) {
+      chop;
+      my $text = $_;
+      if ($l == $firstline) { print $output $skip_marker; }
+      my $n1 = GetEntry($samples1, $l);
+      my $n2 = GetEntry($samples2, $l);
+      if ($html) {
+        # Emit a span that has one of the following classes:
+        #    livesrc -- has samples
+        #    deadsrc -- has disassembly, but with no samples
+        #    nop     -- has no matching disassembly
+        # Also emit an optional span containing disassembly.
+        my $dis = $disasm{$l};
+        my $asm = "";
+        if (defined($dis) && $dis ne '') {
+          $asm = "<span class=\"asm\">" . $dis . "</span>";
+        }
+        my $source_class = (($n1 + $n2 > 0)
+                            ? "livesrc"
+                            : (($asm ne "") ? "deadsrc" : "nop"));
+        printf $output (
+          "<span class=\"line\">%5d</span> " .
+          "<span class=\"%s\">%6s %6s %s</span>%s\n",
+          $l, $source_class,
+          HtmlPrintNumber($n1),
+          HtmlPrintNumber($n2),
+          HtmlEscape($text),
+          $asm);
+      } else {
+        printf $output(
+          "%6s %6s %4d: %s\n",
+          UnparseAlt($n1),
+          UnparseAlt($n2),
+          $l,
+          $text);
+      }
+      if ($l == $lastline)  { print $output $skip_marker; }
+    };
+  }
+  close(FILE);
+  if ($html) {
+    print $output "</pre>\n";
+  } +  return 1; +} + +# Return the source line for the specified file/linenumber. +# Returns undef if not found. +sub SourceLine { + my $file = shift; + my $line = shift; + + # Look in cache + if (!defined($main::source_cache{$file})) { + if (100 < scalar keys(%main::source_cache)) { + # Clear the cache when it gets too big + $main::source_cache = (); + } + + # Read all lines from the file + if (!open(FILE, "<$file")) { + print STDERR "$file: $!\n"; + $main::source_cache{$file} = []; # Cache the negative result + return undef; + } + my $lines = []; + push(@{$lines}, ""); # So we can use 1-based line numbers as indices + while (<FILE>) { + push(@{$lines}, $_); + } + close(FILE); + + # Save the lines in the cache + $main::source_cache{$file} = $lines; + } + + my $lines = $main::source_cache{$file}; + if (($line < 0) || ($line > $#{$lines})) { + return undef; + } else { + return $lines->[$line]; + } +} + +# Print disassembly for one routine with interspersed source if available +sub PrintDisassembledFunction { + my $prog = shift; + my $offset = shift; + my $routine = shift; + my $flat = shift; + my $cumulative = shift; + my $start_addr = shift; + my $end_addr = shift; + my $total = shift; + + # Disassemble all instructions + my @instructions = Disassemble($prog, $offset, $start_addr, $end_addr); + + # Make array of counts per instruction + my @flat_count = (); + my @cum_count = (); + my $flat_total = 0; + my $cum_total = 0; + foreach my $e (@instructions) { + # Add up counts for all address that fall inside this instruction + my $c1 = 0; + my $c2 = 0; + for (my $a = $e->[0]; $a lt $e->[4]; $a = AddressInc($a)) { + $c1 += GetEntry($flat, $a); + $c2 += GetEntry($cumulative, $a); + } + push(@flat_count, $c1); + push(@cum_count, $c2); + $flat_total += $c1; + $cum_total += $c2; + } + + # Print header with total counts + printf("ROUTINE ====================== %s\n" . + "%6s %6s %s (flat, cumulative) %.1f%% of total\n", + ShortFunctionName($routine), + Unparse($flat_total), + Unparse($cum_total), + Units(), + ($cum_total * 100.0) / $total); + + # Process instructions in order + my $current_file = ""; + for (my $i = 0; $i <= $#instructions; ) { + my $e = $instructions[$i]; + + # Print the new file name whenever we switch files + if ($e->[1] ne $current_file) { + $current_file = $e->[1]; + my $fname = $current_file; + $fname =~ s|^\./||; # Trim leading "./" + + # Shorten long file names + if (length($fname) >= 58) { + $fname = "..." . substr($fname, -55); + } + printf("-------------------- %s\n", $fname); + } + + # TODO: Compute range of lines to print together to deal with + # small reorderings.
+ my $first_line = $e->[2]; + my $last_line = $first_line; + my %flat_sum = (); + my %cum_sum = (); + for (my $l = $first_line; $l <= $last_line; $l++) { + $flat_sum{$l} = 0; + $cum_sum{$l} = 0; + } + + # Find run of instructions for this range of source lines + my $first_inst = $i; + while (($i <= $#instructions) && + ($instructions[$i]->[2] >= $first_line) && + ($instructions[$i]->[2] <= $last_line)) { + $e = $instructions[$i]; + $flat_sum{$e->[2]} += $flat_count[$i]; + $cum_sum{$e->[2]} += $cum_count[$i]; + $i++; + } + my $last_inst = $i - 1; + + # Print source lines + for (my $l = $first_line; $l <= $last_line; $l++) { + my $line = SourceLine($current_file, $l); + if (!defined($line)) { + $line = "?\n"; + next; + } else { + $line =~ s/^\s+//; + } + printf("%6s %6s %5d: %s", + UnparseAlt($flat_sum{$l}), + UnparseAlt($cum_sum{$l}), + $l, + $line); + } + + # Print disassembly + for (my $x = $first_inst; $x <= $last_inst; $x++) { + my $e = $instructions[$x]; + printf("%6s %6s %8s: %6s\n", + UnparseAlt($flat_count[$x]), + UnparseAlt($cum_count[$x]), + UnparseAddress($offset, $e->[0]), + CleanDisassembly($e->[3])); + } + } +} + +# Print DOT graph +sub PrintDot { + my $prog = shift; + my $symbols = shift; + my $raw = shift; + my $flat = shift; + my $cumulative = shift; + my $overall_total = shift; + + # Get total + my $local_total = TotalProfile($flat); + my $nodelimit = int($main::opt_nodefraction * $local_total); + my $edgelimit = int($main::opt_edgefraction * $local_total); + my $nodecount = $main::opt_nodecount; + + # Find nodes to include + my @list = (sort { abs(GetEntry($cumulative, $b)) <=> + abs(GetEntry($cumulative, $a)) + || $a cmp $b } + keys(%{$cumulative})); + my $last = $nodecount - 1; + if ($last > $#list) { + $last = $#list; + } + while (($last >= 0) && + (abs(GetEntry($cumulative, $list[$last])) <= $nodelimit)) { + $last--; + } + if ($last < 0) { + print STDERR "No nodes to print\n"; + return 0; + } + + if ($nodelimit > 0 || $edgelimit > 0) { + printf STDERR ("Dropping nodes with <= %s %s; edges with <= %s abs(%s)\n", + Unparse($nodelimit), Units(), + Unparse($edgelimit), Units()); + } + + # Open DOT output file + my $output; + my $escaped_dot = ShellEscape(@DOT); + my $escaped_ps2pdf = ShellEscape(@PS2PDF); + if ($main::opt_gv) { + my $escaped_outfile = ShellEscape(TempName($main::next_tmpfile, "ps")); + $output = "| $escaped_dot -Tps2 >$escaped_outfile"; + } elsif ($main::opt_evince) { + my $escaped_outfile = ShellEscape(TempName($main::next_tmpfile, "pdf")); + $output = "| $escaped_dot -Tps2 | $escaped_ps2pdf - $escaped_outfile"; + } elsif ($main::opt_ps) { + $output = "| $escaped_dot -Tps2"; + } elsif ($main::opt_pdf) { + $output = "| $escaped_dot -Tps2 | $escaped_ps2pdf - -"; + } elsif ($main::opt_web || $main::opt_svg) { + # We need to post-process the SVG, so write to a temporary file always. + my $escaped_outfile = ShellEscape(TempName($main::next_tmpfile, "svg")); + $output = "| $escaped_dot -Tsvg >$escaped_outfile"; + } elsif ($main::opt_gif) { + $output = "| $escaped_dot -Tgif"; + } else { + $output = ">&STDOUT"; + } + open(DOT, $output) || error("$output: $!\n"); + + # Title + printf DOT ("digraph \"%s; %s %s\" {\n", + $prog, + Unparse($overall_total), + Units()); + if ($main::opt_pdf) { + # The output is more printable if we set the page size for dot. + printf DOT ("size=\"8,11\"\n"); + } + printf DOT ("node [width=0.375,height=0.25];\n"); + + # Print legend + printf DOT ("Legend [shape=box,fontsize=24,shape=plaintext," . 
+ "label=\"%s\\l%s\\l%s\\l%s\\l%s\\l\"];\n", + $prog, + sprintf("Total %s: %s", Units(), Unparse($overall_total)), + sprintf("Focusing on: %s", Unparse($local_total)), + sprintf("Dropped nodes with <= %s abs(%s)", + Unparse($nodelimit), Units()), + sprintf("Dropped edges with <= %s %s", + Unparse($edgelimit), Units()) + ); + + # Print nodes + my %node = (); + my $nextnode = 1; + foreach my $a (@list[0..$last]) { + # Pick font size + my $f = GetEntry($flat, $a); + my $c = GetEntry($cumulative, $a); + + my $fs = 8; + if ($local_total > 0) { + $fs = 8 + (50.0 * sqrt(abs($f * 1.0 / $local_total))); + } + + $node{$a} = $nextnode++; + my $sym = $a; + $sym =~ s/\s+/\\n/g; + $sym =~ s/::/\\n/g; + + # Extra cumulative info to print for non-leaves + my $extra = ""; + if ($f != $c) { + $extra = sprintf("\\rof %s (%s)", + Unparse($c), + Percent($c, $local_total)); + } + my $style = ""; + if ($main::opt_heapcheck) { + if ($f > 0) { + # make leak-causing nodes more visible (add a background) + $style = ",style=filled,fillcolor=gray" + } elsif ($f < 0) { + # make anti-leak-causing nodes (which almost never occur) + # stand out as well (triple border) + $style = ",peripheries=3" + } + } + + printf DOT ("N%d [label=\"%s\\n%s (%s)%s\\r" . + "\",shape=box,fontsize=%.1f%s];\n", + $node{$a}, + $sym, + Unparse($f), + Percent($f, $local_total), + $extra, + $fs, + $style, + ); + } + + # Get edges and counts per edge + my %edge = (); + my $n; + my $fullname_to_shortname_map = {}; + FillFullnameToShortnameMap($symbols, $fullname_to_shortname_map); + foreach my $k (keys(%{$raw})) { + # TODO: omit low %age edges + $n = $raw->{$k}; + my @translated = TranslateStack($symbols, $fullname_to_shortname_map, $k); + for (my $i = 1; $i <= $#translated; $i++) { + my $src = $translated[$i]; + my $dst = $translated[$i-1]; + #next if ($src eq $dst); # Avoid self-edges? + if (exists($node{$src}) && exists($node{$dst})) { + my $edge_label = "$src\001$dst"; + if (!exists($edge{$edge_label})) { + $edge{$edge_label} = 0; + } + $edge{$edge_label} += $n; + } + } + } + + # Print edges (process in order of decreasing counts) + my %indegree = (); # Number of incoming edges added per node so far + my %outdegree = (); # Number of outgoing edges added per node so far + foreach my $e (sort { $edge{$b} <=> $edge{$a} } keys(%edge)) { + my @x = split(/\001/, $e); + $n = $edge{$e}; + + # Initialize degree of kept incoming and outgoing edges if necessary + my $src = $x[0]; + my $dst = $x[1]; + if (!exists($outdegree{$src})) { $outdegree{$src} = 0; } + if (!exists($indegree{$dst})) { $indegree{$dst} = 0; } + + my $keep; + if ($indegree{$dst} == 0) { + # Keep edge if needed for reachability + $keep = 1; + } elsif (abs($n) <= $edgelimit) { + # Drop if we are below --edgefraction + $keep = 0; + } elsif ($outdegree{$src} >= $main::opt_maxdegree || + $indegree{$dst} >= $main::opt_maxdegree) { + # Keep limited number of in/out edges per node + $keep = 0; + } else { + $keep = 1; + } + + if ($keep) { + $outdegree{$src}++; + $indegree{$dst}++; + + # Compute line width based on edge count + my $fraction = abs($local_total ? (3 * ($n / $local_total)) : 0); + if ($fraction > 1) { $fraction = 1; } + my $w = $fraction * 2; + if ($w < 1 && ($main::opt_web || $main::opt_svg)) { + # SVG output treats line widths < 1 poorly. 
+        $w = 1;
+      }
+
+      # Dot sometimes segfaults if given edge weights that are too large, so
+      # we cap the weights at a large value
+      my $edgeweight = abs($n) ** 0.7;
+      if ($edgeweight > 100000) { $edgeweight = 100000; }
+      $edgeweight = int($edgeweight);
+
+      my $style = sprintf("setlinewidth(%f)", $w);
+      if ($x[1] =~ m/\(inline\)/) {
+        $style .= ",dashed";
+      }
+
+      # Use a slightly squashed function of the edge count as the weight
+      printf DOT ("N%s -> N%s [label=%s, weight=%d, style=\"%s\"];\n",
+                  $node{$x[0]},
+                  $node{$x[1]},
+                  Unparse($n),
+                  $edgeweight,
+                  $style);
+    }
+  }
+
+  print DOT ("}\n");
+  close(DOT);
+
+  if ($main::opt_web || $main::opt_svg) {
+    # Rewrite SVG to be more usable inside web browser.
+    RewriteSvg(TempName($main::next_tmpfile, "svg"));
+  }
+
+  return 1;
+}
+
+sub RewriteSvg {
+  my $svgfile = shift;
+
+  open(SVG, $svgfile) || die "open temp svg: $!";
+  my @svg = <SVG>;
+  close(SVG);
+  unlink $svgfile;
+  my $svg = join('', @svg);
+
+  # Dot's SVG output is
+  #
+  #    <svg width="___" height="___"
+  #     viewBox="___" xmlns=...>
+  #    <g id="graph0" transform="...">
+  #    ...
+  #    </g>
+  #    </svg>
+  #
+  # Change it to
+  #
+  #    <svg width="100%" height="100%"
+  #     xmlns=...>
+  #    $svg_javascript
+  #    <g id="viewport" transform="translate(0,0)">
+  #    <g id="graph0" transform="...">
+  #    ...
+  #    </g>
+  #    </g>
+  #    </svg>
+
+  # Fix width, height; drop viewBox.
+  $svg =~ s/(?s)<svg width="[^"]+" height="[^"]+"(.*?)viewBox="[^"]+"/<svg width="100%" height="100%"$1/;
+
+  # Insert script, viewport <g> above first <g>
+  my $svg_javascript = SvgJavascript();
+  my $viewport = "<g id=\"viewport\" transform=\"translate(0,0)\">\n";
+  $svg =~ s/<g id="graph\d"/$svg_javascript$viewport$&/;
+
+  # Insert final </g> above </svg>.
+  $svg =~ s/(.*)(<\/svg>)/$1<\/g>$2/;
+  $svg =~ s/<g id="graph\d"(.*?)/<g id="viewport"$1/;
+
+  if ($main::opt_svg) {
+    # --svg: write to standard output.
+    print $svg;
+  } else {
+    # Write back to temporary file.
+    open(SVG, ">$svgfile") || die "open $svgfile: $!";
+    print SVG $svg;
+    close(SVG);
+  }
+}
+
+sub SvgJavascript {
+  # Returns the inline ECMAScript (pan/zoom handlers) that RewriteSvg()
+  # injects into the generated SVG.  The multi-hundred-line script body
+  # is elided here.
+  return <<'EOF';
+EOF
+}
+
+# Provides a map from fullname to shortname for cases where the
+# shortname is ambiguous.  The symlist has both the fullname and
+# shortname for all symbols, which is usually fine, but sometimes --
+# such as overloaded functions -- two different fullnames can map to
+# the same shortname.  In that case, we use the address of the
+# function to disambiguate the two.  This function fills in a map that
+# maps fullnames to modified shortnames in such cases.  If a fullname
+# is not present in the map, the 'normal' shortname provided by the
+# symlist is the appropriate one to use.
+sub FillFullnameToShortnameMap {
+  my $symbols = shift;
+  my $fullname_to_shortname_map = shift;
+  my $shortnames_seen_once = {};
+  my $shortnames_seen_more_than_once = {};
+
+  foreach my $symlist (values(%{$symbols})) {
+    # TODO(csilvers): deal with inlined symbols too.
+    my $shortname = $symlist->[0];
+    my $fullname = $symlist->[2];
+    if ($fullname !~ /<[0-9a-fA-F]+>$/) {  # fullname doesn't end in an address
+      next;      # the only collisions we care about are when addresses differ
+    }
+    if (defined($shortnames_seen_once->{$shortname}) &&
+        $shortnames_seen_once->{$shortname} ne $fullname) {
+      $shortnames_seen_more_than_once->{$shortname} = 1;
+    } else {
+      $shortnames_seen_once->{$shortname} = $fullname;
+    }
+  }
+
+  foreach my $symlist (values(%{$symbols})) {
+    my $shortname = $symlist->[0];
+    my $fullname = $symlist->[2];
+    # TODO(csilvers): take in a list of addresses we care about, and only
+    # store in the map if $symlist->[1] is in that list.  Saves space.
+    next if defined($fullname_to_shortname_map->{$fullname});
+    if (defined($shortnames_seen_more_than_once->{$shortname})) {
+      if ($fullname =~ /<0*([^>]*)>$/) {  # fullname has address at end of it
+        $fullname_to_shortname_map->{$fullname} = "$shortname\@$1";
+      }
+    }
+  }
+}
+
+# Return a small number that identifies the argument.
+# Multiple calls with the same argument will return the same number.
+# Calls with different arguments will return different numbers.
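+# Illustrative behavior (the keys here are hypothetical): ids are handed
+# out in first-seen order and are stable across repeated calls:
+#   ShortIdFor("00000000004005d0");   # -> 1
+#   ShortIdFor("00000000004007a8");   # -> 2
+#   ShortIdFor("00000000004005d0");   # -> 1 again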
+sub ShortIdFor { + my $key = shift; + my $id = $main::uniqueid{$key}; + if (!defined($id)) { + $id = keys(%main::uniqueid) + 1; + $main::uniqueid{$key} = $id; + } + return $id; +} + +# Translate a stack of addresses into a stack of symbols +sub TranslateStack { + my $symbols = shift; + my $fullname_to_shortname_map = shift; + my $k = shift; + + my @addrs = split(/\n/, $k); + my @result = (); + for (my $i = 0; $i <= $#addrs; $i++) { + my $a = $addrs[$i]; + + # Skip large addresses since they sometimes show up as fake entries on RH9 + if (length($a) > 8 && $a gt "7fffffffffffffff") { + next; + } + + if ($main::opt_disasm || $main::opt_list) { + # We want just the address for the key + push(@result, $a); + next; + } + + my $symlist = $symbols->{$a}; + if (!defined($symlist)) { + $symlist = [$a, "", $a]; + } + + # We can have a sequence of symbols for a particular entry + # (more than one symbol in the case of inlining). Callers + # come before callees in symlist, so walk backwards since + # the translated stack should contain callees before callers. + for (my $j = $#{$symlist}; $j >= 2; $j -= 3) { + my $func = $symlist->[$j-2]; + my $fileline = $symlist->[$j-1]; + my $fullfunc = $symlist->[$j]; + if (defined($fullname_to_shortname_map->{$fullfunc})) { + $func = $fullname_to_shortname_map->{$fullfunc}; + } + if ($j > 2) { + $func = "$func (inline)"; + } + + # Do not merge nodes corresponding to Callback::Run since that + # causes confusing cycles in dot display. Instead, we synthesize + # a unique name for this frame per caller. + if ($func =~ m/Callback.*::Run$/) { + my $caller = ($i > 0) ? $addrs[$i-1] : 0; + $func = "Run#" . ShortIdFor($caller); + } + + if ($main::opt_addresses) { + push(@result, "$a $func $fileline"); + } elsif ($main::opt_lines) { + if ($func eq '??' && $fileline eq '??:0') { + push(@result, "$a"); + } else { + push(@result, "$func $fileline"); + } + } elsif ($main::opt_functions) { + if ($func eq '??') { + push(@result, "$a"); + } else { + push(@result, $func); + } + } elsif ($main::opt_files) { + if ($fileline eq '??:0' || $fileline eq '') { + push(@result, "$a"); + } else { + my $f = $fileline; + $f =~ s/:\d+$//; + push(@result, $f); + } + } else { + push(@result, $a); + last; # Do not print inlined info + } + } + } + + # print join(",", @addrs), " => ", join(",", @result), "\n"; + return @result; +} + +# Generate percent string for a number and a total +sub Percent { + my $num = shift; + my $tot = shift; + if ($tot != 0) { + return sprintf("%.1f%%", $num * 100.0 / $tot); + } else { + return ($num == 0) ? "nan" : (($num > 0) ? "+inf" : "-inf"); + } +} + +# Generate pretty-printed form of number +sub Unparse { + my $num = shift; + if ($main::profile_type eq 'heap' || $main::profile_type eq 'growth') { + if ($main::opt_inuse_objects || $main::opt_alloc_objects) { + return sprintf("%d", $num); + } else { + if ($main::opt_show_bytes) { + return sprintf("%d", $num); + } else { + return sprintf("%.1f", $num / 1048576.0); + } + } + } elsif ($main::profile_type eq 'contention' && !$main::opt_contentions) { + return sprintf("%.3f", $num / 1e9); # Convert nanoseconds to seconds + } else { + return sprintf("%d", $num); + } +} + +# Alternate pretty-printed form: 0 maps to "." 
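+# For example, in a CPU profile (where Unparse() prints raw sample
+# counts), UnparseAlt(0) yields "." and UnparseAlt(42) yields "42";
+# the values are illustrative only.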
+sub UnparseAlt { + my $num = shift; + if ($num == 0) { + return "."; + } else { + return Unparse($num); + } +} + +# Alternate pretty-printed form: 0 maps to "" +sub HtmlPrintNumber { + my $num = shift; + if ($num == 0) { + return ""; + } else { + return Unparse($num); + } +} + +# Return output units +sub Units { + if ($main::profile_type eq 'heap' || $main::profile_type eq 'growth') { + if ($main::opt_inuse_objects || $main::opt_alloc_objects) { + return "objects"; + } else { + if ($main::opt_show_bytes) { + return "B"; + } else { + return "MB"; + } + } + } elsif ($main::profile_type eq 'contention' && !$main::opt_contentions) { + return "seconds"; + } else { + return "samples"; + } +} + +##### Profile manipulation code ##### + +# Generate flattened profile: +# If count is charged to stack [a,b,c,d], in generated profile, +# it will be charged to [a] +sub FlatProfile { + my $profile = shift; + my $result = {}; + foreach my $k (keys(%{$profile})) { + my $count = $profile->{$k}; + my @addrs = split(/\n/, $k); + if ($#addrs >= 0) { + AddEntry($result, $addrs[0], $count); + } + } + return $result; +} + +# Generate cumulative profile: +# If count is charged to stack [a,b,c,d], in generated profile, +# it will be charged to [a], [b], [c], [d] +sub CumulativeProfile { + my $profile = shift; + my $result = {}; + foreach my $k (keys(%{$profile})) { + my $count = $profile->{$k}; + my @addrs = split(/\n/, $k); + foreach my $a (@addrs) { + AddEntry($result, $a, $count); + } + } + return $result; +} + +# If the second-youngest PC on the stack is always the same, returns +# that pc. Otherwise, returns undef. +sub IsSecondPcAlwaysTheSame { + my $profile = shift; + + my $second_pc = undef; + foreach my $k (keys(%{$profile})) { + my @addrs = split(/\n/, $k); + if ($#addrs < 1) { + return undef; + } + if (not defined $second_pc) { + $second_pc = $addrs[1]; + } else { + if ($second_pc ne $addrs[1]) { + return undef; + } + } + } + return $second_pc; +} + +sub ExtractSymbolNameInlineStack { + my $symbols = shift; + my $address = shift; + + my @stack = (); + + if (exists $symbols->{$address}) { + my @localinlinestack = @{$symbols->{$address}}; + for (my $i = $#localinlinestack; $i > 0; $i-=3) { + my $file = $localinlinestack[$i-1]; + my $fn = $localinlinestack[$i-0]; + + if ($file eq "?" || $file eq ":0") { + $file = "??:0"; + } + if ($fn eq '??') { + # If we can't get the symbol name, at least use the file information. + $fn = $file; + } + my $suffix = "[inline]"; + if ($i == 2) { + $suffix = ""; + } + push (@stack, $fn.$suffix); + } + } + else { + # If we can't get a symbol name, at least fill in the address. + push (@stack, $address); + } + + return @stack; +} + +sub ExtractSymbolLocation { + my $symbols = shift; + my $address = shift; + # 'addr2line' outputs "??:0" for unknown locations; we do the + # same to be consistent. + my $location = "??:0:unknown"; + if (exists $symbols->{$address}) { + my $file = $symbols->{$address}->[1]; + if ($file eq "?") { + $file = "??:0" + } + $location = $file . ":" . $symbols->{$address}->[0]; + } + return $location; +} + +# Extracts a graph of calls. 
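+# The returned map is keyed by "file:line:function" locations as built
+# by ExtractSymbolLocation().  A hypothetical stack whose leaf is
+# bar.cc:20:bar, called from foo.cc:10:foo, with count 5 contributes:
+#   "bar.cc:20:bar"                  => 5   (samples ending at the leaf)
+#   "foo.cc:10:foo -> bar.cc:20:bar" => 5   (caller -> callee edge)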
+sub ExtractCalls { + my $symbols = shift; + my $profile = shift; + + my $calls = {}; + while( my ($stack_trace, $count) = each %$profile ) { + my @address = split(/\n/, $stack_trace); + my $destination = ExtractSymbolLocation($symbols, $address[0]); + AddEntry($calls, $destination, $count); + for (my $i = 1; $i <= $#address; $i++) { + my $source = ExtractSymbolLocation($symbols, $address[$i]); + my $call = "$source -> $destination"; + AddEntry($calls, $call, $count); + $destination = $source; + } + } + + return $calls; +} + +sub FilterFrames { + my $symbols = shift; + my $profile = shift; + + if ($main::opt_retain eq '' && $main::opt_exclude eq '') { + return $profile; + } + + my $result = {}; + foreach my $k (keys(%{$profile})) { + my $count = $profile->{$k}; + my @addrs = split(/\n/, $k); + my @path = (); + foreach my $a (@addrs) { + my $sym; + if (exists($symbols->{$a})) { + $sym = $symbols->{$a}->[0]; + } else { + $sym = $a; + } + if ($main::opt_retain ne '' && $sym !~ m/$main::opt_retain/) { + next; + } + if ($main::opt_exclude ne '' && $sym =~ m/$main::opt_exclude/) { + next; + } + push(@path, $a); + } + if (scalar(@path) > 0) { + my $reduced_path = join("\n", @path); + AddEntry($result, $reduced_path, $count); + } + } + + return $result; +} + +sub PrintCollapsedStacks { + my $symbols = shift; + my $profile = shift; + + while (my ($stack_trace, $count) = each %$profile) { + my @address = split(/\n/, $stack_trace); + my @names = reverse ( map { ExtractSymbolNameInlineStack($symbols, $_) } @address ); + printf("%s %d\n", join(";", @names), $count); + } +} + +sub RemoveUninterestingFrames { + my $symbols = shift; + my $profile = shift; + + # List of function names to skip + my %skip = (); + my $skip_regexp = 'NOMATCH'; + if ($main::profile_type eq 'heap' || $main::profile_type eq 'growth') { + foreach my $name ('@JEMALLOC_PREFIX@calloc', + 'cfree', + '@JEMALLOC_PREFIX@malloc', + 'je_malloc_default', + 'newImpl', + 'void* newImpl', + 'fallbackNewImpl', + 'void* fallbackNewImpl', + '@JEMALLOC_PREFIX@free', + '@JEMALLOC_PREFIX@memalign', + '@JEMALLOC_PREFIX@posix_memalign', + '@JEMALLOC_PREFIX@aligned_alloc', + 'pvalloc', + '@JEMALLOC_PREFIX@valloc', + '@JEMALLOC_PREFIX@realloc', + '@JEMALLOC_PREFIX@mallocx', + '@JEMALLOC_PREFIX@rallocx', + 'do_rallocx', + '@JEMALLOC_PREFIX@xallocx', + '@JEMALLOC_PREFIX@dallocx', + '@JEMALLOC_PREFIX@sdallocx', + '@JEMALLOC_PREFIX@sdallocx_noflags', + 'tc_calloc', + 'tc_cfree', + 'tc_malloc', + 'tc_free', + 'tc_memalign', + 'tc_posix_memalign', + 'tc_pvalloc', + 'tc_valloc', + 'tc_realloc', + 'tc_new', + 'tc_delete', + 'tc_newarray', + 'tc_deletearray', + 'tc_new_nothrow', + 'tc_newarray_nothrow', + 'do_malloc', + '::do_malloc', # new name -- got moved to an unnamed ns + '::do_malloc_or_cpp_alloc', + 'DoSampledAllocation', + 'simple_alloc::allocate', + '__malloc_alloc_template::allocate', + '__builtin_delete', + '__builtin_new', + '__builtin_vec_delete', + '__builtin_vec_new', + 'operator new', + 'operator new[]', + # The entry to our memory-allocation routines on OS X + 'malloc_zone_malloc', + 'malloc_zone_calloc', + 'malloc_zone_valloc', + 'malloc_zone_realloc', + 'malloc_zone_memalign', + 'malloc_zone_free', + # These mark the beginning/end of our custom sections + '__start_google_malloc', + '__stop_google_malloc', + '__start_malloc_hook', + '__stop_malloc_hook') { + $skip{$name} = 1; + $skip{"_" . 
$name} = 1; # Mach (OS X) adds a _ prefix to everything + } + # TODO: Remove TCMalloc once everything has been + # moved into the tcmalloc:: namespace and we have flushed + # old code out of the system. + $skip_regexp = "TCMalloc|^tcmalloc::"; + } elsif ($main::profile_type eq 'contention') { + foreach my $vname ('base::RecordLockProfileData', + 'base::SubmitMutexProfileData', + 'base::SubmitSpinLockProfileData', + 'Mutex::Unlock', + 'Mutex::UnlockSlow', + 'Mutex::ReaderUnlock', + 'MutexLock::~MutexLock', + 'SpinLock::Unlock', + 'SpinLock::SlowUnlock', + 'SpinLockHolder::~SpinLockHolder') { + $skip{$vname} = 1; + } + } elsif ($main::profile_type eq 'cpu') { + # Drop signal handlers used for CPU profile collection + # TODO(dpeng): this should not be necessary; it's taken + # care of by the general 2nd-pc mechanism below. + foreach my $name ('ProfileData::Add', # historical + 'ProfileData::prof_handler', # historical + 'CpuProfiler::prof_handler', + '__FRAME_END__', + '__pthread_sighandler', + '__restore') { + $skip{$name} = 1; + } + } else { + # Nothing skipped for unknown types + } + + if ($main::profile_type eq 'cpu') { + # If all the second-youngest program counters are the same, + # this STRONGLY suggests that it is an artifact of measurement, + # i.e., stack frames pushed by the CPU profiler signal handler. + # Hence, we delete them. + # (The topmost PC is read from the signal structure, not from + # the stack, so it does not get involved.) + while (my $second_pc = IsSecondPcAlwaysTheSame($profile)) { + my $result = {}; + my $func = ''; + if (exists($symbols->{$second_pc})) { + $second_pc = $symbols->{$second_pc}->[0]; + } + print STDERR "Removing $second_pc from all stack traces.\n"; + foreach my $k (keys(%{$profile})) { + my $count = $profile->{$k}; + my @addrs = split(/\n/, $k); + splice @addrs, 1, 1; + my $reduced_path = join("\n", @addrs); + AddEntry($result, $reduced_path, $count); + } + $profile = $result; + } + } + + my $result = {}; + foreach my $k (keys(%{$profile})) { + my $count = $profile->{$k}; + my @addrs = split(/\n/, $k); + my @path = (); + foreach my $a (@addrs) { + if (exists($symbols->{$a})) { + my $func = $symbols->{$a}->[0]; + if ($skip{$func} || ($func =~ m/$skip_regexp/)) { + # Throw away the portion of the backtrace seen so far, under the + # assumption that previous frames were for functions internal to the + # allocator. + @path = (); + next; + } + } + push(@path, $a); + } + my $reduced_path = join("\n", @path); + AddEntry($result, $reduced_path, $count); + } + + $result = FilterFrames($symbols, $result); + + return $result; +} + +# Reduce profile to granularity given by user +sub ReduceProfile { + my $symbols = shift; + my $profile = shift; + my $result = {}; + my $fullname_to_shortname_map = {}; + FillFullnameToShortnameMap($symbols, $fullname_to_shortname_map); + foreach my $k (keys(%{$profile})) { + my $count = $profile->{$k}; + my @translated = TranslateStack($symbols, $fullname_to_shortname_map, $k); + my @path = (); + my %seen = (); + $seen{''} = 1; # So that empty keys are skipped + foreach my $e (@translated) { + # To avoid double-counting due to recursion, skip a stack-trace + # entry if it has already been seen + if (!$seen{$e}) { + $seen{$e} = 1; + push(@path, $e); + } + } + my $reduced_path = join("\n", @path); + AddEntry($result, $reduced_path, $count); + } + return $result; +} + +# Does the specified symbol array match the regexp? 
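+# A symbol array holds repeated [shortname, fileline, fullname] triples,
+# and only the shortname and fileline of each triple are consulted.
+# E.g. for a hypothetical entry
+#   $sym = ["strlen", "strlen.c:42", "strlen<0x1234>"];
+# both SymbolMatches($sym, "strlen") and SymbolMatches($sym, "c:42")
+# return 1, while a pattern found only in the fullname (e.g. "0x1234")
+# returns 0.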
+sub SymbolMatches {
+  my $sym = shift;
+  my $re = shift;
+  if (defined($sym)) {
+    for (my $i = 0; $i < $#{$sym}; $i += 3) {
+      if ($sym->[$i] =~ m/$re/ || $sym->[$i+1] =~ m/$re/) {
+        return 1;
+      }
+    }
+  }
+  return 0;
+}
+
+# Focus only on paths involving specified regexps
+sub FocusProfile {
+  my $symbols = shift;
+  my $profile = shift;
+  my $focus = shift;
+  my $result = {};
+  foreach my $k (keys(%{$profile})) {
+    my $count = $profile->{$k};
+    my @addrs = split(/\n/, $k);
+    foreach my $a (@addrs) {
+      # Keep the stack if any frame matches the address/shortname/fileline
+      if (($a =~ m/$focus/) || SymbolMatches($symbols->{$a}, $focus)) {
+        AddEntry($result, $k, $count);
+        last;
+      }
+    }
+  }
+  return $result;
+}
+
+# Focus only on paths not involving specified regexps
+sub IgnoreProfile {
+  my $symbols = shift;
+  my $profile = shift;
+  my $ignore = shift;
+  my $result = {};
+  foreach my $k (keys(%{$profile})) {
+    my $count = $profile->{$k};
+    my @addrs = split(/\n/, $k);
+    my $matched = 0;
+    foreach my $a (@addrs) {
+      # Drop the stack if any frame matches the address/shortname/fileline
+      if (($a =~ m/$ignore/) || SymbolMatches($symbols->{$a}, $ignore)) {
+        $matched = 1;
+        last;
+      }
+    }
+    if (!$matched) {
+      AddEntry($result, $k, $count);
+    }
+  }
+  return $result;
+}
+
+# Get total count in profile
+sub TotalProfile {
+  my $profile = shift;
+  my $result = 0;
+  foreach my $k (keys(%{$profile})) {
+    $result += $profile->{$k};
+  }
+  return $result;
+}
+
+# Add A to B
+sub AddProfile {
+  my $A = shift;
+  my $B = shift;
+
+  my $R = {};
+  # add all keys in A
+  foreach my $k (keys(%{$A})) {
+    my $v = $A->{$k};
+    AddEntry($R, $k, $v);
+  }
+  # add all keys in B
+  foreach my $k (keys(%{$B})) {
+    my $v = $B->{$k};
+    AddEntry($R, $k, $v);
+  }
+  return $R;
+}
+
+# Merges symbol maps
+sub MergeSymbols {
+  my $A = shift;
+  my $B = shift;
+
+  my $R = {};
+  foreach my $k (keys(%{$A})) {
+    $R->{$k} = $A->{$k};
+  }
+  if (defined($B)) {
+    foreach my $k (keys(%{$B})) {
+      $R->{$k} = $B->{$k};
+    }
+  }
+  return $R;
+}
+
+
+# Union of two PC sets
+sub AddPcs {
+  my $A = shift;
+  my $B = shift;
+
+  my $R = {};
+  # add all keys in A
+  foreach my $k (keys(%{$A})) {
+    $R->{$k} = 1;
+  }
+  # add all keys in B
+  foreach my $k (keys(%{$B})) {
+    $R->{$k} = 1;
+  }
+  return $R;
+}
+
+# Subtract B from A
+sub SubtractProfile {
+  my $A = shift;
+  my $B = shift;
+
+  my $R = {};
+  foreach my $k (keys(%{$A})) {
+    my $v = $A->{$k} - GetEntry($B, $k);
+    if ($v < 0 && $main::opt_drop_negative) {
+      $v = 0;
+    }
+    AddEntry($R, $k, $v);
+  }
+  if (!$main::opt_drop_negative) {
+    # Take care of when subtracted profile has more entries
+    foreach my $k (keys(%{$B})) {
+      if (!exists($A->{$k})) {
+        AddEntry($R, $k, 0 - $B->{$k});
+      }
+    }
+  }
+  return $R;
+}
+
+# Get entry from profile; zero if not present
+sub GetEntry {
+  my $profile = shift;
+  my $k = shift;
+  if (exists($profile->{$k})) {
+    return $profile->{$k};
+  } else {
+    return 0;
+  }
+}
+
+# Add entry to specified profile
+sub AddEntry {
+  my $profile = shift;
+  my $k = shift;
+  my $n = shift;
+  if (!exists($profile->{$k})) {
+    $profile->{$k} = 0;
+  }
+  $profile->{$k} += $n;
+}
+
+# Add a stack of entries to specified profile, and add them to the $pcs
+# list.
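+# Illustrative call (hypothetical addresses, 64-bit $address_length):
+#   AddEntries($profile, $pcs, "4005d0 4007a8", 3);
+# hex-extends each PC, marks both in $pcs, and charges 3 to the stack
+# key "00000000004005d0\n00000000004007a8".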
+sub AddEntries {
+  my $profile = shift;
+  my $pcs = shift;
+  my $stack = shift;
+  my $count = shift;
+  my @k = ();
+
+  foreach my $e (split(/\s+/, $stack)) {
+    my $pc = HexExtend($e);
+    $pcs->{$pc} = 1;
+    push @k, $pc;
+  }
+  AddEntry($profile, (join "\n", @k), $count);
+}
+
+##### Code to profile a server dynamically #####
+
+sub CheckSymbolPage {
+  my $url = SymbolPageURL();
+  my $command = ShellEscape(@URL_FETCHER, $url);
+  open(SYMBOL, "$command |") or error($command);
+  my $line = <SYMBOL>;
+  $line =~ s/\r//g;         # turn windows-looking lines into unix-looking lines
+  close(SYMBOL);
+  unless (defined($line)) {
+    error("$url doesn't exist\n");
+  }
+
+  if ($line =~ /^num_symbols:\s+(\d+)$/) {
+    if ($1 == 0) {
+      error("Stripped binary. No symbols available.\n");
+    }
+  } else {
+    error("Failed to get the number of symbols from $url\n");
+  }
+}
+
+sub IsProfileURL {
+  my $profile_name = shift;
+  if (-f $profile_name) {
+    printf STDERR "Using local file $profile_name.\n";
+    return 0;
+  }
+  return 1;
+}
+
+sub ParseProfileURL {
+  my $profile_name = shift;
+
+  if (!defined($profile_name) || $profile_name eq "") {
+    return ();
+  }
+
+  # Split profile URL - matches all non-empty strings, so no test.
+  $profile_name =~ m,^(https?://)?([^/]+)(.*?)(/|$PROFILES)?$,;
+
+  my $proto = $1 || "http://";
+  my $hostport = $2;
+  my $prefix = $3;
+  my $profile = $4 || "/";
+
+  my $host = $hostport;
+  $host =~ s/:.*//;
+
+  my $baseurl = "$proto$hostport$prefix";
+  return ($host, $baseurl, $profile);
+}
+
+# We fetch symbols from the first profile argument.
+sub SymbolPageURL {
+  my ($host, $baseURL, $path) = ParseProfileURL($main::pfile_args[0]);
+  return "$baseURL$SYMBOL_PAGE";
+}
+
+sub FetchProgramName() {
+  my ($host, $baseURL, $path) = ParseProfileURL($main::pfile_args[0]);
+  my $url = "$baseURL$PROGRAM_NAME_PAGE";
+  my $command_line = ShellEscape(@URL_FETCHER, $url);
+  open(CMDLINE, "$command_line |") or error($command_line);
+  my $cmdline = <CMDLINE>;
+  $cmdline =~ s/\r//g;   # turn windows-looking lines into unix-looking lines
+  close(CMDLINE);
+  error("Failed to get program name from $url\n") unless defined($cmdline);
+  $cmdline =~ s/\x00.+//;  # Remove argv[1] and everything after it.
+  $cmdline =~ s!\n!!g;     # Remove LFs.
+  return $cmdline;
+}
+
+# Gee, curl's -L (--location) option isn't reliable at least
+# with its 7.12.3 version.  Curl will forget to post data if
+# there is a redirection.  This function is a workaround for
+# curl.  Redirection happens on borg hosts.
+sub ResolveRedirectionForCurl {
+  my $url = shift;
+  my $command_line = ShellEscape(@URL_FETCHER, "--head", $url);
+  open(CMDLINE, "$command_line |") or error($command_line);
+  while (<CMDLINE>) {
+    s/\r//g;   # turn windows-looking lines into unix-looking lines
+    if (/^Location: (.*)/) {
+      $url = $1;
+    }
+  }
+  close(CMDLINE);
+  return $url;
+}
+
+# Add a timeout flag to URL_FETCHER.  Returns a new list.
+sub AddFetchTimeout {
+  my $timeout = shift;
+  my @fetcher = @_;
+  if (defined($timeout)) {
+    if (join(" ", @fetcher) =~ m/\bcurl -s/) {
+      push(@fetcher, "--max-time", sprintf("%d", $timeout));
+    } elsif (join(" ", @fetcher) =~ m/\brpcget\b/) {
+      push(@fetcher, sprintf("--deadline=%d", $timeout));
+    }
+  }
+  return @fetcher;
+}
+
+# Reads a symbol map from the file handle name given as $1, returning
+# the resulting symbol map.  Also processes variables relating to symbols.
+# Currently, the only variable processed is 'binary=' which updates
+# $main::prog to have the correct program name.
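+# A hypothetical symbol section of the kind ReadSymbols accepts:
+#   binary=/usr/local/bin/myprog
+#   0x00000000004005d0 main
+#   0x00000000004007a8 helper--inlined_callee
+#   ---
+# This sets $main::prog to "/usr/local/bin/myprog" and returns
+#   { "4005d0" => "main", "4007a8" => "helper--inlined_callee" }
+# with leading zeroes stripped from the address keys.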
+sub ReadSymbols { + my $in = shift; + my $map = {}; + while (<$in>) { + s/\r//g; # turn windows-looking lines into unix-looking lines + # Removes all the leading zeroes from the symbols, see comment below. + if (m/^0x0*([0-9a-f]+)\s+(.+)/) { + $map->{$1} = $2; + } elsif (m/^---/) { + last; + } elsif (m/^([a-z][^=]*)=(.*)$/ ) { + my ($variable, $value) = ($1, $2); + for ($variable, $value) { + s/^\s+//; + s/\s+$//; + } + if ($variable eq "binary") { + if ($main::prog ne $UNKNOWN_BINARY && $main::prog ne $value) { + printf STDERR ("Warning: Mismatched binary name '%s', using '%s'.\n", + $main::prog, $value); + } + $main::prog = $value; + } else { + printf STDERR ("Ignoring unknown variable in symbols list: " . + "'%s' = '%s'\n", $variable, $value); + } + } + } + return $map; +} + +sub URLEncode { + my $str = shift; + $str =~ s/([^A-Za-z0-9\-_.!~*'()])/ sprintf "%%%02x", ord $1 /eg; + return $str; +} + +sub AppendSymbolFilterParams { + my $url = shift; + my @params = (); + if ($main::opt_retain ne '') { + push(@params, sprintf("retain=%s", URLEncode($main::opt_retain))); + } + if ($main::opt_exclude ne '') { + push(@params, sprintf("exclude=%s", URLEncode($main::opt_exclude))); + } + if (scalar @params > 0) { + $url = sprintf("%s?%s", $url, join("&", @params)); + } + return $url; +} + +# Fetches and processes symbols to prepare them for use in the profile output +# code. If the optional 'symbol_map' arg is not given, fetches symbols from +# $SYMBOL_PAGE for all PC values found in profile. Otherwise, the raw symbols +# are assumed to have already been fetched into 'symbol_map' and are simply +# extracted and processed. +sub FetchSymbols { + my $pcset = shift; + my $symbol_map = shift; + + my %seen = (); + my @pcs = grep { !$seen{$_}++ } keys(%$pcset); # uniq + + if (!defined($symbol_map)) { + my $post_data = join("+", sort((map {"0x" . "$_"} @pcs))); + + open(POSTFILE, ">$main::tmpfile_sym"); + print POSTFILE $post_data; + close(POSTFILE); + + my $url = SymbolPageURL(); + + my $command_line; + if (join(" ", @URL_FETCHER) =~ m/\bcurl -s/) { + $url = ResolveRedirectionForCurl($url); + $url = AppendSymbolFilterParams($url); + $command_line = ShellEscape(@URL_FETCHER, "-d", "\@$main::tmpfile_sym", + $url); + } else { + $url = AppendSymbolFilterParams($url); + $command_line = (ShellEscape(@URL_FETCHER, "--post", $url) + . " < " . ShellEscape($main::tmpfile_sym)); + } + # We use c++filt in case $SYMBOL_PAGE gives us mangled symbols. + my $escaped_cppfilt = ShellEscape($obj_tool_map{"c++filt"}); + open(SYMBOL, "$command_line | $escaped_cppfilt |") or error($command_line); + $symbol_map = ReadSymbols(*SYMBOL{IO}); + close(SYMBOL); + } + + my $symbols = {}; + foreach my $pc (@pcs) { + my $fullname; + # For 64 bits binaries, symbols are extracted with 8 leading zeroes. + # Then /symbol reads the long symbols in as uint64, and outputs + # the result with a "0x%08llx" format which get rid of the zeroes. + # By removing all the leading zeroes in both $pc and the symbols from + # /symbol, the symbols match and are retrievable from the map. + my $shortpc = $pc; + $shortpc =~ s/^0*//; + # Each line may have a list of names, which includes the function + # and also other functions it has inlined. They are separated (in + # PrintSymbolizedProfile), by --, which is illegal in function names. + my $fullnames; + if (defined($symbol_map->{$shortpc})) { + $fullnames = $symbol_map->{$shortpc}; + } else { + $fullnames = "0x" . 
$pc; # Just use addresses + } + my $sym = []; + $symbols->{$pc} = $sym; + foreach my $fullname (split("--", $fullnames)) { + my $name = ShortFunctionName($fullname); + push(@{$sym}, $name, "?", $fullname); + } + } + return $symbols; +} + +sub BaseName { + my $file_name = shift; + $file_name =~ s!^.*/!!; # Remove directory name + return $file_name; +} + +sub MakeProfileBaseName { + my ($binary_name, $profile_name) = @_; + my ($host, $baseURL, $path) = ParseProfileURL($profile_name); + my $binary_shortname = BaseName($binary_name); + return sprintf("%s.%s.%s", + $binary_shortname, $main::op_time, $host); +} + +sub FetchDynamicProfile { + my $binary_name = shift; + my $profile_name = shift; + my $fetch_name_only = shift; + my $encourage_patience = shift; + + if (!IsProfileURL($profile_name)) { + return $profile_name; + } else { + my ($host, $baseURL, $path) = ParseProfileURL($profile_name); + if ($path eq "" || $path eq "/") { + # Missing type specifier defaults to cpu-profile + $path = $PROFILE_PAGE; + } + + my $profile_file = MakeProfileBaseName($binary_name, $profile_name); + + my $url = "$baseURL$path"; + my $fetch_timeout = undef; + if ($path =~ m/$PROFILE_PAGE|$PMUPROFILE_PAGE/) { + if ($path =~ m/[?]/) { + $url .= "&"; + } else { + $url .= "?"; + } + $url .= sprintf("seconds=%d", $main::opt_seconds); + $fetch_timeout = $main::opt_seconds * 1.01 + 60; + # Set $profile_type for consumption by PrintSymbolizedProfile. + $main::profile_type = 'cpu'; + } else { + # For non-CPU profiles, we add a type-extension to + # the target profile file name. + my $suffix = $path; + $suffix =~ s,/,.,g; + $profile_file .= $suffix; + # Set $profile_type for consumption by PrintSymbolizedProfile. + if ($path =~ m/$HEAP_PAGE/) { + $main::profile_type = 'heap'; + } elsif ($path =~ m/$GROWTH_PAGE/) { + $main::profile_type = 'growth'; + } elsif ($path =~ m/$CONTENTION_PAGE/) { + $main::profile_type = 'contention'; + } + } + + my $profile_dir = $ENV{"JEPROF_TMPDIR"} || ($ENV{HOME} . "/jeprof"); + if (! -d $profile_dir) { + mkdir($profile_dir) + || die("Unable to create profile directory $profile_dir: $!\n"); + } + my $tmp_profile = "$profile_dir/.tmp.$profile_file"; + my $real_profile = "$profile_dir/$profile_file"; + + if ($fetch_name_only > 0) { + return $real_profile; + } + + my @fetcher = AddFetchTimeout($fetch_timeout, @URL_FETCHER); + my $cmd = ShellEscape(@fetcher, $url) . " > " . 
ShellEscape($tmp_profile); + if ($path =~ m/$PROFILE_PAGE|$PMUPROFILE_PAGE|$CENSUSPROFILE_PAGE/){ + print STDERR "Gathering CPU profile from $url for $main::opt_seconds seconds to\n ${real_profile}\n"; + if ($encourage_patience) { + print STDERR "Be patient...\n"; + } + } else { + print STDERR "Fetching $path profile from $url to\n ${real_profile}\n"; + } + + (system($cmd) == 0) || error("Failed to get profile: $cmd: $!\n"); + (system("mv", $tmp_profile, $real_profile) == 0) || error("Unable to rename profile\n"); + print STDERR "Wrote profile to $real_profile\n"; + $main::collected_profile = $real_profile; + return $main::collected_profile; + } +} + +# Collect profiles in parallel +sub FetchDynamicProfiles { + my $items = scalar(@main::pfile_args); + my $levels = log($items) / log(2); + + if ($items == 1) { + $main::profile_files[0] = FetchDynamicProfile($main::prog, $main::pfile_args[0], 0, 1); + } else { + # math rounding issues + if ((2 ** $levels) < $items) { + $levels++; + } + my $count = scalar(@main::pfile_args); + for (my $i = 0; $i < $count; $i++) { + $main::profile_files[$i] = FetchDynamicProfile($main::prog, $main::pfile_args[$i], 1, 0); + } + print STDERR "Fetching $count profiles, Be patient...\n"; + FetchDynamicProfilesRecurse($levels, 0, 0); + $main::collected_profile = join(" \\\n ", @main::profile_files); + } +} + +# Recursively fork a process to get enough processes +# collecting profiles +sub FetchDynamicProfilesRecurse { + my $maxlevel = shift; + my $level = shift; + my $position = shift; + + if (my $pid = fork()) { + $position = 0 | ($position << 1); + TryCollectProfile($maxlevel, $level, $position); + wait; + } else { + $position = 1 | ($position << 1); + TryCollectProfile($maxlevel, $level, $position); + cleanup(); + exit(0); + } +} + +# Collect a single profile +sub TryCollectProfile { + my $maxlevel = shift; + my $level = shift; + my $position = shift; + + if ($level >= ($maxlevel - 1)) { + if ($position < scalar(@main::pfile_args)) { + FetchDynamicProfile($main::prog, $main::pfile_args[$position], 0, 0); + } + } else { + FetchDynamicProfilesRecurse($maxlevel, $level+1, $position); + } +} + +##### Parsing code ##### + +# Provide a small streaming-read module to handle very large +# cpu-profile files. Stream in chunks along a sliding window. +# Provides an interface to get one 'slot', correctly handling +# endian-ness differences. A slot is one 32-bit or 64-bit word +# (depending on the input profile). We tell endianness and bit-size +# for the profile by looking at the first 8 bytes: in cpu profiles, +# the second slot is always 3 (we'll accept anything that's not 0). +BEGIN { + package CpuProfileStream; + + sub new { + my ($class, $file, $fname) = @_; + my $self = { file => $file, + base => 0, + stride => 512 * 1024, # must be a multiple of bitsize/8 + slots => [], + unpack_code => "", # N for big-endian, V for little + perl_is_64bit => 1, # matters if profile is 64-bit + }; + bless $self, $class; + # Let unittests adjust the stride + if ($main::opt_test_stride > 0) { + $self->{stride} = $main::opt_test_stride; + } + # Read the first two slots to figure out bitsize and endianness. + my $slots = $self->{slots}; + my $str; + read($self->{file}, $str, 8); + # Set the global $address_length based on what we see here. + # 8 is 32-bit (8 hexadecimal chars); 16 is 64-bit (16 hexadecimal chars). + $address_length = ($str eq (chr(0)x8)) ? 16 : 8; + if ($address_length == 8) { + if (substr($str, 6, 2) eq chr(0)x2) { + $self->{unpack_code} = 'V'; # Little-endian. 
+ } elsif (substr($str, 4, 2) eq chr(0)x2) { + $self->{unpack_code} = 'N'; # Big-endian + } else { + ::error("$fname: header size >= 2**16\n"); + } + @$slots = unpack($self->{unpack_code} . "*", $str); + } else { + # If we're a 64-bit profile, check if we're a 64-bit-capable + # perl. Otherwise, each slot will be represented as a float + # instead of an int64, losing precision and making all the + # 64-bit addresses wrong. We won't complain yet, but will + # later if we ever see a value that doesn't fit in 32 bits. + my $has_q = 0; + eval { $has_q = pack("Q", "1") ? 1 : 1; }; + if (!$has_q) { + $self->{perl_is_64bit} = 0; + } + read($self->{file}, $str, 8); + if (substr($str, 4, 4) eq chr(0)x4) { + # We'd love to use 'Q', but it's a) not universal, b) not endian-proof. + $self->{unpack_code} = 'V'; # Little-endian. + } elsif (substr($str, 0, 4) eq chr(0)x4) { + $self->{unpack_code} = 'N'; # Big-endian + } else { + ::error("$fname: header size >= 2**32\n"); + } + my @pair = unpack($self->{unpack_code} . "*", $str); + # Since we know one of the pair is 0, it's fine to just add them. + @$slots = (0, $pair[0] + $pair[1]); + } + return $self; + } + + # Load more data when we access slots->get(X) which is not yet in memory. + sub overflow { + my ($self) = @_; + my $slots = $self->{slots}; + $self->{base} += $#$slots + 1; # skip over data we're replacing + my $str; + read($self->{file}, $str, $self->{stride}); + if ($address_length == 8) { # the 32-bit case + # This is the easy case: unpack provides 32-bit unpacking primitives. + @$slots = unpack($self->{unpack_code} . "*", $str); + } else { + # We need to unpack 32 bits at a time and combine. + my @b32_values = unpack($self->{unpack_code} . "*", $str); + my @b64_values = (); + for (my $i = 0; $i < $#b32_values; $i += 2) { + # TODO(csilvers): if this is a 32-bit perl, the math below + # could end up in a too-large int, which perl will promote + # to a double, losing necessary precision. Deal with that. + # Right now, we just die. + my ($lo, $hi) = ($b32_values[$i], $b32_values[$i+1]); + if ($self->{unpack_code} eq 'N') { # big-endian + ($lo, $hi) = ($hi, $lo); + } + my $value = $lo + $hi * (2**32); + if (!$self->{perl_is_64bit} && # check value is exactly represented + (($value % (2**32)) != $lo || int($value / (2**32)) != $hi)) { + ::error("Need a 64-bit perl to process this 64-bit profile.\n"); + } + push(@b64_values, $value); + } + @$slots = @b64_values; + } + } + + # Access the i-th long in the file (logically), or -1 at EOF. + sub get { + my ($self, $idx) = @_; + my $slots = $self->{slots}; + while ($#$slots >= 0) { + if ($idx < $self->{base}) { + # The only time we expect a reference to $slots[$i - something] + # after referencing $slots[$i] is reading the very first header. + # Since $stride > |header|, that shouldn't cause any lookback + # errors. And everything after the header is sequential. + print STDERR "Unexpected look-back reading CPU profile"; + return -1; # shrug, don't know what better to return + } elsif ($idx > $self->{base} + $#$slots) { + $self->overflow(); + } else { + return $slots->[$idx - $self->{base}]; + } + } + # If we get here, $slots is [], which means we've reached EOF + return -1; # unique since slots is supposed to hold unsigned numbers + } +} + +# Reads the top, 'header' section of a profile, and returns the last +# line of the header, commonly called a 'header line'. 
+# The header section of a profile consists of zero or more 'command'
+# lines that are instructions to jeprof, which jeprof executes when
+# reading the header.  All 'command' lines start with a %.  After the
+# command lines is the 'header line', which is a profile-specific line
+# that indicates what type of profile it is, and perhaps other global
+# information about the profile.  For instance, here's a header line
+# for a heap profile:
+#   heap profile:     53:    38236 [  5525:  1284029] @ heapprofile
+# For historical reasons, the CPU profile does not contain a text-
+# readable header line.  If the profile looks like a CPU profile,
+# this function returns "".  If no header line could be found, this
+# function returns undef.
+#
+# The following commands are recognized:
+#   %warn -- emit the rest of this line to stderr, prefixed by 'WARNING:'
+#
+# The input file should be in binmode.
+sub ReadProfileHeader {
+  local *PROFILE = shift;
+  my $firstchar = "";
+  my $line = "";
+  read(PROFILE, $firstchar, 1);
+  seek(PROFILE, -1, 1);                    # unread the firstchar
+  if ($firstchar !~ /[[:print:]]/) {       # is not a text character
+    return "";
+  }
+  while (defined($line = <PROFILE>)) {
+    $line =~ s/\r//g;   # turn windows-looking lines into unix-looking lines
+    if ($line =~ /^%warn\s+(.*)/) {        # 'warn' command
+      # Note this matches both '%warn blah\n' and '%warn\n'.
+      print STDERR "WARNING: $1\n";        # print the rest of the line
+    } elsif ($line =~ /^%/) {
+      print STDERR "Ignoring unknown command from profile header: $line";
+    } else {
+      # End of commands, must be the header line.
+      return $line;
+    }
+  }
+  return undef;     # got to EOF without seeing a header line
+}
+
+sub IsSymbolizedProfileFile {
+  my $file_name = shift;
+  if (!(-e $file_name) || !(-r $file_name)) {
+    return 0;
+  }
+  # Check if the file contains a symbol-section marker.
+  open(TFILE, "<$file_name");
+  binmode TFILE;
+  my $firstline = ReadProfileHeader(*TFILE);
+  close(TFILE);
+  if (!$firstline) {
+    return 0;
+  }
+  $SYMBOL_PAGE =~ m,[^/]+$,;    # matches everything after the last slash
+  my $symbol_marker = $&;
+  return $firstline =~ /^--- *$symbol_marker/;
+}
+
+# Parse profile generated by common/profiler.cc and return a reference
+# to a map:
+#   $result->{version}   Version number of profile file
+#   $result->{period}    Sampling period (in microseconds)
+#   $result->{profile}   Profile object
+#   $result->{threads}   Map of thread IDs to profile objects
+#   $result->{map}       Memory map info from profile
+#   $result->{pcs}       Hash of all PC values seen, key is hex address
+sub ReadProfile {
+  my $prog = shift;
+  my $fname = shift;
+  my $result;            # return value
+
+  $CONTENTION_PAGE =~ m,[^/]+$,;    # matches everything after the last slash
+  my $contention_marker = $&;
+  $GROWTH_PAGE =~ m,[^/]+$,;    # matches everything after the last slash
+  my $growth_marker = $&;
+  $SYMBOL_PAGE =~ m,[^/]+$,;    # matches everything after the last slash
+  my $symbol_marker = $&;
+  $PROFILE_PAGE =~ m,[^/]+$,;    # matches everything after the last slash
+  my $profile_marker = $&;
+  $HEAP_PAGE =~ m,[^/]+$,;    # matches everything after the last slash
+  my $heap_marker = $&;
+
+  # Look at first line to see if it is a heap or a CPU profile.
+  # CPU profile may start with no header at all, and just binary data
+  # (starting with \0\0\0\0) -- in that case, don't try to read the
+  # whole firstline, since it may be gigabytes(!) of data.
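+  # For illustration (the counts are hypothetical): a growth profile
+  # starts with a header line such as
+  #   heap profile:     14:  1234567 [   14:  1234567] @ growth
+  # and is dispatched below via $growth_marker, whereas a raw CPU
+  # profile starts directly with binary slot data and falls through
+  # to ReadCPUProfile.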
+ open(PROFILE, "<$fname") || error("$fname: $!\n"); + binmode PROFILE; # New perls do UTF-8 processing + my $header = ReadProfileHeader(*PROFILE); + if (!defined($header)) { # means "at EOF" + error("Profile is empty.\n"); + } + + my $symbols; + if ($header =~ m/^--- *$symbol_marker/o) { + # Verify that the user asked for a symbolized profile + if (!$main::use_symbolized_profile) { + # we have both a binary and symbolized profiles, abort + error("FATAL ERROR: Symbolized profile\n $fname\ncannot be used with " . + "a binary arg. Try again without passing\n $prog\n"); + } + # Read the symbol section of the symbolized profile file. + $symbols = ReadSymbols(*PROFILE{IO}); + # Read the next line to get the header for the remaining profile. + $header = ReadProfileHeader(*PROFILE) || ""; + } + + if ($header =~ m/^--- *($heap_marker|$growth_marker)/o) { + # Skip "--- ..." line for profile types that have their own headers. + $header = ReadProfileHeader(*PROFILE) || ""; + } + + $main::profile_type = ''; + + if ($header =~ m/^heap profile:.*$growth_marker/o) { + $main::profile_type = 'growth'; + $result = ReadHeapProfile($prog, *PROFILE, $header); + } elsif ($header =~ m/^heap profile:/) { + $main::profile_type = 'heap'; + $result = ReadHeapProfile($prog, *PROFILE, $header); + } elsif ($header =~ m/^heap/) { + $main::profile_type = 'heap'; + $result = ReadThreadedHeapProfile($prog, $fname, $header); + } elsif ($header =~ m/^--- *$contention_marker/o) { + $main::profile_type = 'contention'; + $result = ReadSynchProfile($prog, *PROFILE); + } elsif ($header =~ m/^--- *Stacks:/) { + print STDERR + "Old format contention profile: mistakenly reports " . + "condition variable signals as lock contentions.\n"; + $main::profile_type = 'contention'; + $result = ReadSynchProfile($prog, *PROFILE); + } elsif ($header =~ m/^--- *$profile_marker/) { + # the binary cpu profile data starts immediately after this line + $main::profile_type = 'cpu'; + $result = ReadCPUProfile($prog, $fname, *PROFILE); + } else { + if (defined($symbols)) { + # a symbolized profile contains a format we don't recognize, bail out + error("$fname: Cannot recognize profile section after symbols.\n"); + } + # no ascii header present -- must be a CPU profile + $main::profile_type = 'cpu'; + $result = ReadCPUProfile($prog, $fname, *PROFILE); + } + + close(PROFILE); + + # if we got symbols along with the profile, return those as well + if (defined($symbols)) { + $result->{symbols} = $symbols; + } + + return $result; +} + +# Subtract one from caller pc so we map back to call instr. +# However, don't do this if we're reading a symbolized profile +# file, in which case the subtract-one was done when the file +# was written. +# +# We apply the same logic to all readers, though ReadCPUProfile uses an +# independent implementation. +sub FixCallerAddresses { + my $stack = shift; + # --raw/http: Always subtract one from pc's, because PrintSymbolizedProfile() + # dumps unadjusted profiles. + { + $stack =~ /(\s)/; + my $delimiter = $1; + my @addrs = split(' ', $stack); + my @fixedaddrs; + $#fixedaddrs = $#addrs; + if ($#addrs >= 0) { + $fixedaddrs[0] = $addrs[0]; + } + for (my $i = 1; $i <= $#addrs; $i++) { + $fixedaddrs[$i] = AddressSub($addrs[$i], "0x1"); + } + return join $delimiter, @fixedaddrs; + } +} + +# CPU profile reader +sub ReadCPUProfile { + my $prog = shift; + my $fname = shift; # just used for logging + local *PROFILE = shift; + my $version; + my $period; + my $i; + my $profile = {}; + my $pcs = {}; + + # Parse string into array of slots. 
+  my $slots = CpuProfileStream->new(*PROFILE, $fname);
+
+  # Read header.  The current header version is a 5-element structure
+  # containing:
+  #   0: header count (always 0)
+  #   1: header "words" (after this one: 3)
+  #   2: format version (0)
+  #   3: sampling period (usec)
+  #   4: unused padding (always 0)
+  if ($slots->get(0) != 0 ) {
+    error("$fname: not a profile file, or old format profile file\n");
+  }
+  $i = 2 + $slots->get(1);
+  $version = $slots->get(2);
+  $period = $slots->get(3);
+  # Do some sanity checking on these header values.
+  if ($version > (2**32) || $period > (2**32) || $i > (2**32) || $i < 5) {
+    error("$fname: not a profile file, or corrupted profile file\n");
+  }
+
+  # Parse profile
+  while ($slots->get($i) != -1) {
+    my $n = $slots->get($i++);
+    my $d = $slots->get($i++);
+    if ($d > (2**16)) {  # TODO(csilvers): what's a reasonable max-stack-depth?
+      my $addr = sprintf("0%o", $i * ($address_length == 8 ? 4 : 8));
+      print STDERR "At index $i (address $addr):\n";
+      error("$fname: stack trace depth >= 2**32\n");
+    }
+    if ($slots->get($i) == 0) {
+      # End of profile data marker
+      $i += $d;
+      last;
+    }
+
+    # Make key out of the stack entries
+    my @k = ();
+    for (my $j = 0; $j < $d; $j++) {
+      my $pc = $slots->get($i+$j);
+      # Subtract one from caller pc so we map back to call instr.
+      $pc--;
+      $pc = sprintf("%0*x", $address_length, $pc);
+      $pcs->{$pc} = 1;
+      push @k, $pc;
+    }
+
+    AddEntry($profile, (join "\n", @k), $n);
+    $i += $d;
+  }
+
+  # Parse map
+  my $map = '';
+  seek(PROFILE, $i * 4, 0);
+  read(PROFILE, $map, (stat PROFILE)[7]);
+
+  my $r = {};
+  $r->{version} = $version;
+  $r->{period} = $period;
+  $r->{profile} = $profile;
+  $r->{libs} = ParseLibraries($prog, $map, $pcs);
+  $r->{pcs} = $pcs;
+
+  return $r;
+}
+
+sub HeapProfileIndex {
+  my $index = 1;
+  if ($main::opt_inuse_space) {
+    $index = 1;
+  } elsif ($main::opt_inuse_objects) {
+    $index = 0;
+  } elsif ($main::opt_alloc_space) {
+    $index = 3;
+  } elsif ($main::opt_alloc_objects) {
+    $index = 2;
+  }
+  return $index;
+}
+
+sub ReadMappedLibraries {
+  my $fh = shift;
+  my $map = "";
+  # Read the /proc/self/maps data
+  while (<$fh>) {
+    s/\r//g;   # turn windows-looking lines into unix-looking lines
+    $map .= $_;
+  }
+  return $map;
+}
+
+sub ReadMemoryMap {
+  my $fh = shift;
+  my $map = "";
+  # Read /proc/self/maps data as formatted by DumpAddressMap()
+  my $buildvar = "";
+  while (<PROFILE>) {
+    s/\r//g;   # turn windows-looking lines into unix-looking lines
+    # Parse "build=<dir>" specification if supplied
+    if (m/^\s*build=(.*)\n/) {
+      $buildvar = $1;
+    }
+
+    # Expand "$build" variable if available
+    $_ =~ s/\$build\b/$buildvar/g;
+
+    $map .= $_;
+  }
+  return $map;
+}
+
+sub AdjustSamples {
+  my ($sample_adjustment, $sampling_algorithm, $n1, $s1, $n2, $s2) = @_;
+  if ($sample_adjustment) {
+    if ($sampling_algorithm == 2) {
+      # Remote-heap version 2
+      # The sampling frequency is the rate of a Poisson process.
+      # This means that the probability of sampling an allocation of
+      # size X with sampling rate Y is 1 - exp(-X/Y)
+      if ($n1 != 0) {
+        my $ratio = (($s1*1.0)/$n1)/($sample_adjustment);
+        my $scale_factor = 1/(1 - exp(-$ratio));
+        $n1 *= $scale_factor;
+        $s1 *= $scale_factor;
+      }
+      if ($n2 != 0) {
+        my $ratio = (($s2*1.0)/$n2)/($sample_adjustment);
+        my $scale_factor = 1/(1 - exp(-$ratio));
+        $n2 *= $scale_factor;
+        $s2 *= $scale_factor;
+      }
+    } else {
+      # Remote-heap version 1
+      my $ratio;
+      $ratio = (($s1*1.0)/$n1)/($sample_adjustment);
+      if ($ratio < 1) {
+        $n1 /= $ratio;
+        $s1 /= $ratio;
+      }
+      $ratio = (($s2*1.0)/$n2)/($sample_adjustment);
+      if ($ratio < 1) {
+        $n2 /= $ratio;
+        $s2 /= $ratio;
+      }
+    }
+  }
+  return ($n1, $s1, $n2, $s2);
+}
+
+sub ReadHeapProfile {
+  my $prog = shift;
+  local *PROFILE = shift;
+  my $header = shift;
+
+  my $index = HeapProfileIndex();
+
+  # Find the type of this profile.  The header line looks like:
+  #   heap profile:   1246:  8800744 [  1246:  8800744] @ <heap-url>/266053
+  # There are two pairs <count: size>, the first inuse objects/space, and the
+  # second allocated objects/space.  This is followed optionally by a profile
+  # type, and if that is present, optionally by a sampling frequency.
+  # For remote heap profiles (v1):
+  # The interpretation of the sampling frequency is that the profiler, for
+  # each sample, calculates a uniformly distributed random integer less than
+  # the given value, and records the next sample after that many bytes have
+  # been allocated.  Therefore, the expected sample interval is half of the
+  # given frequency.  By default, if not specified, the expected sample
+  # interval is 128KB.  Only remote-heap-page profiles are adjusted for
+  # sample size.
+  # For remote heap profiles (v2):
+  # The sampling frequency is the rate of a Poisson process.  This means that
+  # the probability of sampling an allocation of size X with sampling rate Y
+  # is 1 - exp(-X/Y)
+  # For version 2, a typical header line might look like this:
+  #   heap profile:   1922: 127792360 [  1922: 127792360] @ <heap-url>_v2/524288
+  # the trailing number (524288) is the sampling rate. (Version 1 showed
+  # double the 'rate' here)
+  my $sampling_algorithm = 0;
+  my $sample_adjustment = 0;
+  chomp($header);
+  my $type = "unknown";
+  if ($header =~ m"^heap profile:\s*(\d+):\s+(\d+)\s+\[\s*(\d+):\s+(\d+)\](\s*@\s*([^/]*)(/(\d+))?)?") {
+    if (defined($6) && ($6 ne '')) {
+      $type = $6;
+      my $sample_period = $8;
+      # $type is "heapprofile" for profiles generated by the
+      # heap-profiler, and either "heap" or "heap_v2" for profiles
+      # generated by sampling directly within tcmalloc.  It can also
+      # be "growth" for heap-growth profiles.  The first is typically
+      # found for profiles generated locally, and the others for
+      # remote profiles.
+      if (($type eq "heapprofile") || ($type !~ /heap/) ) {
+        # No need to adjust for the sampling rate with heap-profiler-derived data
+        $sampling_algorithm = 0;
+      } elsif ($type =~ /_v2/) {
+        $sampling_algorithm = 2;     # version 2 sampling
+        if (defined($sample_period) && ($sample_period ne '')) {
+          $sample_adjustment = int($sample_period);
+        }
+      } else {
+        $sampling_algorithm = 1;     # version 1 sampling
+        if (defined($sample_period) && ($sample_period ne '')) {
+          $sample_adjustment = int($sample_period)/2;
+        }
+      }
+    } else {
+      # We detect whether or not this is a remote-heap profile by checking
+      # that the total-allocated stats ($n2,$s2) are exactly the
+      # same as the in-use stats ($n1,$s1).  It is remotely conceivable
+      # that a non-remote-heap profile may pass this check, but it is hard
+      # to imagine how that could happen.
+      # In this case it's so old it's guaranteed to be remote-heap version 1.
+      my ($n1, $s1, $n2, $s2) = ($1, $2, $3, $4);
+      if (($n1 == $n2) && ($s1 == $s2)) {
+        # This is likely to be a remote-heap based sample profile
+        $sampling_algorithm = 1;
+      }
+    }
+  }
+
+  if ($sampling_algorithm > 0) {
+    # For remote-heap generated profiles, adjust the counts and sizes to
+    # account for the sample rate (we sample once every 128KB by default).
+    if ($sample_adjustment == 0) {
+      # Turn on profile adjustment.
+      $sample_adjustment = 128*1024;
+      print STDERR "Adjusting heap profiles for 1-in-128KB sampling rate\n";
+    } else {
+      printf STDERR ("Adjusting heap profiles for 1-in-%d sampling rate\n",
+                     $sample_adjustment);
+    }
+    if ($sampling_algorithm > 1) {
+      # We don't bother printing anything for the original version (version 1)
+      printf STDERR "Heap version $sampling_algorithm\n";
+    }
+  }
+
+  my $profile = {};
+  my $pcs = {};
+  my $map = "";
+
+  while (<PROFILE>) {
+    s/\r//g;         # turn windows-looking lines into unix-looking lines
+    if (/^MAPPED_LIBRARIES:/) {
+      $map .= ReadMappedLibraries(*PROFILE);
+      last;
+    }
+
+    if (/^--- Memory map:/) {
+      $map .= ReadMemoryMap(*PROFILE);
+      last;
+    }
+
+    # Read entry of the form:
+    #   <count1>: <bytes1> [<count2>: <bytes2>] @ a1 a2 a3 ... an
+    s/^\s*//;
+    s/\s*$//;
+    if (m/^\s*(\d+):\s+(\d+)\s+\[\s*(\d+):\s+(\d+)\]\s+@\s+(.*)$/) {
+      my $stack = $5;
+      my ($n1, $s1, $n2, $s2) = ($1, $2, $3, $4);
+      my @counts = AdjustSamples($sample_adjustment, $sampling_algorithm,
+                                 $n1, $s1, $n2, $s2);
+      AddEntries($profile, $pcs, FixCallerAddresses($stack), $counts[$index]);
+    }
+  }
+
+  my $r = {};
+  $r->{version} = "heap";
+  $r->{period} = 1;
+  $r->{profile} = $profile;
+  $r->{libs} = ParseLibraries($prog, $map, $pcs);
+  $r->{pcs} = $pcs;
+  return $r;
+}
+
+sub ReadThreadedHeapProfile {
+  my ($prog, $fname, $header) = @_;
+
+  my $index = HeapProfileIndex();
+  my $sampling_algorithm = 0;
+  my $sample_adjustment = 0;
+  chomp($header);
+  my $type = "unknown";
+  # Assuming a very specific type of header for now.
+  if ($header =~ m"^heap_v2/(\d+)") {
+    $type = "_v2";
+    $sampling_algorithm = 2;
+    $sample_adjustment = int($1);
+  }
+  if ($type ne "_v2" || !defined($sample_adjustment)) {
+    die "Threaded heap profiles require v2 sampling with a sample rate\n";
+  }
+
+  my $profile = {};
+  my $thread_profiles = {};
+  my $pcs = {};
+  my $map = "";
+  my $stack = "";
+
+  while (<PROFILE>) {
+    s/\r//g;
+    if (/^MAPPED_LIBRARIES:/) {
+      $map .= ReadMappedLibraries(*PROFILE);
+      last;
+    }
+
+    if (/^--- Memory map:/) {
+      $map .= ReadMemoryMap(*PROFILE);
+      last;
+    }
+
+    # Read entry of the form:
+    # @ a1 a2 ... an
+    #   t*: <count1>: <bytes1> [<count2>: <bytes2>]
+    #   t1: <count1>: <bytes1> [<count2>: <bytes2>]
+    #   ...
+    #   tn: <count1>: <bytes1> [<count2>: <bytes2>]
+    s/^\s*//;
+    s/\s*$//;
+    if (m/^@\s+(.*)$/) {
+      $stack = $1;
+    } elsif (m/^\s*(t(\*|\d+)):\s+(\d+):\s+(\d+)\s+\[\s*(\d+):\s+(\d+)\]$/) {
+      if ($stack eq "") {
+        # Still in the header, so this is just a per-thread summary.
+        next;
+      }
+      my $thread = $2;
+      my ($n1, $s1, $n2, $s2) = ($3, $4, $5, $6);
+      my @counts = AdjustSamples($sample_adjustment, $sampling_algorithm,
+                                 $n1, $s1, $n2, $s2);
+      if ($thread eq "*") {
+        AddEntries($profile, $pcs, FixCallerAddresses($stack), $counts[$index]);
+      } else {
+        if (!exists($thread_profiles->{$thread})) {
+          $thread_profiles->{$thread} = {};
+        }
+        AddEntries($thread_profiles->{$thread}, $pcs,
+                   FixCallerAddresses($stack), $counts[$index]);
+      }
+    }
+  }
+
+  my $r = {};
+  $r->{version} = "heap";
+  $r->{period} = 1;
+  $r->{profile} = $profile;
+  $r->{threads} = $thread_profiles;
+  $r->{libs} = ParseLibraries($prog, $map, $pcs);
+  $r->{pcs} = $pcs;
+  return $r;
+}
+
+sub ReadSynchProfile {
+  my $prog = shift;
+  local *PROFILE = shift;
+  my $header = shift;
+
+  my $map = '';
+  my $profile = {};
+  my $pcs = {};
+  my $sampling_period = 1;
+  my $cyclespernanosec = 2.8;   # Default assumption for old binaries
+  my $seen_clockrate = 0;
+  my $line;
+
+  my $index = 0;
+  if ($main::opt_total_delay) {
+    $index = 0;
+  } elsif ($main::opt_contentions) {
+    $index = 1;
+  } elsif ($main::opt_mean_delay) {
+    $index = 2;
+  }
+
+  while ( $line = <PROFILE> ) {
+    $line =~ s/\r//g;      # turn windows-looking lines into unix-looking lines
+    if ( $line =~ /^\s*(\d+)\s+(\d+) \@\s*(.*?)\s*$/ ) {
+      my ($cycles, $count, $stack) = ($1, $2, $3);
+
+      # Convert cycles to nanoseconds
+      $cycles /= $cyclespernanosec;
+
+      # Adjust for sampling done by application
+      $cycles *= $sampling_period;
+      $count *= $sampling_period;
+
+      my @values = ($cycles, $count, $cycles / $count);
+      AddEntries($profile, $pcs, FixCallerAddresses($stack), $values[$index]);
+
+    } elsif ( $line =~ /^(slow release).*thread \d+ \@\s*(.*?)\s*$/ ||
+              $line =~ /^\s*(\d+) \@\s*(.*?)\s*$/ ) {
+      my ($cycles, $stack) = ($1, $2);
+      if ($cycles !~ /^\d+$/) {
+        next;
+      }
+
+      # Convert cycles to nanoseconds
+      $cycles /= $cyclespernanosec;
+
+      # Adjust for sampling done by application
+      $cycles *= $sampling_period;
+
+      AddEntries($profile, $pcs, FixCallerAddresses($stack), $cycles);
+
+    } elsif ( $line =~ m/^([a-z][^=]*)=(.*)$/ ) {
+      my ($variable, $value) = ($1,$2);
+      for ($variable, $value) {
+        s/^\s+//;
+        s/\s+$//;
+      }
+      if ($variable eq "cycles/second") {
+        $cyclespernanosec = $value / 1e9;
+        $seen_clockrate = 1;
+      } elsif ($variable eq "sampling period") {
+        $sampling_period = $value;
+      } elsif ($variable eq "ms since reset") {
+        # Currently nothing is done with this value in jeprof
+        # So we just silently ignore it for now
+      } elsif ($variable eq "discarded samples") {
+        # Currently nothing is done with this value in jeprof
+        # So we just silently ignore it for now
+      } else {
+        printf STDERR ("Ignoring unknown variable in /contention output: " .
+                       "'%s' = '%s'\n", $variable, $value);
+      }
+    } else {
+      # Memory map entry
+      $map .= $line;
+    }
+  }
+
+  if (!$seen_clockrate) {
+    printf STDERR ("No cycles/second entry in profile; Guessing %.1f GHz\n",
+                   $cyclespernanosec);
+  }
+
+  my $r = {};
+  $r->{version} = 0;
+  $r->{period} = $sampling_period;
+  $r->{profile} = $profile;
+  $r->{libs} = ParseLibraries($prog, $map, $pcs);
+  $r->{pcs} = $pcs;
+  return $r;
+}
+
+# Given a hex value in the form "0x1abcd" or "1abcd", return either
+# "0001abcd" or "000000000001abcd", depending on the current (global)
+# address length.
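+# The padding matters because addresses are compared as strings
+# elsewhere (e.g. "HexExtend($pc) gt $max_pc" in ParseLibraries);
+# equal-width hex strings make lexicographic order agree with
+# numeric order.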
+sub HexExtend { + my $addr = shift; + + $addr =~ s/^(0x)?0*//; + my $zeros_needed = $address_length - length($addr); + if ($zeros_needed < 0) { + printf STDERR "Warning: address $addr is longer than address length $address_length\n"; + return $addr; + } + return ("0" x $zeros_needed) . $addr; +} + +##### Symbol extraction ##### + +# Aggressively search the lib_prefix values for the given library +# If all else fails, just return the name of the library unmodified. +# If the lib_prefix is "/my/path,/other/path" and $file is "/lib/dir/mylib.so" +# it will search the following locations in this order, until it finds a file: +# /my/path/lib/dir/mylib.so +# /other/path/lib/dir/mylib.so +# /my/path/dir/mylib.so +# /other/path/dir/mylib.so +# /my/path/mylib.so +# /other/path/mylib.so +# /lib/dir/mylib.so (returned as last resort) +sub FindLibrary { + my $file = shift; + my $suffix = $file; + + # Search for the library as described above + do { + foreach my $prefix (@prefix_list) { + my $fullpath = $prefix . $suffix; + if (-e $fullpath) { + return $fullpath; + } + } + } while ($suffix =~ s|^/[^/]+/|/|); + return $file; +} + +# Return path to library with debugging symbols. +# For libc libraries, the copy in /usr/lib/debug contains debugging symbols +sub DebuggingLibrary { + my $file = shift; + + if ($file !~ m|^/|) { + return undef; + } + + # Find debug symbol file if it's named after the library's name. + + if (-f "/usr/lib/debug$file") { + if($main::opt_debug) { print STDERR "found debug info for $file in /usr/lib/debug$file\n"; } + return "/usr/lib/debug$file"; + } elsif (-f "/usr/lib/debug$file.debug") { + if($main::opt_debug) { print STDERR "found debug info for $file in /usr/lib/debug$file.debug\n"; } + return "/usr/lib/debug$file.debug"; + } + + if(!$main::opt_debug_syms_by_id) { + if($main::opt_debug) { print STDERR "no debug symbols found for $file\n" }; + return undef; + } + + # Find debug file if it's named after the library's build ID. + + my $readelf = ''; + if (!$main::gave_up_on_elfutils) { + $readelf = qx/eu-readelf -n ${file}/; + if ($?) { + print STDERR "Cannot run eu-readelf. To use --debug-syms-by-id you must be on Linux, with elfutils installed.\n"; + $main::gave_up_on_elfutils = 1; + return undef; + } + my $buildID = $1 if $readelf =~ /Build ID: ([A-Fa-f0-9]+)/s; + if (defined $buildID && length $buildID > 0) { + my $symbolFile = '/usr/lib/debug/.build-id/' . substr($buildID, 0, 2) . '/' . substr($buildID, 2) . '.debug'; + if (-e $symbolFile) { + if($main::opt_debug) { print STDERR "found debug symbol file $symbolFile for $file\n" }; + return $symbolFile; + } else { + if($main::opt_debug) { print STDERR "no debug symbol file found for $file, build ID: $buildID\n" }; + return undef; + } + } + } + + if($main::opt_debug) { print STDERR "no debug symbols found for $file, build ID unknown\n" }; + return undef; +} + + +# Parse text section header of a library using objdump +sub ParseTextSectionHeaderFromObjdump { + my $lib = shift; + + my $size = undef; + my $vma; + my $file_offset; + # Get objdump output from the library file to figure out how to + # map between mapped addresses and addresses in the library. 
+  my $cmd = ShellEscape($obj_tool_map{"objdump"}, "-h", $lib);
+  open(OBJDUMP, "$cmd |") || error("$cmd: $!\n");
+  while (<OBJDUMP>) {
+    s/\r//g;         # turn windows-looking lines into unix-looking lines
+    # Idx Name          Size      VMA       LMA       File off  Algn
+    #  10 .text         00104b2c  420156f0  420156f0  000156f0  2**4
+    # For 64-bit objects, VMA and LMA will be 16 hex digits, size and file
+    # offset may still be 8.  But AddressSub below will still handle that.
+    my @x = split;
+    if (($#x >= 6) && ($x[1] eq '.text')) {
+      $size = $x[2];
+      $vma = $x[3];
+      $file_offset = $x[5];
+      last;
+    }
+  }
+  close(OBJDUMP);
+
+  if (!defined($size)) {
+    return undef;
+  }
+
+  my $r = {};
+  $r->{size} = $size;
+  $r->{vma} = $vma;
+  $r->{file_offset} = $file_offset;
+
+  return $r;
+}
+
+# Parse text section header of a library using otool (on OS X)
+sub ParseTextSectionHeaderFromOtool {
+  my $lib = shift;
+
+  my $size = undef;
+  my $vma = undef;
+  my $file_offset = undef;
+  # Get otool output from the library file to figure out how to
+  # map between mapped addresses and addresses in the library.
+  my $command = ShellEscape($obj_tool_map{"otool"}, "-l", $lib);
+  open(OTOOL, "$command |") || error("$command: $!\n");
+  my $cmd = "";
+  my $sectname = "";
+  my $segname = "";
+  foreach my $line (<OTOOL>) {
+    $line =~ s/\r//g;      # turn windows-looking lines into unix-looking lines
+    # Load command <#>
+    #  cmd LC_SEGMENT
+    # [...]
+    # Section
+    #  sectname __text
+    #  segname __TEXT
+    #  addr 0x000009f8
+    #  size 0x00018b9e
+    #  offset 2552
+    #  align 2^2 (4)
+    # We will need to strip off the leading 0x from the hex addresses,
+    # and convert the offset into hex.
+    if ($line =~ /Load command/) {
+      $cmd = "";
+      $sectname = "";
+      $segname = "";
+    } elsif ($line =~ /Section/) {
+      $sectname = "";
+      $segname = "";
+    } elsif ($line =~ /cmd (\w+)/) {
+      $cmd = $1;
+    } elsif ($line =~ /sectname (\w+)/) {
+      $sectname = $1;
+    } elsif ($line =~ /segname (\w+)/) {
+      $segname = $1;
+    } elsif (!(($cmd eq "LC_SEGMENT" || $cmd eq "LC_SEGMENT_64") &&
+               $sectname eq "__text" &&
+               $segname eq "__TEXT")) {
+      next;
+    } elsif ($line =~ /\baddr 0x([0-9a-fA-F]+)/) {
+      $vma = $1;
+    } elsif ($line =~ /\bsize 0x([0-9a-fA-F]+)/) {
+      $size = $1;
+    } elsif ($line =~ /\boffset ([0-9]+)/) {
+      $file_offset = sprintf("%016x", $1);
+    }
+    if (defined($vma) && defined($size) && defined($file_offset)) {
+      last;
+    }
+  }
+  close(OTOOL);
+
+  if (!defined($vma) || !defined($size) || !defined($file_offset)) {
+    return undef;
+  }
+
+  my $r = {};
+  $r->{size} = $size;
+  $r->{vma} = $vma;
+  $r->{file_offset} = $file_offset;
+
+  return $r;
+}
+
+sub ParseTextSectionHeader {
+  # obj_tool_map("otool") is only defined if we're in a Mach-O environment
+  if (defined($obj_tool_map{"otool"})) {
+    my $r = ParseTextSectionHeaderFromOtool(@_);
+    if (defined($r)){
+      return $r;
+    }
+  }
+  # If otool doesn't work, or we don't have it, fall back to objdump
+  return ParseTextSectionHeaderFromObjdump(@_);
+}
+
+# Split /proc/pid/maps dump into a list of libraries
+sub ParseLibraries {
+  return if $main::use_symbol_page;  # We don't need libraries info.
+  my $prog = Cwd::abs_path(shift);
+  my $map = shift;
+  my $pcs = shift;
+
+  my $result = [];
+  my $h = "[a-f0-9]+";
+  my $zero_offset = HexExtend("0");
+
+  my $buildvar = "";
+  foreach my $l (split("\n", $map)) {
+    if ($l =~ m/^\s*build=(.*)$/) {
+      $buildvar = $1;
+    }
+
+    my $start;
+    my $finish;
+    my $offset;
+    my $lib;
+    if ($l =~ /^($h)-($h)\s+..x.\s+($h)\s+\S+:\S+\s+\d+\s+(\S+\.(so|dll|dylib|bundle)((\.\d+)+\w*(\.\d+){0,3})?)$/i) {
+      # Full line from /proc/self/maps.
Example: + # 40000000-40015000 r-xp 00000000 03:01 12845071 /lib/ld-2.3.2.so + $start = HexExtend($1); + $finish = HexExtend($2); + $offset = HexExtend($3); + $lib = $4; + $lib =~ s|\\|/|g; # turn windows-style paths into unix-style paths + } elsif ($l =~ /^\s*($h)-($h):\s*(\S+\.so(\.\d+)*)/) { + # Cooked line from DumpAddressMap. Example: + # 40000000-40015000: /lib/ld-2.3.2.so + $start = HexExtend($1); + $finish = HexExtend($2); + $offset = $zero_offset; + $lib = $3; + } elsif (($l =~ /^($h)-($h)\s+..x.\s+($h)\s+\S+:\S+\s+\d+\s+(\S+)$/i) && ($4 eq $prog)) { + # PIEs and address space randomization do not play well with our + # default assumption that main executable is at lowest + # addresses. So we're detecting main executable in + # /proc/self/maps as well. + $start = HexExtend($1); + $finish = HexExtend($2); + $offset = HexExtend($3); + $lib = $4; + $lib =~ s|\\|/|g; # turn windows-style paths into unix-style paths + } + # FreeBSD 10.0 virtual memory map /proc/curproc/map as defined in + # function procfs_doprocmap (sys/fs/procfs/procfs_map.c) + # + # Example: + # 0x800600000 0x80061a000 26 0 0xfffff800035a0000 r-x 75 33 0x1004 COW NC vnode /libexec/ld-elf.s + # o.1 NCH -1 + elsif ($l =~ /^(0x$h)\s(0x$h)\s\d+\s\d+\s0x$h\sr-x\s\d+\s\d+\s0x\d+\s(COW|NCO)\s(NC|NNC)\svnode\s(\S+\.so(\.\d+)*)/) { + $start = HexExtend($1); + $finish = HexExtend($2); + $offset = $zero_offset; + $lib = FindLibrary($5); + + } else { + next; + } + + # Expand "$build" variable if available + $lib =~ s/\$build\b/$buildvar/g; + + $lib = FindLibrary($lib); + + # Check for pre-relocated libraries, which use pre-relocated symbol tables + # and thus require adjusting the offset that we'll use to translate + # VM addresses into symbol table addresses. + # Only do this if we're not going to fetch the symbol table from a + # debugging copy of the library. + if (!DebuggingLibrary($lib)) { + my $text = ParseTextSectionHeader($lib); + if (defined($text)) { + my $vma_offset = AddressSub($text->{vma}, $text->{file_offset}); + $offset = AddressAdd($offset, $vma_offset); + } + } + + if($main::opt_debug) { printf STDERR "$start:$finish ($offset) $lib\n"; } + push(@{$result}, [$lib, $start, $finish, $offset]); + } + + # Append special entry for additional library (not relocated) + if ($main::opt_lib ne "") { + my $text = ParseTextSectionHeader($main::opt_lib); + if (defined($text)) { + my $start = $text->{vma}; + my $finish = AddressAdd($start, $text->{size}); + + push(@{$result}, [$main::opt_lib, $start, $finish, $start]); + } + } + + # Append special entry for the main program. This covers + # 0..max_pc_value_seen, so that we assume pc values not found in one + # of the library ranges will be treated as coming from the main + # program binary. + my $min_pc = HexExtend("0"); + my $max_pc = $min_pc; # find the maximal PC value in any sample + foreach my $pc (keys(%{$pcs})) { + if (HexExtend($pc) gt $max_pc) { $max_pc = HexExtend($pc); } + } + push(@{$result}, [$prog, $min_pc, $max_pc, $zero_offset]); + + return $result; +} + +# Add two hex addresses of length $address_length. +# Run jeprof --test for unit test if this is changed. +sub AddressAdd { + my $addr1 = shift; + my $addr2 = shift; + my $sum; + + if ($address_length == 8) { + # Perl doesn't cope with wraparound arithmetic, so do it explicitly: + $sum = (hex($addr1)+hex($addr2)) % (0x10000000 * 16); + return sprintf("%08x", $sum); + + } else { + # Do the addition in 7-nibble chunks to trivialize carry handling. 
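+    # Worked example (illustrative): with $address_length == 16, adding
+    # "000000000fffffff" and "0000000000000001" makes the low 7-nibble
+    # chunks sum to 0x10000000, which overflows a chunk, so a carry of 1
+    # propagates into the middle chunk and the result is
+    # "0000000010000000".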
+
+    if ($main::opt_debug and $main::opt_test) {
+      print STDERR "AddressAdd $addr1 + $addr2 = ";
+    }
+
+    my $a1 = substr($addr1,-7);
+    $addr1 = substr($addr1,0,-7);
+    my $a2 = substr($addr2,-7);
+    $addr2 = substr($addr2,0,-7);
+    $sum = hex($a1) + hex($a2);
+    my $c = 0;
+    if ($sum > 0xfffffff) {
+      $c = 1;
+      $sum -= 0x10000000;
+    }
+    my $r = sprintf("%07x", $sum);
+
+    $a1 = substr($addr1,-7);
+    $addr1 = substr($addr1,0,-7);
+    $a2 = substr($addr2,-7);
+    $addr2 = substr($addr2,0,-7);
+    $sum = hex($a1) + hex($a2) + $c;
+    $c = 0;
+    if ($sum > 0xfffffff) {
+      $c = 1;
+      $sum -= 0x10000000;
+    }
+    $r = sprintf("%07x", $sum) . $r;
+
+    $sum = hex($addr1) + hex($addr2) + $c;
+    if ($sum > 0xff) { $sum -= 0x100; }
+    $r = sprintf("%02x", $sum) . $r;
+
+    if ($main::opt_debug and $main::opt_test) { print STDERR "$r\n"; }
+
+    return $r;
+  }
+}
+
+
+# Subtract two hex addresses of length $address_length.
+# Run jeprof --test for unit test if this is changed.
+sub AddressSub {
+  my $addr1 = shift;
+  my $addr2 = shift;
+  my $diff;
+
+  if ($address_length == 8) {
+    # Perl doesn't cope with wraparound arithmetic, so do it explicitly:
+    $diff = (hex($addr1)-hex($addr2)) % (0x10000000 * 16);
+    return sprintf("%08x", $diff);
+
+  } else {
+    # Do the subtraction in 7-nibble chunks to trivialize borrow handling.
+    # if ($main::opt_debug) { print STDERR "AddressSub $addr1 - $addr2 = "; }
+
+    my $a1 = hex(substr($addr1,-7));
+    $addr1 = substr($addr1,0,-7);
+    my $a2 = hex(substr($addr2,-7));
+    $addr2 = substr($addr2,0,-7);
+    my $b = 0;
+    if ($a2 > $a1) {
+      $b = 1;
+      $a1 += 0x10000000;
+    }
+    $diff = $a1 - $a2;
+    my $r = sprintf("%07x", $diff);
+
+    $a1 = hex(substr($addr1,-7));
+    $addr1 = substr($addr1,0,-7);
+    $a2 = hex(substr($addr2,-7)) + $b;
+    $addr2 = substr($addr2,0,-7);
+    $b = 0;
+    if ($a2 > $a1) {
+      $b = 1;
+      $a1 += 0x10000000;
+    }
+    $diff = $a1 - $a2;
+    $r = sprintf("%07x", $diff) . $r;
+
+    $a1 = hex($addr1);
+    $a2 = hex($addr2) + $b;
+    if ($a2 > $a1) { $a1 += 0x100; }
+    $diff = $a1 - $a2;
+    $r = sprintf("%02x", $diff) . $r;
+
+    # if ($main::opt_debug) { print STDERR "$r\n"; }
+
+    return $r;
+  }
+}
+
+# Increment a hex address of length $address_length.
+# Run jeprof --test for unit test if this is changed.
+sub AddressInc {
+  my $addr = shift;
+  my $sum;
+
+  if ($address_length == 8) {
+    # Perl doesn't cope with wraparound arithmetic, so do it explicitly:
+    $sum = (hex($addr)+1) % (0x10000000 * 16);
+    return sprintf("%08x", $sum);
+
+  } else {
+    # Do the addition in 7-nibble chunks to trivialize carry handling.
+    # We are always doing this to step through the addresses in a function,
+    # and will almost never overflow the first chunk, so we check for this
+    # case and exit early.
+
+    # if ($main::opt_debug) { print STDERR "AddressInc $addr1 = "; }
+
+    my $a1 = substr($addr,-7);
+    $addr = substr($addr,0,-7);
+    $sum = hex($a1) + 1;
+    my $r = sprintf("%07x", $sum);
+    if ($sum <= 0xfffffff) {
+      $r = $addr . $r;
+      # if ($main::opt_debug) { print STDERR "$r\n"; }
+      return HexExtend($r);
+    } else {
+      $r = "0000000";
+    }
+
+    $a1 = substr($addr,-7);
+    $addr = substr($addr,0,-7);
+    $sum = hex($a1) + 1;
+    $r = sprintf("%07x", $sum) . $r;
+    if ($sum <= 0xfffffff) {
+      $r = $addr . $r;
+      # if ($main::opt_debug) { print STDERR "$r\n"; }
+      return HexExtend($r);
+    } else {
+      $r = "00000000000000";
+    }
+
+    $sum = hex($addr) + 1;
+    if ($sum > 0xff) { $sum -= 0x100; }
+    $r = sprintf("%02x", $sum) .
$r; + + # if ($main::opt_debug) { print STDERR "$r\n"; } + return $r; + } +} + +# Extract symbols for all PC values found in profile +sub ExtractSymbols { + my $libs = shift; + my $pcset = shift; + + my $symbols = {}; + + # Map each PC value to the containing library. To make this faster, + # we sort libraries by their starting pc value (highest first), and + # advance through the libraries as we advance the pc. Sometimes the + # addresses of libraries may overlap with the addresses of the main + # binary, so to make sure the libraries 'win', we iterate over the + # libraries in reverse order (which assumes the binary doesn't start + # in the middle of a library, which seems a fair assumption). + my @pcs = (sort { $a cmp $b } keys(%{$pcset})); # pcset is 0-extended strings + foreach my $lib (sort {$b->[1] cmp $a->[1]} @{$libs}) { + my $libname = $lib->[0]; + my $start = $lib->[1]; + my $finish = $lib->[2]; + my $offset = $lib->[3]; + + # Use debug library if it exists + my $debug_libname = DebuggingLibrary($libname); + if ($debug_libname) { + $libname = $debug_libname; + } + + # Get list of pcs that belong in this library. + my $contained = []; + my ($start_pc_index, $finish_pc_index); + # Find smallest finish_pc_index such that $finish < $pc[$finish_pc_index]. + for ($finish_pc_index = $#pcs + 1; $finish_pc_index > 0; + $finish_pc_index--) { + last if $pcs[$finish_pc_index - 1] le $finish; + } + # Find smallest start_pc_index such that $start <= $pc[$start_pc_index]. + for ($start_pc_index = $finish_pc_index; $start_pc_index > 0; + $start_pc_index--) { + last if $pcs[$start_pc_index - 1] lt $start; + } + # This keeps PC values higher than $pc[$finish_pc_index] in @pcs, + # in case there are overlaps in libraries and the main binary. + @{$contained} = splice(@pcs, $start_pc_index, + $finish_pc_index - $start_pc_index); + # Map to symbols + MapToSymbols($libname, AddressSub($start, $offset), $contained, $symbols); + } + + return $symbols; +} + +# Map list of PC values to symbols for a given image +sub MapToSymbols { + my $image = shift; + my $offset = shift; + my $pclist = shift; + my $symbols = shift; + + my $debug = 0; + + # Ignore empty binaries + if ($#{$pclist} < 0) { return; } + + # Figure out the addr2line command to use + my $addr2line = $obj_tool_map{"addr2line"}; + my $cmd = ShellEscape($addr2line, "-f", "-C", "-e", $image); + if (exists $obj_tool_map{"addr2line_pdb"}) { + $addr2line = $obj_tool_map{"addr2line_pdb"}; + $cmd = ShellEscape($addr2line, "--demangle", "-f", "-C", "-e", $image); + } + + # If "addr2line" isn't installed on the system at all, just use + # nm to get what info we can (function names, but not line numbers). + if (system(ShellEscape($addr2line, "--help") . " >$dev_null 2>&1") != 0) { + MapSymbolsWithNM($image, $offset, $pclist, $symbols); + return; + } + + # "addr2line -i" can produce a variable number of lines per input + # address, with no separator that allows us to tell when data for + # the next address starts. So we find the address for a special + # symbol (_fini) and interleave this address between all real + # addresses passed to addr2line. The name of this special symbol + # can then be used as a separator. + $sep_address = undef; # May be filled in by MapSymbolsWithNM() + my $nm_symbols = {}; + MapSymbolsWithNM($image, $offset, $pclist, $nm_symbols); + if (defined($sep_address)) { + # Only add " -i" to addr2line if the binary supports it. + # addr2line --help returns 0, but not if it sees an unknown flag first. 
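+    # Illustration (assumed addresses): with $sep_address = "00000f10" and
+    # real pcs 00001234 and 00005678, the address list fed to addr2line
+    # below becomes
+    #   00001234
+    #   00000f10
+    #   00005678
+    #   00000f10
+    # so every block of output lines up to the next _fini entry belongs
+    # to the preceding real address, even when inlining expands one
+    # address into several function/file:line pairs.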
+    if (system("$cmd -i --help >$dev_null 2>&1") == 0) {
+      $cmd .= " -i";
+    } else {
+      $sep_address = undef;   # no need for sep_address if we don't support -i
+    }
+  }
+
+  # Make file with all PC values with intervening 'sep_address' so
+  # that we can reliably detect the end of inlined function list
+  open(ADDRESSES, ">$main::tmpfile_sym") || error("$main::tmpfile_sym: $!\n");
+  if ($debug) { print("---- $image ---\n"); }
+  for (my $i = 0; $i <= $#{$pclist}; $i++) {
+    # addr2line always reads hex addresses, and does not need '0x' prefix.
+    if ($debug) { printf STDERR ("%s\n", $pclist->[$i]); }
+    printf ADDRESSES ("%s\n", AddressSub($pclist->[$i], $offset));
+    if (defined($sep_address)) {
+      printf ADDRESSES ("%s\n", $sep_address);
+    }
+  }
+  close(ADDRESSES);
+  if ($debug) {
+    print("----\n");
+    system("cat", $main::tmpfile_sym);
+    print("----\n");
+    system("$cmd < " . ShellEscape($main::tmpfile_sym));
+    print("----\n");
+  }
+
+  open(SYMBOLS, "$cmd <" . ShellEscape($main::tmpfile_sym) . " |")
+      || error("$cmd: $!\n");
+  my $count = 0;   # Index in pclist
+  while (<SYMBOLS>) {
+    # Read fullfunction and filelineinfo from next pair of lines
+    s/\r?\n$//g;
+    my $fullfunction = $_;
+    $_ = <SYMBOLS>;
+    s/\r?\n$//g;
+    my $filelinenum = $_;
+
+    if (defined($sep_address) && $fullfunction eq $sep_symbol) {
+      # Terminating marker for data for this address
+      $count++;
+      next;
+    }
+
+    $filelinenum =~ s|\\|/|g;  # turn windows-style paths into unix-style paths
+
+    my $pcstr = $pclist->[$count];
+    my $function = ShortFunctionName($fullfunction);
+    my $nms = $nm_symbols->{$pcstr};
+    if (defined($nms)) {
+      if ($fullfunction eq '??') {
+        # nm found a symbol for us.
+        $function = $nms->[0];
+        $fullfunction = $nms->[2];
+      } else {
+        # MapSymbolsWithNM tags each routine with its starting address,
+        # useful in case the image has multiple occurrences of this
+        # routine.  (It uses a syntax that resembles template parameters,
+        # that are automatically stripped out by ShortFunctionName().)
+        # addr2line does not provide the same information.  So we check
+        # if nm disambiguated our symbol, and if so take the annotated
+        # (nm) version of the routine-name.  TODO(csilvers): this won't
+        # catch overloaded, inlined symbols, which nm doesn't see.
+        # Better would be to do a check similar to nm's, in this fn.
+        if ($nms->[2] =~ m/^\Q$function\E/) {  # sanity check it's the right fn
+          $function = $nms->[0];
+          $fullfunction = $nms->[2];
+        }
+      }
+    }
+
+    # Prepend to accumulated symbols for pcstr
+    # (so that caller comes before callee)
+    my $sym = $symbols->{$pcstr};
+    if (!defined($sym)) {
+      $sym = [];
+      $symbols->{$pcstr} = $sym;
+    }
+    unshift(@{$sym}, $function, $filelinenum, $fullfunction);
+    if ($debug) { printf STDERR ("%s => [%s]\n", $pcstr, join(" ", @{$sym})); }
+    if (!defined($sep_address)) {
+      # Inlining is off, so this entry ends immediately
+      $count++;
+    }
+  }
+  close(SYMBOLS);
+}
+
+# Use nm to map the list of referenced PCs to symbols.  Return true iff we
+# are able to read procedure information via nm.
+sub MapSymbolsWithNM {
+  my $image = shift;
+  my $offset = shift;
+  my $pclist = shift;
+  my $symbols = shift;
+
+  # Get nm output sorted by increasing address
+  my $symbol_table = GetProcedureBoundaries($image, ".");
+  if (!%{$symbol_table}) {
+    return 0;
+  }
+  # Start addresses are already the right length (8 or 16 hex digits).
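+  # The pc-to-symbol matching below is effectively a merge join: both the
+  # sorted pc list and the nm-derived start addresses are walked in
+  # lockstep, advancing the symbol index only while the current pc lies at
+  # or beyond the current symbol's end address.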
+  my @names = sort { $symbol_table->{$a}->[0] cmp $symbol_table->{$b}->[0] }
+              keys(%{$symbol_table});
+
+  if ($#names < 0) {
+    # No symbols: just use addresses
+    foreach my $pc (@{$pclist}) {
+      my $pcstr = "0x" . $pc;
+      $symbols->{$pc} = [$pcstr, "?", $pcstr];
+    }
+    return 0;
+  }
+
+  # Sort addresses so we can do a join against nm output
+  my $index = 0;
+  my $fullname = $names[0];
+  my $name = ShortFunctionName($fullname);
+  foreach my $pc (sort { $a cmp $b } @{$pclist}) {
+    # Adjust for mapped offset
+    my $mpc = AddressSub($pc, $offset);
+    while (($index < $#names) && ($mpc ge $symbol_table->{$fullname}->[1])){
+      $index++;
+      $fullname = $names[$index];
+      $name = ShortFunctionName($fullname);
+    }
+    if ($mpc lt $symbol_table->{$fullname}->[1]) {
+      $symbols->{$pc} = [$name, "?", $fullname];
+    } else {
+      my $pcstr = "0x" . $pc;
+      $symbols->{$pc} = [$pcstr, "?", $pcstr];
+    }
+  }
+  return 1;
+}
+
+sub ShortFunctionName {
+  my $function = shift;
+  while ($function =~ s/\([^()]*\)(\s*const)?//g) { }   # Argument types
+  while ($function =~ s/<[^<>]*>//g)  { }               # Remove template arguments
+  $function =~ s/^.*\s+(\w+::)/$1/;                     # Remove leading type
+  return $function;
+}
+
+# Trim overly long symbols found in disassembler output
+sub CleanDisassembly {
+  my $d = shift;
+  while ($d =~ s/\([^()%]*\)(\s*const)?//g) { }  # Argument types, not (%rax)
+  while ($d =~ s/(\w+)<[^<>]*>/$1/g)  { }        # Remove template arguments
+  return $d;
+}
+
+# Clean file name for display
+sub CleanFileName {
+  my ($f) = @_;
+  $f =~ s|^/proc/self/cwd/||;
+  $f =~ s|^\./||;
+  return $f;
+}
+
+# Make address relative to section and clean up for display
+sub UnparseAddress {
+  my ($offset, $address) = @_;
+  $address = AddressSub($address, $offset);
+  $address =~ s/^0x//;
+  $address =~ s/^0*//;
+  return $address;
+}
+
+##### Miscellaneous #####
+
+# Find the right versions of the above object tools to use.  The
+# argument is the program file being analyzed, and should be an ELF
+# 32-bit or ELF 64-bit executable file.  The location of the tools
+# is determined by considering the following options in this order:
+#   1) --tools option, if set
+#   2) JEPROF_TOOLS environment variable, if set
+#   3) the environment
+sub ConfigureObjTools {
+  my $prog_file = shift;
+
+  # Check for the existence of $prog_file because /usr/bin/file does not
+  # predictably return error status in prod.
+  (-e $prog_file)  || error("$prog_file does not exist.\n");
+
+  my $file_type = undef;
+  if (-e "/usr/bin/file") {
+    # Follow symlinks (at least for systems where "file" supports that).
+    my $escaped_prog_file = ShellEscape($prog_file);
+    $file_type = `/usr/bin/file -L $escaped_prog_file 2>$dev_null ||
+                  /usr/bin/file $escaped_prog_file`;
+  } elsif ($^O eq "MSWin32") {
+    $file_type = "MS Windows";
+  } else {
+    print STDERR "WARNING: Can't determine the file type of $prog_file\n";
+  }
+
+  if ($file_type =~ /64-bit/) {
+    # Change $address_length to 16 if the program file is ELF 64-bit.
+    # We can't detect this from many (most?) heap or lock contention
+    # profiles, since the actual addresses referenced are generally in low
+    # memory even for 64-bit programs.
+    $address_length = 16;
+  }
+
+  if ($file_type =~ /MS Windows/) {
+    # For windows, we provide a version of nm and addr2line as part of
+    # the opensource release, which is capable of parsing
+    # Windows-style PDB executables.  It should live in the path, or
+    # in the same directory as jeprof.
+ $obj_tool_map{"nm_pdb"} = "nm-pdb"; + $obj_tool_map{"addr2line_pdb"} = "addr2line-pdb"; + } + + if ($file_type =~ /Mach-O/) { + # OS X uses otool to examine Mach-O files, rather than objdump. + $obj_tool_map{"otool"} = "otool"; + $obj_tool_map{"addr2line"} = "false"; # no addr2line + $obj_tool_map{"objdump"} = "false"; # no objdump + } + + # Go fill in %obj_tool_map with the pathnames to use: + foreach my $tool (keys %obj_tool_map) { + $obj_tool_map{$tool} = ConfigureTool($obj_tool_map{$tool}); + } +} + +# Returns the path of a caller-specified object tool. If --tools or +# JEPROF_TOOLS are specified, then returns the full path to the tool +# with that prefix. Otherwise, returns the path unmodified (which +# means we will look for it on PATH). +sub ConfigureTool { + my $tool = shift; + my $path; + + # --tools (or $JEPROF_TOOLS) is a comma separated list, where each + # item is either a) a pathname prefix, or b) a map of the form + # :. First we look for an entry of type (b) for our + # tool. If one is found, we use it. Otherwise, we consider all the + # pathname prefixes in turn, until one yields an existing file. If + # none does, we use a default path. + my $tools = $main::opt_tools || $ENV{"JEPROF_TOOLS"} || ""; + if ($tools =~ m/(,|^)\Q$tool\E:([^,]*)/) { + $path = $2; + # TODO(csilvers): sanity-check that $path exists? Hard if it's relative. + } elsif ($tools ne '') { + foreach my $prefix (split(',', $tools)) { + next if ($prefix =~ /:/); # ignore "tool:fullpath" entries in the list + if (-x $prefix . $tool) { + $path = $prefix . $tool; + last; + } + } + if (!$path) { + error("No '$tool' found with prefix specified by " . + "--tools (or \$JEPROF_TOOLS) '$tools'\n"); + } + } else { + # ... otherwise use the version that exists in the same directory as + # jeprof. If there's nothing there, use $PATH. + $0 =~ m,[^/]*$,; # this is everything after the last slash + my $dirname = $`; # this is everything up to and including the last slash + if (-x "$dirname$tool") { + $path = "$dirname$tool"; + } else { + $path = $tool; + } + } + if ($main::opt_debug) { print STDERR "Using '$path' for '$tool'.\n"; } + return $path; +} + +sub ShellEscape { + my @escaped_words = (); + foreach my $word (@_) { + my $escaped_word = $word; + if ($word =~ m![^a-zA-Z0-9/.,_=-]!) { # check for anything not in whitelist + $escaped_word =~ s/'/'\\''/; + $escaped_word = "'$escaped_word'"; + } + push(@escaped_words, $escaped_word); + } + return join(" ", @escaped_words); +} + +sub cleanup { + unlink($main::tmpfile_sym); + unlink(keys %main::tempnames); + + # We leave any collected profiles in $HOME/jeprof in case the user wants + # to look at them later. We print a message informing them of this. 
+  if ((scalar(@main::profile_files) > 0) &&
+      defined($main::collected_profile)) {
+    if (scalar(@main::profile_files) == 1) {
+      print STDERR "Dynamically gathered profile is in $main::collected_profile\n";
+    }
+    print STDERR "If you want to investigate this profile further, you can do:\n";
+    print STDERR "\n";
+    print STDERR "  jeprof \\\n";
+    print STDERR "    $main::prog \\\n";
+    print STDERR "    $main::collected_profile\n";
+    print STDERR "\n";
+  }
+}
+
+sub sighandler {
+  cleanup();
+  exit(1);
+}
+
+sub error {
+  my $msg = shift;
+  print STDERR $msg;
+  cleanup();
+  exit(1);
+}
+
+
+# Run $nm_command and get all the resulting procedure boundaries whose
+# names match "$regexp", returning them in a hashtable mapping from
+# procedure name to a two-element vector of [start address, end address]
+sub GetProcedureBoundariesViaNm {
+  my $escaped_nm_command = shift;    # shell-escaped
+  my $regexp = shift;
+
+  my $symbol_table = {};
+  open(NM, "$escaped_nm_command |") || error("$escaped_nm_command: $!\n");
+  my $last_start = "0";
+  my $routine = "";
+  while (<NM>) {
+    s/\r//g;         # turn windows-looking lines into unix-looking lines
+    if (m/^\s*([0-9a-f]+) (.) (..*)/) {
+      my $start_val = $1;
+      my $type = $2;
+      my $this_routine = $3;
+
+      # It's possible for two symbols to share the same address, if
+      # one is a zero-length variable (like __start_google_malloc) or
+      # one symbol is a weak alias to another (like __libc_malloc).
+      # In such cases, we want to ignore all values except for the
+      # actual symbol, which in nm-speak has type "T".  The logic
+      # below does this, though it's a bit tricky: what happens when
+      # we have a series of lines with the same address is that the
+      # first one gets queued up to be processed.  However, it won't
+      # *actually* be processed until later, when we read a line with
+      # a different address.  That means that as long as we're reading
+      # lines with the same address, we have a chance to replace that
+      # item in the queue, which we do whenever we see a 'T' entry --
+      # that is, a line with type 'T'.  If we never see a 'T' entry,
+      # we'll just go ahead and process the first entry (which never
+      # got touched in the queue), and ignore the others.
+      if ($start_val eq $last_start && $type =~ /t/i) {
+        # We are the 'T' symbol at this address, replace previous symbol.
+        $routine = $this_routine;
+        next;
+      } elsif ($start_val eq $last_start) {
+        # We're not the 'T' symbol at this address, so ignore us.
+        next;
+      }
+
+      if ($this_routine eq $sep_symbol) {
+        $sep_address = HexExtend($start_val);
+      }
+
+      # Tag this routine with the starting address in case the image
+      # has multiple occurrences of this routine.  We use a syntax
+      # that resembles template parameters that are automatically
+      # stripped out by ShortFunctionName()
+      $this_routine .= "<$start_val>";
+
+      if (defined($routine) && $routine =~ m/$regexp/) {
+        $symbol_table->{$routine} = [HexExtend($last_start),
+                                     HexExtend($start_val)];
+      }
+      $last_start = $start_val;
+      $routine = $this_routine;
+    } elsif (m/^Loaded image name: (.+)/) {
+      # The win32 nm workalike emits information about the binary it is using.
+      if ($main::opt_debug) { print STDERR "Using Image $1\n"; }
+    } elsif (m/^PDB file name: (.+)/) {
+      # The win32 nm workalike emits information about the pdb it is using.
+      if ($main::opt_debug) { print STDERR "Using PDB $1\n"; }
+    }
+  }
+  close(NM);
+  # Handle the last line in the nm output.  Unfortunately, we don't know
+  # how big this last symbol is, because we don't know how big the file
+  # is.  For now, we just give it a size of 0.
+  # TODO(csilvers): do better here.
+  if (defined($routine) && $routine =~ m/$regexp/) {
+    $symbol_table->{$routine} = [HexExtend($last_start),
+                                 HexExtend($last_start)];
+  }
+  return $symbol_table;
+}
+
+# Gets the procedure boundaries for all routines in "$image" whose names
+# match "$regexp" and returns them in a hashtable mapping from procedure
+# name to a two-element vector of [start address, end address].
+# Will return an empty map if nm is not installed or not working properly.
+sub GetProcedureBoundaries {
+  my $image = shift;
+  my $regexp = shift;
+
+  # If $image doesn't start with /, then put ./ in front of it.  This works
+  # around an obnoxious bug in our probing of nm -f behavior.
+  # "nm -f $image" is supposed to fail on GNU nm, but if:
+  #
+  # a. $image starts with [BbSsPp] (for example, bin/foo/bar), AND
+  # b. you have a.out in your current directory (a not uncommon occurrence)
+  #
+  # then "nm -f $image" succeeds because -f only looks at the first letter of
+  # the argument, which looks valid because it's [BbSsPp], and then since
+  # there's no image provided, it looks for a.out and finds it.
+  #
+  # This regex makes sure that $image starts with . or /, forcing the -f
+  # parsing to fail since . and / are not valid formats.
+  $image =~ s#^[^/]#./$&#;
+
+  # For libc libraries, the copy in /usr/lib/debug contains debugging symbols
+  my $debugging = DebuggingLibrary($image);
+  if ($debugging) {
+    $image = $debugging;
+  }
+
+  my $nm = $obj_tool_map{"nm"};
+  my $cppfilt = $obj_tool_map{"c++filt"};
+
+  # nm can fail for two reasons: 1) $image isn't a debug library; 2) nm
+  # binary doesn't support --demangle.  In addition, for OS X we need
+  # to use the -f flag to get 'flat' nm output (otherwise we don't sort
+  # properly and get incorrect results).  Unfortunately, GNU nm uses -f
+  # in an incompatible way.  So first we test whether our nm supports
+  # --demangle and -f.
+  my $demangle_flag = "";
+  my $cppfilt_flag = "";
+  my $to_devnull = ">$dev_null 2>&1";
+  if (system(ShellEscape($nm, "--demangle", $image) . $to_devnull) == 0) {
+    # In this mode, we do "nm --demangle <foo>"
+    $demangle_flag = "--demangle";
+    $cppfilt_flag = "";
+  } elsif (system(ShellEscape($cppfilt, $image) . $to_devnull) == 0) {
+    # In this mode, we do "nm <foo> | c++filt"
+    $cppfilt_flag = " | " . ShellEscape($cppfilt);
+  };
+  my $flatten_flag = "";
+  if (system(ShellEscape($nm, "-f", $image) . $to_devnull) == 0) {
+    $flatten_flag = "-f";
+  }
+
+  # Finally, in the case $image isn't a debug library, we try again with
+  # -D to at least get *exported* symbols.  If we can't use --demangle,
+  # we use c++filt instead, if it exists on this system.
+  my @nm_commands = (ShellEscape($nm, "-n", $flatten_flag, $demangle_flag,
+                                 $image) . " 2>$dev_null $cppfilt_flag",
+                     ShellEscape($nm, "-D", "-n", $flatten_flag, $demangle_flag,
+                                 $image) . " 2>$dev_null $cppfilt_flag",
+                     # 6nm is for Go binaries
+                     ShellEscape("6nm", "$image") . " 2>$dev_null | sort",
+                     );
+
+  # If the executable is an MS Windows PDB-format executable, we'll
+  # have set up obj_tool_map("nm_pdb").  In this case, we actually
+  # want to use both unix nm and windows-specific nm_pdb, since
+  # PDB-format executables can apparently include dwarf .o files.
+  if (exists $obj_tool_map{"nm_pdb"}) {
+    push(@nm_commands,
+         ShellEscape($obj_tool_map{"nm_pdb"}, "--demangle", $image)
+         .
" 2>$dev_null"); + } + + foreach my $nm_command (@nm_commands) { + my $symbol_table = GetProcedureBoundariesViaNm($nm_command, $regexp); + return $symbol_table if (%{$symbol_table}); + } + my $symbol_table = {}; + return $symbol_table; +} + + +# The test vectors for AddressAdd/Sub/Inc are 8-16-nibble hex strings. +# To make them more readable, we add underscores at interesting places. +# This routine removes the underscores, producing the canonical representation +# used by jeprof to represent addresses, particularly in the tested routines. +sub CanonicalHex { + my $arg = shift; + return join '', (split '_',$arg); +} + + +# Unit test for AddressAdd: +sub AddressAddUnitTest { + my $test_data_8 = shift; + my $test_data_16 = shift; + my $error_count = 0; + my $fail_count = 0; + my $pass_count = 0; + # print STDERR "AddressAddUnitTest: ", 1+$#{$test_data_8}, " tests\n"; + + # First a few 8-nibble addresses. Note that this implementation uses + # plain old arithmetic, so a quick sanity check along with verifying what + # happens to overflow (we want it to wrap): + $address_length = 8; + foreach my $row (@{$test_data_8}) { + if ($main::opt_debug and $main::opt_test) { print STDERR "@{$row}\n"; } + my $sum = AddressAdd ($row->[0], $row->[1]); + if ($sum ne $row->[2]) { + printf STDERR "ERROR: %s != %s + %s = %s\n", $sum, + $row->[0], $row->[1], $row->[2]; + ++$fail_count; + } else { + ++$pass_count; + } + } + printf STDERR "AddressAdd 32-bit tests: %d passes, %d failures\n", + $pass_count, $fail_count; + $error_count = $fail_count; + $fail_count = 0; + $pass_count = 0; + + # Now 16-nibble addresses. + $address_length = 16; + foreach my $row (@{$test_data_16}) { + if ($main::opt_debug and $main::opt_test) { print STDERR "@{$row}\n"; } + my $sum = AddressAdd (CanonicalHex($row->[0]), CanonicalHex($row->[1])); + my $expected = join '', (split '_',$row->[2]); + if ($sum ne CanonicalHex($row->[2])) { + printf STDERR "ERROR: %s != %s + %s = %s\n", $sum, + $row->[0], $row->[1], $row->[2]; + ++$fail_count; + } else { + ++$pass_count; + } + } + printf STDERR "AddressAdd 64-bit tests: %d passes, %d failures\n", + $pass_count, $fail_count; + $error_count += $fail_count; + + return $error_count; +} + + +# Unit test for AddressSub: +sub AddressSubUnitTest { + my $test_data_8 = shift; + my $test_data_16 = shift; + my $error_count = 0; + my $fail_count = 0; + my $pass_count = 0; + # print STDERR "AddressSubUnitTest: ", 1+$#{$test_data_8}, " tests\n"; + + # First a few 8-nibble addresses. Note that this implementation uses + # plain old arithmetic, so a quick sanity check along with verifying what + # happens to overflow (we want it to wrap): + $address_length = 8; + foreach my $row (@{$test_data_8}) { + if ($main::opt_debug and $main::opt_test) { print STDERR "@{$row}\n"; } + my $sum = AddressSub ($row->[0], $row->[1]); + if ($sum ne $row->[3]) { + printf STDERR "ERROR: %s != %s - %s = %s\n", $sum, + $row->[0], $row->[1], $row->[3]; + ++$fail_count; + } else { + ++$pass_count; + } + } + printf STDERR "AddressSub 32-bit tests: %d passes, %d failures\n", + $pass_count, $fail_count; + $error_count = $fail_count; + $fail_count = 0; + $pass_count = 0; + + # Now 16-nibble addresses. 
+ $address_length = 16; + foreach my $row (@{$test_data_16}) { + if ($main::opt_debug and $main::opt_test) { print STDERR "@{$row}\n"; } + my $sum = AddressSub (CanonicalHex($row->[0]), CanonicalHex($row->[1])); + if ($sum ne CanonicalHex($row->[3])) { + printf STDERR "ERROR: %s != %s - %s = %s\n", $sum, + $row->[0], $row->[1], $row->[3]; + ++$fail_count; + } else { + ++$pass_count; + } + } + printf STDERR "AddressSub 64-bit tests: %d passes, %d failures\n", + $pass_count, $fail_count; + $error_count += $fail_count; + + return $error_count; +} + + +# Unit test for AddressInc: +sub AddressIncUnitTest { + my $test_data_8 = shift; + my $test_data_16 = shift; + my $error_count = 0; + my $fail_count = 0; + my $pass_count = 0; + # print STDERR "AddressIncUnitTest: ", 1+$#{$test_data_8}, " tests\n"; + + # First a few 8-nibble addresses. Note that this implementation uses + # plain old arithmetic, so a quick sanity check along with verifying what + # happens to overflow (we want it to wrap): + $address_length = 8; + foreach my $row (@{$test_data_8}) { + if ($main::opt_debug and $main::opt_test) { print STDERR "@{$row}\n"; } + my $sum = AddressInc ($row->[0]); + if ($sum ne $row->[4]) { + printf STDERR "ERROR: %s != %s + 1 = %s\n", $sum, + $row->[0], $row->[4]; + ++$fail_count; + } else { + ++$pass_count; + } + } + printf STDERR "AddressInc 32-bit tests: %d passes, %d failures\n", + $pass_count, $fail_count; + $error_count = $fail_count; + $fail_count = 0; + $pass_count = 0; + + # Now 16-nibble addresses. + $address_length = 16; + foreach my $row (@{$test_data_16}) { + if ($main::opt_debug and $main::opt_test) { print STDERR "@{$row}\n"; } + my $sum = AddressInc (CanonicalHex($row->[0])); + if ($sum ne CanonicalHex($row->[4])) { + printf STDERR "ERROR: %s != %s + 1 = %s\n", $sum, + $row->[0], $row->[4]; + ++$fail_count; + } else { + ++$pass_count; + } + } + printf STDERR "AddressInc 64-bit tests: %d passes, %d failures\n", + $pass_count, $fail_count; + $error_count += $fail_count; + + return $error_count; +} + + +# Driver for unit tests. +# Currently just the address add/subtract/increment routines for 64-bit. +sub RunUnitTests { + my $error_count = 0; + + # This is a list of tuples [a, b, a+b, a-b, a+1] + my $unit_test_data_8 = [ + [qw(aaaaaaaa 50505050 fafafafa 5a5a5a5a aaaaaaab)], + [qw(50505050 aaaaaaaa fafafafa a5a5a5a6 50505051)], + [qw(ffffffff aaaaaaaa aaaaaaa9 55555555 00000000)], + [qw(00000001 ffffffff 00000000 00000002 00000002)], + [qw(00000001 fffffff0 fffffff1 00000011 00000002)], + ]; + my $unit_test_data_16 = [ + # The implementation handles data in 7-nibble chunks, so those are the + # interesting boundaries. 
+    [qw(aaaaaaaa 50505050
+        00_000000f_afafafa 00_0000005_a5a5a5a 00_000000a_aaaaaab)],
+    [qw(50505050 aaaaaaaa
+        00_000000f_afafafa ff_ffffffa_5a5a5a6 00_0000005_0505051)],
+    [qw(ffffffff aaaaaaaa
+        00_000001a_aaaaaa9 00_0000005_5555555 00_0000010_0000000)],
+    [qw(00000001 ffffffff
+        00_0000010_0000000 ff_ffffff0_0000002 00_0000000_0000002)],
+    [qw(00000001 fffffff0
+        00_000000f_ffffff1 ff_ffffff0_0000011 00_0000000_0000002)],
+
+    [qw(00_a00000a_aaaaaaa 50505050
+        00_a00000f_afafafa 00_a000005_a5a5a5a 00_a00000a_aaaaaab)],
+    [qw(0f_fff0005_0505050 aaaaaaaa
+        0f_fff000f_afafafa 0f_ffefffa_5a5a5a6 0f_fff0005_0505051)],
+    [qw(00_000000f_fffffff 01_800000a_aaaaaaa
+        01_800001a_aaaaaa9 fe_8000005_5555555 00_0000010_0000000)],
+    [qw(00_0000000_0000001 ff_fffffff_fffffff
+        00_0000000_0000000 00_0000000_0000002 00_0000000_0000002)],
+    [qw(00_0000000_0000001 ff_fffffff_ffffff0
+        ff_fffffff_ffffff1 00_0000000_0000011 00_0000000_0000002)],
+  ];
+
+  $error_count += AddressAddUnitTest($unit_test_data_8, $unit_test_data_16);
+  $error_count += AddressSubUnitTest($unit_test_data_8, $unit_test_data_16);
+  $error_count += AddressIncUnitTest($unit_test_data_8, $unit_test_data_16);
+  if ($error_count > 0) {
+    print STDERR $error_count, " errors: FAILED\n";
+  } else {
+    print STDERR "PASS\n";
+  }
+  exit ($error_count);
+}
\ No newline at end of file
diff --git a/src/server/status_server/mod.rs b/src/server/status_server/mod.rs
index 679f21fdf6c..60b267a6d94 100644
--- a/src/server/status_server/mod.rs
+++ b/src/server/status_server/mod.rs
@@ -3,6 +3,7 @@
 /// Provides profilers for TiKV.
 mod profile;
 use std::{
+    env::args,
     error::Error as StdError,
     net::SocketAddr,
     path::PathBuf,
@@ -39,10 +40,8 @@ use openssl::{
     x509::X509,
 };
 use pin_project::pin_project;
-pub use profile::{
-    activate_heap_profile, deactivate_heap_profile, jeprof_heap_profile, list_heap_profiles,
-    read_file, start_one_cpu_profile, start_one_heap_profile,
-};
+pub use profile::HEAP_PROFILE_ACTIVE;
+use profile::*;
 use prometheus::TEXT_FORMAT;
 use regex::Regex;
 use resource_control::ResourceGroupManager;
@@ -168,16 +167,22 @@ where
                 Ok(val) => val,
                 Err(err) => return Ok(make_response(StatusCode::BAD_REQUEST, err.to_string())),
             },
-            None => 60,
+            None => 0,
         };
-        let interval = Duration::from_secs(interval);
-        let period = GLOBAL_TIMER_HANDLE
-            .interval(Instant::now() + interval, interval)
-            .compat()
-            .map_ok(|_| ())
-            .map_err(|_| TIMER_CANCELED.to_owned())
-            .into_stream();
+        let period = if interval == 0 {
+            None
+        } else {
+            let interval = Duration::from_secs(interval);
+            Some(
+                GLOBAL_TIMER_HANDLE
+                    .interval(Instant::now() + interval, interval)
+                    .compat()
+                    .map_ok(|_| ())
+                    .map_err(|_| TIMER_CANCELED.to_owned())
+                    .into_stream(),
+            )
+        };
         let (tx, rx) = oneshot::channel();
         let callback = move || tx.send(()).unwrap_or_default();
         let res = Handle::current().spawn(activate_heap_profile(period, store_path, callback));
@@ -199,7 +204,6 @@ where
         Ok(make_response(StatusCode::OK, body))
     }
 
-    #[allow(dead_code)]
    async fn dump_heap_prof_to_resp(req: Request<Body>) -> hyper::Result<Response<Body>> {
         let query = req.uri().query().unwrap_or("");
         let query_pairs: HashMap<_, _> = url::form_urlencoded::parse(query.as_bytes()).collect();
@@ -207,27 +211,37 @@ where
         let use_jeprof = query_pairs.get("jeprof").map(|x| x.as_ref()) == Some("true");
 
         let result = if let Some(name) = query_pairs.get("name") {
-            if use_jeprof {
-                jeprof_heap_profile(name)
-            } else {
-                read_file(name)
+            let re = Regex::new(HEAP_PROFILE_REGEX).unwrap();
+            if !re.is_match(name) {
+                let errmsg =
format!("heap profile name {} is invalid", name); + return Ok(make_response(StatusCode::BAD_REQUEST, errmsg)); } - } else { - let mut seconds = 10; - if let Some(s) = query_pairs.get("seconds") { - match s.parse() { - Ok(val) => seconds = val, - Err(_) => { - let errmsg = "request should have seconds argument".to_owned(); - return Ok(make_response(StatusCode::BAD_REQUEST, errmsg)); + let profiles = match list_heap_profiles() { + Ok(s) => s, + Err(e) => return Ok(make_response(StatusCode::INTERNAL_SERVER_ERROR, e)), + }; + if profiles.iter().any(|(f, _)| f == name) { + let dir = match heap_profiles_dir() { + Some(path) => path, + None => { + return Ok(make_response( + StatusCode::INTERNAL_SERVER_ERROR, + "heap profile is not active", + )); } + }; + let path = dir.join(name.as_ref()); + if use_jeprof { + jeprof_heap_profile(path.to_str().unwrap()) + } else { + read_file(path.to_str().unwrap()) } + } else { + let errmsg = format!("heap profile {} not found", name); + return Ok(make_response(StatusCode::BAD_REQUEST, errmsg)); } - let timer = GLOBAL_TIMER_HANDLE.delay(Instant::now() + Duration::from_secs(seconds)); - let end = Compat01As03::new(timer) - .map_err(|_| TIMER_CANCELED.to_owned()) - .into_future(); - start_one_heap_profile(end, use_jeprof).await + } else { + dump_one_heap_profile() }; match result { @@ -283,11 +297,100 @@ where }) } + async fn get_cmdline(_req: Request) -> hyper::Result> { + let args = args().into_iter().fold(String::new(), |mut a, b| { + a.push_str(&b); + a.push('\x00'); + a + }); + let response = Response::builder() + .header("Content-Type", mime::TEXT_PLAIN.to_string()) + .header("X-Content-Type-Options", "nosniff") + .body(args.into()) + .unwrap(); + Ok(response) + } + + async fn get_symbol_count(req: Request) -> hyper::Result> { + assert_eq!(req.method(), Method::GET); + // We don't know how many symbols we have, but we + // do have symbol information. pprof only cares whether + // this number is 0 (no symbols available) or > 0. + let text = "num_symbols: 1\n"; + let response = Response::builder() + .header("Content-Type", mime::TEXT_PLAIN.to_string()) + .header("X-Content-Type-Options", "nosniff") + .header("Content-Length", text.len()) + .body(text.into()) + .unwrap(); + Ok(response) + } + + // The request and response format follows pprof remote server + // https://gperftools.github.io/gperftools/pprof_remote_servers.html + // Here is the go pprof implementation: + // https://github.com/golang/go/blob/3857a89e7eb872fa22d569e70b7e076bec74ebbb/src/net/http/pprof/pprof.go#L191 + async fn get_symbol(req: Request) -> hyper::Result> { + assert_eq!(req.method(), Method::POST); + let mut text = String::new(); + let body_bytes = hyper::body::to_bytes(req.into_body()).await?; + let body = String::from_utf8(body_bytes.to_vec()).unwrap(); + + // The request body is a list of addr to be resolved joined by '+'. + // Resolve addrs with addr2line and write the symbols each per line in + // response. + for pc in body.split('+') { + let addr = usize::from_str_radix(pc.trim_start_matches("0x"), 16).unwrap_or(0); + if addr == 0 { + info!("invalid addr: {}", addr); + continue; + } + + // Would be multiple symbols if inlined. 
+            let mut syms = vec![];
+            backtrace::resolve(addr as *mut std::ffi::c_void, |sym| {
+                let name = sym
+                    .name()
+                    .unwrap_or_else(|| backtrace::SymbolName::new(b"<unknown>"));
+                syms.push(name.to_string());
+            });
+
+            if !syms.is_empty() {
+                // join inline functions with '--'
+                let f = syms.join("--");
+                // should be "<address> <symbol>"
+                text.push_str(format!("{:#x} {}\n", addr, f).as_str());
+            } else {
+                info!("can't resolve mapped addr: {:#x}", addr);
+                text.push_str(format!("{:#x} ??\n", addr).as_str());
+            }
+        }
+        let response = Response::builder()
+            .header("Content-Type", mime::TEXT_PLAIN.to_string())
+            .header("X-Content-Type-Options", "nosniff")
+            .header("Content-Length", text.len())
+            .body(text.into())
+            .unwrap();
+        Ok(response)
+    }
+
     async fn update_config(
         cfg_controller: ConfigController,
         req: Request<Body>,
     ) -> hyper::Result<Response<Body>> {
         let mut body = Vec::new();
+        let mut persist = true;
+        if let Some(query) = req.uri().query() {
+            let query_pairs: HashMap<_, _> =
+                url::form_urlencoded::parse(query.as_bytes()).collect();
+            persist = match query_pairs.get("persist") {
+                Some(val) => match val.parse() {
+                    Ok(val) => val,
+                    Err(err) => return Ok(make_response(StatusCode::BAD_REQUEST, err.to_string())),
+                },
+                None => true,
+            };
+        }
         req.into_body()
             .try_for_each(|bytes| {
                 body.extend(bytes);
@@ -295,7 +398,11 @@
             })
             .await?;
         Ok(match decode_json(&body) {
-            Ok(change) => match cfg_controller.update(change) {
+            Ok(change) => match if persist {
+                cfg_controller.update(change)
+            } else {
+                cfg_controller.update_without_persist(change)
+            } {
                 Err(e) => {
                     if let Some(e) = e.downcast_ref::<std::io::Error>() {
                         make_response(
@@ -649,9 +756,14 @@ where
                    (Method::GET, "/debug/pprof/heap_deactivate") => {
                         Self::deactivate_heap_prof(req)
                     }
-                    // (Method::GET, "/debug/pprof/heap") => {
-                    //     Self::dump_heap_prof_to_resp(req).await
-                    // }
+                    (Method::GET, "/debug/pprof/heap") => {
+                        Self::dump_heap_prof_to_resp(req).await
+                    }
+                    (Method::GET, "/debug/pprof/cmdline") => Self::get_cmdline(req).await,
+                    (Method::GET, "/debug/pprof/symbol") => {
+                        Self::get_symbol_count(req).await
+                    }
+                    (Method::POST, "/debug/pprof/symbol") => Self::get_symbol(req).await,
                     (Method::GET, "/config") => {
                         Self::get_config(req, &cfg_controller).await
                     }
@@ -1202,6 +1314,76 @@ mod tests {
         status_server.stop();
     }
 
+    #[test]
+    fn test_update_config_endpoint() {
+        let test_config = |persist: bool| {
+            let temp_dir = tempfile::TempDir::new().unwrap();
+            let mut config = TikvConfig::default();
+            config.cfg_path = temp_dir
+                .path()
+                .join("tikv.toml")
+                .to_str()
+                .unwrap()
+                .to_string();
+            let mut status_server = StatusServer::new(
+                1,
+                ConfigController::new(config),
+                Arc::new(SecurityConfig::default()),
+                MockRouter,
+                temp_dir.path().to_path_buf(),
+                None,
+                GrpcServiceManager::dummy(),
+            )
+            .unwrap();
+            let addr = "127.0.0.1:0".to_owned();
+            let _ = status_server.start(addr);
+            let client = Client::new();
+            let uri = if persist {
+                Uri::builder()
+                    .scheme("http")
+                    .authority(status_server.listening_addr().to_string().as_str())
+                    .path_and_query("/config")
+                    .build()
+                    .unwrap()
+            } else {
+                Uri::builder()
+                    .scheme("http")
+                    .authority(status_server.listening_addr().to_string().as_str())
+                    .path_and_query("/config?persist=false")
+                    .build()
+                    .unwrap()
+            };
+            let mut req = Request::new(Body::from("{\"coprocessor.region-split-size\": \"1GB\"}"));
+            *req.method_mut() = Method::POST;
+            *req.uri_mut() = uri.clone();
+            let handle = status_server.thread_pool.spawn(async move {
+                let resp = client.request(req).await.unwrap();
+                assert_eq!(resp.status(), StatusCode::OK);
+            });
+ block_on(handle).unwrap(); + + let client = Client::new(); + let handle2 = status_server.thread_pool.spawn(async move { + let resp = client.get(uri).await.unwrap(); + assert_eq!(resp.status(), StatusCode::OK); + let mut v = Vec::new(); + resp.into_body() + .try_for_each(|bytes| { + v.extend(bytes); + ok(()) + }) + .await + .unwrap(); + let resp_json = String::from_utf8_lossy(&v).to_string(); + assert!(resp_json.contains("\"region-split-size\":\"1GiB\"")); + }); + block_on(handle2).unwrap(); + status_server.stop(); + }; + test_config(true); + test_config(false); + } + #[cfg(feature = "failpoints")] #[test] fn test_status_service_fail_endpoints() { @@ -1547,6 +1729,59 @@ mod tests { status_server.stop(); } + #[test] + fn test_pprof_symbol_service() { + let _test_guard = TEST_PROFILE_MUTEX.lock().unwrap(); + let temp_dir = tempfile::TempDir::new().unwrap(); + let mut status_server = StatusServer::new( + 1, + ConfigController::default(), + Arc::new(SecurityConfig::default()), + MockRouter, + temp_dir.path().to_path_buf(), + None, + GrpcServiceManager::dummy(), + ) + .unwrap(); + let addr = "127.0.0.1:0".to_owned(); + let _ = status_server.start(addr); + let client = Client::new(); + + let mut addr = None; + backtrace::trace(|f| { + addr = Some(f.ip()); + false + }); + assert!(addr.is_some()); + + let uri = Uri::builder() + .scheme("http") + .authority(status_server.listening_addr().to_string().as_str()) + .path_and_query("/debug/pprof/symbol") + .build() + .unwrap(); + let req = Request::builder() + .method(Method::POST) + .uri(uri) + .body(Body::from(format!("{:p}", addr.unwrap()))) + .unwrap(); + let handle = status_server + .thread_pool + .spawn(async move { client.request(req).await.unwrap() }); + let resp = block_on(handle).unwrap(); + assert_eq!(resp.status(), StatusCode::OK); + let body_bytes = block_on(hyper::body::to_bytes(resp.into_body())).unwrap(); + assert!( + String::from_utf8(body_bytes.as_ref().to_owned()) + .unwrap() + .split(' ') + .last() + .unwrap() + .starts_with("backtrace::backtrace") + ); + status_server.stop(); + } + #[test] fn test_metrics() { let _test_guard = TEST_PROFILE_MUTEX.lock().unwrap(); diff --git a/src/server/status_server/profile.rs b/src/server/status_server/profile.rs index b3d91d3bea6..dbf819b35fe 100644 --- a/src/server/status_server/profile.rs +++ b/src/server/status_server/profile.rs @@ -1,11 +1,11 @@ // Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. use std::{ fs::{File, Metadata}, - io::Read, + io::{Read, Write}, path::PathBuf, pin::Pin, - process::Command, - sync::Mutex as StdMutex, + process::{Command, Stdio}, + sync::Mutex, time::{Duration, UNIX_EPOCH}, }; @@ -23,7 +23,6 @@ use regex::Regex; use tempfile::{NamedTempFile, TempDir}; #[cfg(not(test))] use tikv_alloc::{activate_prof, deactivate_prof, dump_prof}; -use tokio::sync::{Mutex, MutexGuard}; #[cfg(test)] pub use self::test_utils::TEST_PROFILE_MUTEX; @@ -31,13 +30,14 @@ pub use self::test_utils::TEST_PROFILE_MUTEX; use self::test_utils::{activate_prof, deactivate_prof, dump_prof}; // File name suffix for periodically dumped heap profiles. -const HEAP_PROFILE_SUFFIX: &str = ".heap"; +pub const HEAP_PROFILE_SUFFIX: &str = ".heap"; +pub const HEAP_PROFILE_REGEX: &str = r"^[0-9]{6,6}\.heap$"; lazy_static! { - // If it's locked it means there are already a heap or CPU profiling. - static ref PROFILE_MUTEX: Mutex<()> = Mutex::new(()); - // The channel is used to deactivate a profiling. 
-    static ref PROFILE_ACTIVE: StdMutex<Option<(oneshot::Sender<()>, TempDir)>> = StdMutex::new(None);
+    // If it's some it means there is already a CPU profiling in progress.
+    static ref CPU_PROFILE_ACTIVE: Mutex<Option<()>> = Mutex::new(None);
+    // If it's some it means there is already a heap profiling in progress. The channel is used to deactivate the profiling.
+    pub static ref HEAP_PROFILE_ACTIVE: Mutex<Option<Option<(oneshot::Sender<()>, TempDir)>>> = Mutex::new(None);
 
     // To normalize thread names.
     static ref THREAD_NAME_RE: Regex =
@@ -47,32 +47,26 @@ lazy_static! {
 
 type OnEndFn<I, T> = Box<dyn FnOnce(I) -> Result<T, String> + Send + 'static>;
 
-struct ProfileGuard<'a, I, T> {
-    _guard: MutexGuard<'a, ()>,
+struct ProfileRunner<I, T> {
     item: Option<I>,
     on_end: Option<OnEndFn<I, T>>,
     end: BoxFuture<'static, Result<(), String>>,
 }
 
-impl<'a, I, T> Unpin for ProfileGuard<'a, I, T> {}
+impl<I, T> Unpin for ProfileRunner<I, T> {}
 
-impl<'a, I, T> ProfileGuard<'a, I, T> {
+impl<I, T> ProfileRunner<I, T> {
     fn new<F1, F2>(
         on_start: F1,
         on_end: F2,
         end: BoxFuture<'static, Result<(), String>>,
-    ) -> Result<ProfileGuard<'a, I, T>, String>
+    ) -> Result<Self, String>
     where
         F1: FnOnce() -> Result<I, String>,
        F2: FnOnce(I) -> Result<T, String> + Send + 'static,
     {
-        let _guard = match PROFILE_MUTEX.try_lock() {
-            Ok(guard) => guard,
-            _ => return Err("Already in Profiling".to_owned()),
-        };
         let item = on_start()?;
-        Ok(ProfileGuard {
-            _guard,
+        Ok(ProfileRunner {
            item: Some(item),
             on_end: Some(Box::new(on_end) as OnEndFn<I, T>),
             end,
@@ -80,7 +74,7 @@ impl<'a, I, T> ProfileGuard<'a, I, T> {
     }
 }
 
-impl<'a, I, T> Future for ProfileGuard<'a, I, T> {
+impl<I, T> Future for ProfileRunner<I, T> {
     type Output = Result<T, String>;
     fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Self::Output> {
         match self.end.as_mut().poll(cx) {
@@ -98,34 +92,22 @@
     }
 }
 
-/// Trigger a heap profie and return the content.
-#[allow(dead_code)]
-pub async fn start_one_heap_profile<F>(end: F, use_jeprof: bool) -> Result<Vec<u8>, String>
-where
-    F: Future<Output = Result<(), String>> + Send + 'static,
-{
-    let on_start = || activate_prof().map_err(|e| format!("activate_prof: {}", e));
-
-    let on_end = move |_| {
-        deactivate_prof().map_err(|e| format!("deactivate_prof: {}", e))?;
-        let f = NamedTempFile::new().map_err(|e| format!("create tmp file fail: {}", e))?;
-        let path = f.path().to_str().unwrap();
-        dump_prof(path).map_err(|e| format!("dump_prof: {}", e))?;
-        if use_jeprof {
-            jeprof_heap_profile(path)
-        } else {
-            read_file(path)
-        }
-    };
-
-    ProfileGuard::new(on_start, on_end, end.boxed())?.await
+/// Trigger a heap profile and return the content.
+pub fn dump_one_heap_profile() -> Result<Vec<u8>, String> {
+    if HEAP_PROFILE_ACTIVE.lock().unwrap().is_none() {
+        return Err("heap profiling is not activated".to_owned());
+    }
+    let f = NamedTempFile::new().map_err(|e| format!("create tmp file fail: {}", e))?;
+    let path = f.path().to_str().unwrap();
+    dump_prof(path).map_err(|e| format!("dump_prof: {}", e))?;
+    read_file(path)
 }
 
 /// Activate heap profile and call `callback` if successful.
 /// `deactivate_heap_profile` can only be called after it's notified from
 /// `callback`.
 pub async fn activate_heap_profile<S, F>(
-    dump_period: S,
+    dump_period: Option<S>,
     store_path: PathBuf,
     callback: F,
 ) -> Result<(), String>
@@ -133,6 +115,10 @@ where
     S: Stream<Item = Result<(), String>> + Send + Unpin + 'static,
     F: FnOnce() + Send + 'static,
 {
+    if HEAP_PROFILE_ACTIVE.lock().unwrap().is_some() {
+        return Err("Already in Heap Profiling".to_owned());
+    }
+
     let (tx, rx) = oneshot::channel();
     let dir = tempfile::Builder::new()
         .prefix("heap-")
@@ -141,40 +127,57 @@ where
     let dir_path = dir.path().to_str().unwrap().to_owned();
 
     let on_start = move || {
-        let mut activate = PROFILE_ACTIVE.lock().unwrap();
+        let mut activate = HEAP_PROFILE_ACTIVE.lock().unwrap();
         assert!(activate.is_none());
+        *activate = Some(Some((tx, dir)));
         activate_prof().map_err(|e| format!("activate_prof: {}", e))?;
-        *activate = Some((tx, dir));
         callback();
         info!("periodical heap profiling is started");
         Ok(())
     };
 
     let on_end = |_| {
-        deactivate_heap_profile();
-        deactivate_prof().map_err(|e| format!("deactivate_prof: {}", e))
+        let res = deactivate_prof().map_err(|e| format!("deactivate_prof: {}", e));
+        *HEAP_PROFILE_ACTIVE.lock().unwrap() = None;
+        res
     };
 
     let end = async move {
-        select! {
-            _ = rx.fuse() => {
-                info!("periodical heap profiling is canceled");
-                Ok(())
-            },
-            res = dump_heap_profile_periodically(dump_period, dir_path).fuse() => {
-                warn!("the heap profiling dump loop shouldn't break"; "res" => ?res);
-                res
+        if let Some(dump_period) = dump_period {
+            select! {
+                _ = rx.fuse() => {
+                    info!("periodical heap profiling is canceled");
+                    Ok(())
+                },
+                res = dump_heap_profile_periodically(dump_period, dir_path).fuse() => {
+                    warn!("the heap profiling dump loop shouldn't break"; "res" => ?res);
+                    res
+                }
             }
+        } else {
+            let _ = rx.await;
+            info!("periodical heap profiling is canceled");
+            Ok(())
         }
     };
 
-    ProfileGuard::new(on_start, on_end, end.boxed())?.await
+    ProfileRunner::new(on_start, on_end, end.boxed())?.await
 }
 
 /// Deactivate heap profile. Return `false` if it hasn't been activated.
 pub fn deactivate_heap_profile() -> bool {
-    let mut activate = PROFILE_ACTIVE.lock().unwrap();
-    activate.take().is_some()
+    let mut activate = HEAP_PROFILE_ACTIVE.lock().unwrap();
+    match activate.as_mut() {
+        Some(tx) => {
+            if let Some((tx, _)) = tx.take() {
+                let _ = tx.send(());
+            } else {
+                *activate = None;
+            }
+            true
+        }
+        None => false,
+    }
 }
 
 /// Trigger one cpu profile.
@@ -186,7 +189,14 @@ pub async fn start_one_cpu_profile<F>(
 where
     F: Future<Output = Result<(), String>> + Send + 'static,
 {
+    if CPU_PROFILE_ACTIVE.lock().unwrap().is_some() {
+        return Err("Already in CPU Profiling".to_owned());
+    }
+
     let on_start = || {
+        let mut activate = CPU_PROFILE_ACTIVE.lock().unwrap();
+        assert!(activate.is_none());
+        *activate = Some(());
         let guard = pprof::ProfilerGuardBuilder::default()
             .frequency(frequency)
             .blocklist(&["libc", "libgcc", "pthread", "vdso"])
@@ -217,10 +227,13 @@ where
                 .flamegraph(&mut body)
                 .map_err(|e| format!("generate flamegraph from report fail: {}", e))?;
         }
+        drop(guard);
+        *CPU_PROFILE_ACTIVE.lock().unwrap() = None;
+
         Ok(body)
     };
 
-    ProfileGuard::new(on_start, on_end, end.boxed())?.await
+    ProfileRunner::new(on_start, on_end, end.boxed())?.await
 }
 
 pub fn read_file(path: &str) -> Result<Vec<u8>, String> {
@@ -233,9 +246,26 @@ pub fn read_file(path: &str) -> Result<Vec<u8>, String> {
 
 pub fn jeprof_heap_profile(path: &str) -> Result<Vec<u8>, String> {
     info!("using jeprof to process {}", path);
-    let output = Command::new("./jeprof")
-        .args(["--show_bytes", "./bin/tikv-server", path, "--svg"])
-        .output()
+    let bin = std::env::current_exe().map_err(|e| format!("get current exe path fail: {}", e))?;
+    let mut jeprof = Command::new("perl")
+        .args([
+            "/dev/stdin",
+            "--show_bytes",
+            &bin.as_os_str().to_string_lossy(),
+            path,
+            "--svg",
+        ])
+        .stdin(Stdio::piped())
+        .spawn()
+        .map_err(|e| format!("spawn jeprof fail: {}", e))?;
+    jeprof
+        .stdin
+        .take()
+        .unwrap()
+        .write_all(include_bytes!("jeprof.in"))
+        .unwrap();
+    let output = jeprof
+        .wait_with_output()
         .map_err(|e| format!("jeprof: {}", e))?;
     if !output.status.success() {
         let stderr = std::str::from_utf8(&output.stderr).unwrap_or("invalid utf8");
@@ -244,9 +274,17 @@ pub fn jeprof_heap_profile(path: &str) -> Result<Vec<u8>, String> {
     Ok(output.stdout)
 }
 
+pub fn heap_profiles_dir() -> Option<PathBuf> {
+    HEAP_PROFILE_ACTIVE
+        .lock()
+        .unwrap()
+        .as_ref()
+        .and_then(|v| v.as_ref().map(|(_, dir)| dir.path().to_owned()))
+}
+
 pub fn list_heap_profiles() -> Result<Vec<(String, u64)>, String> {
-    let path = match &*PROFILE_ACTIVE.lock().unwrap() {
-        Some((_, ref dir)) => dir.path().to_str().unwrap().to_owned(),
+    let path = match heap_profiles_dir() {
+        Some(path) => path.into_os_string().into_string().unwrap(),
         None => return Ok(vec![]),
     };
 
@@ -257,7 +295,7 @@ pub fn list_heap_profiles() -> Result<Vec<(String, u64)>, String> {
             Ok(x) => x,
             _ => continue,
         };
-        let f = item.path().to_str().unwrap().to_owned();
+        let f = item.file_name().to_str().unwrap().to_owned();
         if !f.ends_with(HEAP_PROFILE_SUFFIX) {
             continue;
         }
@@ -372,7 +410,7 @@ mod tests {
             .build()
             .unwrap();
 
-        let expected = "Already in Profiling";
+        let expected = "Already in CPU Profiling";
 
         let (tx1, rx1) = oneshot::channel();
         let rx1 = rx1.map_err(|_| "channel canceled".to_owned());
@@ -384,17 +422,29 @@ mod tests {
         let res2 = rt.spawn(start_one_cpu_profile(rx2, 99, false));
         assert_eq!(block_on(res2).unwrap().unwrap_err(), expected);
 
-        let (_tx2, rx2) = oneshot::channel();
-        let rx2 = rx2.map_err(|_| "channel canceled".to_owned());
-        let res2 = rt.spawn(start_one_heap_profile(rx2, false));
-        assert_eq!(block_on(res2).unwrap().unwrap_err(), expected);
+        drop(tx1);
+        block_on(res1).unwrap().unwrap_err();
+
+        let expected = "Already in Heap Profiling";
+
+        let (tx1, rx1) = mpsc::channel(1);
+        let res1 = rt.spawn(activate_heap_profile(
+            Some(rx1),
+            std::env::temp_dir(),
+            || {},
+        ));
+        thread::sleep(Duration::from_millis(100));
 
         let (_tx2, rx2) = mpsc::channel(1);
-        let res2 = rt.spawn(activate_heap_profile(rx2, std::env::temp_dir(), || {}));
@@ -372,7 +410,7 @@ mod tests { .build() .unwrap(); - let expected = "Already in Profiling"; + let expected = "Already in CPU Profiling"; let (tx1, rx1) = oneshot::channel(); let rx1 = rx1.map_err(|_| "channel canceled".to_owned()); @@ -384,17 +422,29 @@ let res2 = rt.spawn(start_one_cpu_profile(rx2, 99, false)); assert_eq!(block_on(res2).unwrap().unwrap_err(), expected); - let (_tx2, rx2) = oneshot::channel(); - let rx2 = rx2.map_err(|_| "channel canceled".to_owned()); - let res2 = rt.spawn(start_one_heap_profile(rx2, false)); - assert_eq!(block_on(res2).unwrap().unwrap_err(), expected); + drop(tx1); + block_on(res1).unwrap().unwrap_err(); + + let expected = "Already in Heap Profiling"; + + let (tx1, rx1) = mpsc::channel(1); + let res1 = rt.spawn(activate_heap_profile( + Some(rx1), + std::env::temp_dir(), + || {}, + )); + thread::sleep(Duration::from_millis(100)); let (_tx2, rx2) = mpsc::channel(1); - let res2 = rt.spawn(activate_heap_profile(rx2, std::env::temp_dir(), || {})); + let res2 = rt.spawn(activate_heap_profile( + Some(rx2), + std::env::temp_dir(), + || {}, + )); assert_eq!(block_on(res2).unwrap().unwrap_err(), expected); drop(tx1); - block_on(res1).unwrap().unwrap_err(); + block_on(res1).unwrap().unwrap(); } #[test] @@ -407,7 +457,7 @@ mod tests { // Test activated profiling can be stopped by canceling the period stream. let (tx, rx) = mpsc::channel(1); - let res = rt.spawn(activate_heap_profile(rx, std::env::temp_dir(), || {})); + let res = rt.spawn(activate_heap_profile(Some(rx), std::env::temp_dir(), || {})); drop(tx); block_on(res).unwrap().unwrap(); @@ -418,7 +468,7 @@ let (_tx, _rx) = mpsc::channel(1); let res = rt.spawn(activate_heap_profile( - _rx, + Some(_rx), std::env::temp_dir(), on_activated, )); @@ -437,7 +487,7 @@ // Test heap profiling can be stopped by sending an error. let (mut tx, rx) = mpsc::channel(1); - let res = rt.spawn(activate_heap_profile(rx, std::env::temp_dir(), || {})); + let res = rt.spawn(activate_heap_profile(Some(rx), std::env::temp_dir(), || {})); block_on(tx.send(Err("test".to_string()))).unwrap(); block_on(res).unwrap().unwrap_err(); @@ -448,7 +498,7 @@ let (_tx, _rx) = mpsc::channel(1); let res = rt.spawn(activate_heap_profile( - _rx, + Some(_rx), std::env::temp_dir(), on_activated, )); diff --git a/src/storage/OWNERS b/src/storage/OWNERS new file mode 100644 index 00000000000..0ce3dbff3a6 --- /dev/null +++ b/src/storage/OWNERS @@ -0,0 +1,7 @@ +# See the OWNERS docs at https://go.k8s.io/owners +options: + no_parent_owners: true +filters: + "(OWNERS|config\\.rs)$": + approvers: + - sig-critical-approvers-config-src diff --git a/src/storage/config.rs b/src/storage/config.rs index a40db2c424b..91c98ebf57b 100644 --- a/src/storage/config.rs +++ b/src/storage/config.rs @@ -31,6 +31,13 @@ const DEFAULT_SCHED_PENDING_WRITE_MB: u64 = 100; const DEFAULT_RESERVED_SPACE_GB: u64 = 5; const DEFAULT_RESERVED_RAFT_SPACE_GB: u64 = 1; +// In tests, we've observed 1.2M entries in the TxnStatusCache. We +// conservatively set the limit to 5M entries in total. As the TxnStatusCache +// has 128 slots by default, we round this up to 5.12M (40,000 entries per +// slot). Theoretically this consumes at most around 300MB of memory, but in +// practice it's usually much less, since the capacity is rarely used up. +const DEFAULT_TXN_STATUS_CACHE_CAPACITY: usize = 40_000 * 128; +
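The constant reads more clearly as arithmetic: 128 slots times 40,000 entries per slot. A quick sanity check of the 300MB figure in the comment, under the assumption (mine, not taken from the source) that one entry costs on the order of 64 bytes including map overhead:

```rust
const SLOTS: usize = 128;
const ENTRIES_PER_SLOT: usize = 40_000;
const DEFAULT_TXN_STATUS_CACHE_CAPACITY: usize = ENTRIES_PER_SLOT * SLOTS;

fn main() {
    assert_eq!(DEFAULT_TXN_STATUS_CACHE_CAPACITY, 5_120_000); // the "5.12M"
    // Assumed per-entry cost: two 8-byte timestamps plus per-entry map
    // bookkeeping, rounded up to 64 bytes. An estimate, not a measurement.
    let approx_bytes = DEFAULT_TXN_STATUS_CACHE_CAPACITY * 64;
    println!("upper bound ~ {} MiB", approx_bytes / (1024 * 1024)); // ~312
}
```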
// Block cache capacity used when TikvConfig isn't validated. It should only // occur in tests. const FALLBACK_BLOCK_CACHE_CAPACITY: ReadableSize = ReadableSize::mb(128); @@ -76,6 +83,8 @@ pub struct Config { pub background_error_recovery_window: ReadableDuration, /// Interval to check TTL for all SSTs, pub ttl_check_poll_interval: ReadableDuration, + #[online_config(skip)] + pub txn_status_cache_capacity: usize, #[online_config(submodule)] pub flow_control: FlowControlConfig, #[online_config(submodule)] @@ -105,6 +114,7 @@ impl Default for Config { api_version: 1, enable_ttl: false, ttl_check_poll_interval: ReadableDuration::hours(12), + txn_status_cache_capacity: DEFAULT_TXN_STATUS_CACHE_CAPACITY, flow_control: FlowControlConfig::default(), block_cache: BlockCacheConfig::default(), io_rate_limit: IoRateLimitConfig::default(), diff --git a/src/storage/errors.rs b/src/storage/errors.rs index 0e7db9ffc96..0cd14d7ff8a 100644 --- a/src/storage/errors.rs +++ b/src/storage/errors.rs @@ -12,7 +12,7 @@ use std::{ use error_code::{self, ErrorCode, ErrorCodeExt}; use kvproto::{errorpb, kvrpcpb, kvrpcpb::ApiVersion}; use thiserror::Error; -use tikv_util::deadline::DeadlineError; +use tikv_util::deadline::{set_deadline_exceeded_busy_error, DeadlineError}; use txn_types::{KvPair, TimeStamp}; use crate::storage::{ @@ -222,7 +222,6 @@ impl Display for ErrorHeaderKind { const SCHEDULER_IS_BUSY: &str = "scheduler is busy"; const GC_WORKER_IS_BUSY: &str = "gc worker is busy"; -const DEADLINE_EXCEEDED: &str = "deadline is exceeded"; /// Get the `ErrorHeaderKind` enum that corresponds to the error in the protobuf /// message. Returns `ErrorHeaderKind::Other` if no match found. @@ -287,6 +286,14 @@ pub fn extract_region_error_from_error(e: &Error) -> Option<errorpb::Error> { err.set_max_timestamp_not_synced(Default::default()); Some(err) } + Error(box ErrorInner::Txn( + e @ TxnError(box TxnErrorInner::RawKvMaxTimestampNotSynced { ..
}), + )) => { + let mut err = errorpb::Error::default(); + err.set_max_timestamp_not_synced(Default::default()); + err.set_message(format!("{}", e)); + Some(err) + } Error(box ErrorInner::Txn(TxnError(box TxnErrorInner::FlashbackNotPrepared( region_id, )))) => { @@ -319,9 +326,8 @@ pub fn extract_region_error_from_error(e: &Error) -> Option { } Error(box ErrorInner::DeadlineExceeded) => { let mut err = errorpb::Error::default(); - let mut server_is_busy_err = errorpb::ServerIsBusy::default(); - server_is_busy_err.set_reason(DEADLINE_EXCEEDED.to_owned()); - err.set_server_is_busy(server_is_busy_err); + err.set_message(e.to_string()); + set_deadline_exceeded_busy_error(&mut err); Some(err) } _ => None, diff --git a/src/storage/kv/test_engine_builder.rs b/src/storage/kv/test_engine_builder.rs index 23a0bfcd594..30b14d22274 100644 --- a/src/storage/kv/test_engine_builder.rs +++ b/src/storage/kv/test_engine_builder.rs @@ -126,7 +126,9 @@ impl TestEngineBuilder { _ => (*cf, RocksCfOptions::default()), }) .collect(); - let engine = RocksEngine::new(&path, None, cfs_opts, self.io_rate_limiter)?; + let resources = cfg_rocksdb.build_resources(Default::default(), EngineType::RaftKv); + let db_opts = cfg_rocksdb.build_opt(&resources, EngineType::RaftKv); + let engine = RocksEngine::new(&path, Some(db_opts), cfs_opts, self.io_rate_limiter)?; Ok(engine) } } diff --git a/src/storage/lock_manager/lock_waiting_queue.rs b/src/storage/lock_manager/lock_waiting_queue.rs index a81248fe9e2..dbe71f6267b 100644 --- a/src/storage/lock_manager/lock_waiting_queue.rs +++ b/src/storage/lock_manager/lock_waiting_queue.rs @@ -618,6 +618,7 @@ impl LockWaitQueues { hash: entry.lock_hash, }, lock_info: key_state.current_lock.clone(), + allow_lock_with_conflict: entry.parameters.allow_lock_with_conflict, }, }; update_wait_for_events.push(event); diff --git a/src/storage/lock_manager/mod.rs b/src/storage/lock_manager/mod.rs index 5c103f40f82..4c2b4b0ccca 100644 --- a/src/storage/lock_manager/mod.rs +++ b/src/storage/lock_manager/mod.rs @@ -97,6 +97,7 @@ pub struct KeyLockWaitInfo { pub key: Key, pub lock_digest: LockDigest, pub lock_info: LockInfo, + pub allow_lock_with_conflict: bool, } /// Uniquely identifies a lock-waiting request in a `LockManager`. diff --git a/src/storage/metrics.rs b/src/storage/metrics.rs index e9477b56b0f..cf7956d76b7 100644 --- a/src/storage/metrics.rs +++ b/src/storage/metrics.rs @@ -370,6 +370,13 @@ make_static_metric! { keys, }, } + + pub struct TxnStatusCacheSizeGauge: IntGauge { + "type" => { + used, + allocated, + } + } } lazy_static! { @@ -601,4 +608,12 @@ lazy_static! 
{ exponential_buckets(1.0, 2.0, 16).unwrap() ) .unwrap(); + + pub static ref SCHED_TXN_STATUS_CACHE_SIZE: TxnStatusCacheSizeGauge = register_static_int_gauge_vec!( + TxnStatusCacheSizeGauge, + "tikv_scheduler_txn_status_cache_size", + "Statistics of size and capacity of txn status cache (represented in count of entries)", + &["type"] + ) + .unwrap(); } diff --git a/src/storage/mod.rs b/src/storage/mod.rs index 0d4679fbe18..60202d5ff84 100644 --- a/src/storage/mod.rs +++ b/src/storage/mod.rs @@ -97,7 +97,7 @@ use tikv_util::{ deadline::Deadline, future::try_poll, quota_limiter::QuotaLimiter, - time::{duration_to_ms, Instant, ThreadReadId}, + time::{duration_to_ms, duration_to_sec, Instant, ThreadReadId}, }; use tracker::{ clear_tls_tracker_token, set_tls_tracker_token, with_tls_tracker, TrackedFuture, TrackerToken, @@ -609,6 +609,7 @@ impl Storage { r.get_resource_limiter( ctx.get_resource_control_context().get_resource_group_name(), ctx.get_request_source(), + ctx.get_resource_control_context().get_override_priority(), ) }); let priority_tag = get_priority_tag(priority); @@ -645,7 +646,7 @@ impl Storage { Self::check_api_version(api_version, ctx.api_version, CMD, [key.as_encoded()])?; - let command_duration = tikv_util::time::Instant::now(); + let command_duration = Instant::now(); // The bypass_locks and access_locks set will be checked at most once. // `TsSet::vec` is more efficient here. @@ -697,12 +698,15 @@ impl Storage { &statistics, buckets.as_ref(), ); + let now = Instant::now(); SCHED_PROCESSING_READ_HISTOGRAM_STATIC .get(CMD) - .observe(begin_instant.saturating_elapsed_secs()); - SCHED_HISTOGRAM_VEC_STATIC - .get(CMD) - .observe(command_duration.saturating_elapsed_secs()); + .observe(duration_to_sec( + now.saturating_duration_since(begin_instant), + )); + SCHED_HISTOGRAM_VEC_STATIC.get(CMD).observe(duration_to_sec( + now.saturating_duration_since(command_duration), + )); let read_bytes = key.len() + result @@ -765,7 +769,7 @@ impl Storage { ids: Vec, trackers: Vec, consumer: P, - begin_instant: tikv_util::time::Instant, + begin_instant: Instant, ) -> impl Future> { const CMD: CommandKind = CommandKind::batch_get_command; // all requests in a batch have the same region, epoch, term, replica_read @@ -779,6 +783,10 @@ impl Storage { .get_resource_control_context() .get_resource_group_name(), requests[0].get_context().get_request_source(), + requests[0] + .get_context() + .get_resource_control_context() + .get_override_priority(), ) }); let concurrency_manager = self.concurrency_manager.clone(); @@ -805,7 +813,7 @@ impl Storage { KV_COMMAND_KEYREAD_HISTOGRAM_STATIC .get(CMD) .observe(requests.len() as f64); - let command_duration = tikv_util::time::Instant::now(); + let command_duration = Instant::now(); let read_id = Some(ThreadReadId::new()); let mut statistics = Statistics::default(); let mut req_snaps = vec![]; @@ -975,6 +983,7 @@ impl Storage { r.get_resource_limiter( ctx.get_resource_control_context().get_resource_group_name(), ctx.get_request_source(), + ctx.get_resource_control_context().get_override_priority(), ) }); let priority_tag = get_priority_tag(priority); @@ -1019,7 +1028,7 @@ impl Storage { keys.iter().map(Key::as_encoded), )?; - let command_duration = tikv_util::time::Instant::now(); + let command_duration = Instant::now(); let bypass_locks = TsSet::from_u64s(ctx.take_resolved_locks()); let access_locks = TsSet::from_u64s(ctx.take_committed_locks()); @@ -1086,12 +1095,15 @@ impl Storage { (result, stats) }); metrics::tls_collect_scan_details(CMD, &stats); + let now 
= Instant::now(); SCHED_PROCESSING_READ_HISTOGRAM_STATIC .get(CMD) - .observe(begin_instant.saturating_elapsed_secs()); - SCHED_HISTOGRAM_VEC_STATIC - .get(CMD) - .observe(command_duration.saturating_elapsed_secs()); + .observe(duration_to_sec( + now.saturating_duration_since(begin_instant), + )); + SCHED_HISTOGRAM_VEC_STATIC.get(CMD).observe(duration_to_sec( + now.saturating_duration_since(command_duration), + )); let read_bytes = stats.cf_statistics(CF_DEFAULT).flow_stats.read_bytes + stats.cf_statistics(CF_LOCK).flow_stats.read_bytes @@ -1164,6 +1176,7 @@ impl Storage { r.get_resource_limiter( ctx.get_resource_control_context().get_resource_group_name(), ctx.get_request_source(), + ctx.get_resource_control_context().get_override_priority(), ) }); let priority_tag = get_priority_tag(priority); @@ -1217,7 +1230,7 @@ impl Storage { if reverse_scan { std::mem::swap(&mut start_key, &mut end_key); } - let command_duration = tikv_util::time::Instant::now(); + let command_duration = Instant::now(); let bypass_locks = TsSet::from_u64s(ctx.take_resolved_locks()); let access_locks = TsSet::from_u64s(ctx.take_committed_locks()); @@ -1296,12 +1309,15 @@ impl Storage { &statistics, buckets.as_ref(), ); + let now = Instant::now(); SCHED_PROCESSING_READ_HISTOGRAM_STATIC .get(CMD) - .observe(begin_instant.saturating_elapsed_secs()); - SCHED_HISTOGRAM_VEC_STATIC - .get(CMD) - .observe(command_duration.saturating_elapsed_secs()); + .observe(duration_to_sec( + now.saturating_duration_since(begin_instant), + )); + SCHED_HISTOGRAM_VEC_STATIC.get(CMD).observe(duration_to_sec( + now.saturating_duration_since(command_duration), + )); res.map_err(Error::from).map(|results| { KV_COMMAND_KEYREAD_HISTOGRAM_STATIC @@ -1337,6 +1353,7 @@ impl Storage { r.get_resource_limiter( ctx.get_resource_control_context().get_resource_group_name(), ctx.get_request_source(), + ctx.get_resource_control_context().get_override_priority(), ) }); let priority_tag = get_priority_tag(priority); @@ -1383,7 +1400,7 @@ impl Storage { // which resolves locks on regions, and boundary of regions will be out of range // of TiDB keys. 
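All of the histogram changes in this file follow one pattern: instead of letting each histogram take its own clock reading through `saturating_elapsed_secs()`, the command reads the clock once and derives both the processing time and the total command time from the same `now`, which keeps the two observations consistent and halves the clock reads. A sketch of the pattern, with `std::time::Instant` standing in for `tikv_util::time::Instant`:

```rust
use std::time::{Duration, Instant};

// Mirrors tikv_util::time::duration_to_sec.
fn duration_to_sec(d: Duration) -> f64 {
    d.as_secs_f64()
}

fn observe_timings(command_start: Instant, process_start: Instant) -> (f64, f64) {
    // One clock read; both metrics are derived from it.
    let now = Instant::now();
    let processing = duration_to_sec(now.saturating_duration_since(process_start));
    let total = duration_to_sec(now.saturating_duration_since(command_start));
    (processing, total)
}

fn main() {
    let command_start = Instant::now(); // request accepted
    // ... queueing, snapshot acquisition ...
    let process_start = Instant::now(); // actual read begins
    // ... read work ...
    let (processing, total) = observe_timings(command_start, process_start);
    assert!(total >= processing); // guaranteed, since both share one `now`
    println!("processing {processing:.6}s of {total:.6}s total");
}
```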
- let command_duration = tikv_util::time::Instant::now(); + let command_duration = Instant::now(); concurrency_manager.update_max_ts(max_ts); let begin_instant = Instant::now(); @@ -1455,12 +1472,15 @@ impl Storage { &statistics, buckets.as_ref(), ); + let now = Instant::now(); SCHED_PROCESSING_READ_HISTOGRAM_STATIC .get(CMD) - .observe(begin_instant.saturating_elapsed_secs()); - SCHED_HISTOGRAM_VEC_STATIC - .get(CMD) - .observe(command_duration.saturating_elapsed_secs()); + .observe(duration_to_sec( + now.saturating_duration_since(begin_instant), + )); + SCHED_HISTOGRAM_VEC_STATIC.get(CMD).observe(duration_to_sec( + now.saturating_duration_since(command_duration), + )); Ok(locks) }) @@ -1650,6 +1670,7 @@ impl Storage { r.get_resource_limiter( ctx.get_resource_control_context().get_resource_group_name(), ctx.get_request_source(), + ctx.get_resource_control_context().get_override_priority(), ) }); let priority_tag = get_priority_tag(priority); @@ -1669,7 +1690,7 @@ impl Storage { Self::check_api_version(api_version, ctx.api_version, CMD, [&key])?; - let command_duration = tikv_util::time::Instant::now(); + let command_duration = Instant::now(); let snap_ctx = SnapContext { pb_ctx: &ctx, ..Default::default() @@ -1704,12 +1725,15 @@ impl Storage { &stats, buckets.as_ref(), ); + let now = Instant::now(); SCHED_PROCESSING_READ_HISTOGRAM_STATIC .get(CMD) - .observe(begin_instant.saturating_elapsed_secs()); - SCHED_HISTOGRAM_VEC_STATIC - .get(CMD) - .observe(command_duration.saturating_elapsed_secs()); + .observe(duration_to_sec( + now.saturating_duration_since(begin_instant), + )); + SCHED_HISTOGRAM_VEC_STATIC.get(CMD).observe(duration_to_sec( + now.saturating_duration_since(command_duration), + )); r } } @@ -1739,6 +1763,10 @@ impl Storage { .get_resource_control_context() .get_resource_group_name(), gets[0].get_context().get_request_source(), + gets[0] + .get_context() + .get_resource_control_context() + .get_override_priority(), ) }); let priority_tag = get_priority_tag(priority); @@ -1776,7 +1804,7 @@ impl Storage { .map_err(Error::from)?; } - let command_duration = tikv_util::time::Instant::now(); + let command_duration = Instant::now(); let read_id = Some(ThreadReadId::new()); let mut snaps = vec![]; for (mut req, id) in gets.into_iter().zip(ids) { @@ -1845,12 +1873,15 @@ impl Storage { } } + let now = Instant::now(); SCHED_PROCESSING_READ_HISTOGRAM_STATIC .get(CMD) - .observe(begin_instant.saturating_elapsed_secs()); - SCHED_HISTOGRAM_VEC_STATIC - .get(CMD) - .observe(command_duration.saturating_elapsed_secs()); + .observe(duration_to_sec( + now.saturating_duration_since(begin_instant), + )); + SCHED_HISTOGRAM_VEC_STATIC.get(CMD).observe(duration_to_sec( + now.saturating_duration_since(command_duration), + )); Ok(()) } .in_resource_metering_tag(resource_tag), @@ -1875,6 +1906,7 @@ impl Storage { r.get_resource_limiter( ctx.get_resource_control_context().get_resource_group_name(), ctx.get_request_source(), + ctx.get_resource_control_context().get_override_priority(), ) }); let priority_tag = get_priority_tag(priority); @@ -1896,7 +1928,7 @@ impl Storage { Self::check_api_version(api_version, ctx.api_version, CMD, &keys)?; - let command_duration = tikv_util::time::Instant::now(); + let command_duration = Instant::now(); let snap_ctx = SnapContext { pb_ctx: &ctx, ..Default::default() @@ -1947,12 +1979,15 @@ impl Storage { KV_COMMAND_KEYREAD_HISTOGRAM_STATIC .get(CMD) .observe(stats.data.flow_stats.read_keys as f64); + let now = Instant::now(); SCHED_PROCESSING_READ_HISTOGRAM_STATIC .get(CMD) 
- .observe(begin_instant.saturating_elapsed_secs()); - SCHED_HISTOGRAM_VEC_STATIC - .get(CMD) - .observe(command_duration.saturating_elapsed_secs()); + .observe(duration_to_sec( + now.saturating_duration_since(begin_instant), + )); + SCHED_HISTOGRAM_VEC_STATIC.get(CMD).observe(duration_to_sec( + now.saturating_duration_since(command_duration), + )); Ok(result) } } @@ -1975,9 +2010,8 @@ impl Storage { SCHED_STAGE_COUNTER_VEC.get(tag).snapshot_ok.inc(); if !snapshot.ext().is_max_ts_synced() { return Err(Error::from(txn::Error::from( - TxnError::MaxTimestampNotSynced { + TxnError::RawKvMaxTimestampNotSynced { region_id: ctx.get_region_id(), - start_ts: TimeStamp::zero(), }, ))); } @@ -2028,7 +2062,7 @@ impl Storage { if let Err(e) = deadline.check() { return callback(Err(Error::from(e))); } - let command_duration = tikv_util::time::Instant::now(); + let command_duration = Instant::now(); if let Err(e) = Self::check_causal_ts_flushed(&mut ctx, CMD).await { return callback(Err(e)); @@ -2140,7 +2174,7 @@ impl Storage { if let Err(e) = deadline.check() { return callback(Err(Error::from(e))); } - let command_duration = tikv_util::time::Instant::now(); + let command_duration = Instant::now(); if let Err(e) = Self::check_causal_ts_flushed(&mut ctx, CMD).await { return callback(Err(e)); @@ -2205,7 +2239,7 @@ impl Storage { if let Err(e) = deadline.check() { return callback(Err(Error::from(e))); } - let command_duration = tikv_util::time::Instant::now(); + let command_duration = Instant::now(); if let Err(e) = Self::check_causal_ts_flushed(&mut ctx, CMD).await { return callback(Err(e)); @@ -2266,7 +2300,7 @@ impl Storage { if let Err(e) = deadline.check() { return callback(Err(Error::from(e))); } - let command_duration = tikv_util::time::Instant::now(); + let command_duration = Instant::now(); let start_key = F::encode_raw_key_owned(start_key, None); let end_key = F::encode_raw_key_owned(end_key, None); @@ -2314,7 +2348,7 @@ impl Storage { if let Err(e) = deadline.check() { return callback(Err(Error::from(e))); } - let command_duration = tikv_util::time::Instant::now(); + let command_duration = Instant::now(); if let Err(e) = Self::check_causal_ts_flushed(&mut ctx, CMD).await { return callback(Err(e)); @@ -2378,6 +2412,7 @@ impl Storage { r.get_resource_limiter( ctx.get_resource_control_context().get_resource_group_name(), ctx.get_request_source(), + ctx.get_resource_control_context().get_override_priority(), ) }); let priority_tag = get_priority_tag(priority); @@ -2400,7 +2435,7 @@ impl Storage { [(Some(&start_key), end_key.as_ref())], )?; - let command_duration = tikv_util::time::Instant::now(); + let command_duration = Instant::now(); let snap_ctx = SnapContext { pb_ctx: &ctx, ..Default::default() @@ -2477,12 +2512,15 @@ impl Storage { .get(CMD) .observe(statistics.data.flow_stats.read_keys as f64); metrics::tls_collect_scan_details(CMD, &statistics); + let now = Instant::now(); SCHED_PROCESSING_READ_HISTOGRAM_STATIC .get(CMD) - .observe(begin_instant.saturating_elapsed_secs()); - SCHED_HISTOGRAM_VEC_STATIC - .get(CMD) - .observe(command_duration.saturating_elapsed_secs()); + .observe(duration_to_sec( + now.saturating_duration_since(begin_instant), + )); + SCHED_HISTOGRAM_VEC_STATIC.get(CMD).observe(duration_to_sec( + now.saturating_duration_since(command_duration), + )); result } @@ -2512,6 +2550,7 @@ impl Storage { r.get_resource_limiter( ctx.get_resource_control_context().get_resource_group_name(), ctx.get_request_source(), + ctx.get_resource_control_context().get_override_priority(), ) }); let 
priority_tag = get_priority_tag(priority); @@ -2542,7 +2581,7 @@ impl Storage { .map(|range| (Some(range.get_start_key()), Some(range.get_end_key()))), )?; - let command_duration = tikv_util::time::Instant::now(); + let command_duration = Instant::now(); let snap_ctx = SnapContext { pb_ctx: &ctx, ..Default::default() @@ -2640,12 +2679,15 @@ impl Storage { .get(CMD) .observe(statistics.data.flow_stats.read_keys as f64); metrics::tls_collect_scan_details(CMD, &statistics); + let now = Instant::now(); SCHED_PROCESSING_READ_HISTOGRAM_STATIC .get(CMD) - .observe(begin_instant.saturating_elapsed_secs()); - SCHED_HISTOGRAM_VEC_STATIC - .get(CMD) - .observe(command_duration.saturating_elapsed_secs()); + .observe(duration_to_sec( + now.saturating_duration_since(begin_instant), + )); + SCHED_HISTOGRAM_VEC_STATIC.get(CMD).observe(duration_to_sec( + now.saturating_duration_since(command_duration), + )); Ok(result) } } @@ -2671,6 +2713,7 @@ impl Storage { r.get_resource_limiter( ctx.get_resource_control_context().get_resource_group_name(), ctx.get_request_source(), + ctx.get_resource_control_context().get_override_priority(), ) }); let priority_tag = get_priority_tag(priority); @@ -2690,7 +2733,7 @@ impl Storage { Self::check_api_version(api_version, ctx.api_version, CMD, [&key])?; - let command_duration = tikv_util::time::Instant::now(); + let command_duration = Instant::now(); let snap_ctx = SnapContext { pb_ctx: &ctx, ..Default::default() @@ -2725,12 +2768,15 @@ impl Storage { &stats, buckets.as_ref(), ); + let now = Instant::now(); SCHED_PROCESSING_READ_HISTOGRAM_STATIC .get(CMD) - .observe(begin_instant.saturating_elapsed_secs()); - SCHED_HISTOGRAM_VEC_STATIC - .get(CMD) - .observe(command_duration.saturating_elapsed_secs()); + .observe(duration_to_sec( + now.saturating_duration_since(begin_instant), + )); + SCHED_HISTOGRAM_VEC_STATIC.get(CMD).observe(duration_to_sec( + now.saturating_duration_since(command_duration), + )); r } } @@ -2849,6 +2895,7 @@ impl Storage { r.get_resource_limiter( ctx.get_resource_control_context().get_resource_group_name(), ctx.get_request_source(), + ctx.get_resource_control_context().get_override_priority(), ) }); let priority_tag = get_priority_tag(priority); @@ -2887,7 +2934,7 @@ impl Storage { range.set_end_key(end_key.into_encoded()); } - let command_duration = tikv_util::time::Instant::now(); + let command_duration = Instant::now(); let snap_ctx = SnapContext { pb_ctx: &ctx, ..Default::default() @@ -2898,7 +2945,7 @@ impl Storage { let store = RawStore::new(snapshot, api_version); let cf = Self::rawkv_cf("", api_version)?; - let begin_instant = tikv_util::time::Instant::now(); + let begin_instant = Instant::now(); let mut stats = Vec::with_capacity(ranges.len()); let ret = store .raw_checksum_ranges(cf, &ranges, &mut stats) @@ -2913,12 +2960,15 @@ impl Storage { buckets.as_ref(), ); }); + let now = Instant::now(); SCHED_PROCESSING_READ_HISTOGRAM_STATIC .get(CMD) - .observe(begin_instant.saturating_elapsed().as_secs_f64()); - SCHED_HISTOGRAM_VEC_STATIC - .get(CMD) - .observe(command_duration.saturating_elapsed().as_secs_f64()); + .observe(duration_to_sec( + now.saturating_duration_since(begin_instant), + )); + SCHED_HISTOGRAM_VEC_STATIC.get(CMD).observe(duration_to_sec( + now.saturating_duration_since(command_duration), + )); ret } @@ -3293,7 +3343,8 @@ impl TestStorageBuilder { } else { None }; - + let manager = Arc::new(ResourceGroupManager::default()); + let resource_ctl = manager.derive_controller("test".into(), false); Storage::from_engine( self.engine, 
&self.config, @@ -3311,11 +3362,8 @@ impl TestStorageBuilder { Arc::new(QuotaLimiter::default()), latest_feature_gate(), ts_provider, - Some(Arc::new(ResourceController::new_for_test( - "test".to_owned(), - false, - ))), - None, + Some(resource_ctl), + Some(manager), ) } @@ -3328,7 +3376,8 @@ impl TestStorageBuilder { &crate::config::StorageReadPoolConfig::default_for_test(), engine.clone(), ); - + let manager = Arc::new(ResourceGroupManager::default()); + let resource_ctl = manager.derive_controller("test".into(), false); Storage::from_engine( engine, &self.config, @@ -3346,16 +3395,14 @@ impl TestStorageBuilder { Arc::new(QuotaLimiter::default()), latest_feature_gate(), None, - Some(Arc::new(ResourceController::new_for_test( - "test".to_owned(), - false, - ))), - None, + Some(resource_ctl), + Some(manager), ) } pub fn build_for_resource_controller( self, + resource_manager: Arc, resource_controller: Arc, ) -> Result, L, F>> { let engine = TxnTestEngine { @@ -3385,7 +3432,7 @@ impl TestStorageBuilder { latest_feature_gate(), None, Some(resource_controller), - None, + Some(resource_manager), ) } } @@ -3695,7 +3742,7 @@ pub mod test_util { &self, id: u64, res: Result<(Option>, Statistics)>, - _: tikv_util::time::Instant, + _: Instant, _source: String, ) { self.data.lock().unwrap().push(GetResult { @@ -3706,13 +3753,7 @@ pub mod test_util { } impl ResponseBatchConsumer>> for GetConsumer { - fn consume( - &self, - id: u64, - res: Result>>, - _: tikv_util::time::Instant, - _source: String, - ) { + fn consume(&self, id: u64, res: Result>>, _: Instant, _source: String) { self.data.lock().unwrap().push(GetResult { id, res }); } } @@ -3826,6 +3867,7 @@ mod tests { commands, commands::{AcquirePessimisticLock, Prewrite}, tests::must_rollback, + txn_status_cache::TxnStatusCache, Error as TxnError, ErrorInner as TxnErrorInner, }, types::{PessimisticLockKeyResult, PessimisticLockResults}, @@ -3857,6 +3899,7 @@ mod tests { statistics: &mut Statistics::default(), async_apply_prewrite: false, raw_ext: None, + txn_status_cache: &TxnStatusCache::new_for_test(), }, ) .unwrap(); @@ -10842,4 +10885,507 @@ mod tests { // Prewrite still succeeds rx.recv().unwrap().unwrap(); } + + #[test] + fn test_prewrite_cached_committed_transaction_do_not_skip_constraint_check() { + let storage = TestStorageBuilderApiV1::new(MockLockManager::new()) + .build() + .unwrap(); + let cm = storage.concurrency_manager.clone(); + let k1 = Key::from_raw(b"k1"); + let pk = b"pk"; + // Simulate the case that the current TiKV instance have a non-unique + // index key of a pessimistic transaction. It won't be pessimistic + // locked, and prewrite skips constraint checks. + // Simulate the case that a prewrite is performed twice, with async + // commit enabled, and max_ts changes when the second request arrives. + + // A retrying prewrite request arrives. + cm.update_max_ts(20.into()); + let mut ctx = Context::default(); + ctx.set_is_retry_request(true); + let (tx, rx) = channel(); + storage + .sched_txn_command( + commands::PrewritePessimistic::new( + vec![( + Mutation::make_put(k1.clone(), b"v".to_vec()), + SkipPessimisticCheck, + )], + pk.to_vec(), + 10.into(), + 3000, + 10.into(), + 1, + 11.into(), + 0.into(), + Some(vec![]), + false, + AssertionLevel::Off, + vec![], + ctx, + ), + Box::new(move |res| { + tx.send(res).unwrap(); + }), + ) + .unwrap(); + + let res = rx.recv().unwrap().unwrap(); + assert_eq!(res.min_commit_ts, 21.into()); + + // Commit it. 
+ let (tx, rx) = channel(); + storage + .sched_txn_command( + commands::Commit::new(vec![k1.clone()], 10.into(), 21.into(), Context::default()), + expect_ok_callback(tx, 0), + ) + .unwrap(); + rx.recv().unwrap(); + + // The txn's status is cached + assert_eq!( + storage + .sched + .get_txn_status_cache() + .get_no_promote(10.into()) + .unwrap(), + 21.into() + ); + + // Check committed; push max_ts to 30 + assert_eq!( + block_on(storage.get(Context::default(), k1.clone(), 30.into())) + .unwrap() + .0, + Some(b"v".to_vec()) + ); + + let (tx, rx) = channel(); + storage + .sched_txn_command( + commands::PrewritePessimistic::new( + vec![( + Mutation::make_put(k1.clone(), b"v".to_vec()), + SkipPessimisticCheck, + )], + pk.to_vec(), + 10.into(), + 3000, + 10.into(), + 1, + 11.into(), + 0.into(), + Some(vec![]), + false, + AssertionLevel::Off, + vec![], + Context::default(), + ), + Box::new(move |res| { + tx.send(res).unwrap(); + }), + ) + .unwrap(); + let res = rx.recv().unwrap().unwrap(); + assert_eq!(res.min_commit_ts, 21.into()); + + // Key must not be locked. + assert_eq!( + block_on(storage.get(Context::default(), k1, 50.into())) + .unwrap() + .0, + Some(b"v".to_vec()) + ); + } + + #[test] + fn test_updating_txn_status_cache() { + let storage = TestStorageBuilderApiV1::new(MockLockManager::new()) + .build() + .unwrap(); + let cm = storage.concurrency_manager.clone(); + + // Commit + let (tx, rx) = channel(); + storage + .sched_txn_command( + commands::PrewritePessimistic::new( + vec![( + Mutation::make_put(Key::from_raw(b"k1"), b"v1".to_vec()), + SkipPessimisticCheck, + )], + b"k1".to_vec(), + 10.into(), + 3000, + 10.into(), + 1, + 11.into(), + 0.into(), + Some(vec![]), + false, + AssertionLevel::Off, + vec![], + Context::default(), + ), + expect_ok_callback(tx.clone(), 0), + ) + .unwrap(); + rx.recv().unwrap(); + assert!( + storage + .sched + .get_txn_status_cache() + .get_no_promote(10.into()) + .is_none() + ); + + storage + .sched_txn_command( + commands::Commit::new( + vec![Key::from_raw(b"k1")], + 10.into(), + 20.into(), + Context::default(), + ), + expect_ok_callback(tx.clone(), 0), + ) + .unwrap(); + rx.recv().unwrap(); + assert_eq!( + storage + .sched + .get_txn_status_cache() + .get_no_promote(10.into()) + .unwrap(), + 20.into() + ); + + // Unsuccessful commit won't update cache + storage + .sched_txn_command( + commands::Commit::new( + vec![Key::from_raw(b"k2")], + 30.into(), + 40.into(), + Context::default(), + ), + expect_fail_callback(tx, 0, |_| ()), + ) + .unwrap(); + rx.recv().unwrap(); + assert!( + storage + .sched + .get_txn_status_cache() + .get_no_promote(30.into()) + .is_none() + ); + + // 1PC update + let (tx, rx) = channel(); + cm.update_max_ts(59.into()); + storage + .sched_txn_command( + Prewrite::new( + vec![Mutation::make_put(Key::from_raw(b"k3"), b"v3".to_vec())], + b"k3".to_vec(), + 50.into(), + 3000, + false, + 1, + 51.into(), + 0.into(), + Some(vec![]), + true, + AssertionLevel::Off, + Context::default(), + ), + Box::new(move |res| { + tx.send(res).unwrap(); + }), + ) + .unwrap(); + let res = rx.recv().unwrap().unwrap(); + assert_eq!(res.one_pc_commit_ts, 60.into()); + assert_eq!( + storage + .sched + .get_txn_status_cache() + .get_no_promote(50.into()) + .unwrap(), + 60.into() + ); + + // Resolve lock commit + let (tx, rx) = channel(); + storage + .sched_txn_command( + Prewrite::new( + vec![Mutation::make_put(Key::from_raw(b"k4"), b"v4".to_vec())], + b"pk".to_vec(), + 70.into(), + 3000, + false, + 1, + 0.into(), + 0.into(), + None, + false, + 
AssertionLevel::Off, + Context::default(), + ), + expect_ok_callback(tx.clone(), 0), + ) + .unwrap(); + rx.recv().unwrap(); + + storage + .sched_txn_command( + commands::ResolveLockReadPhase::new( + vec![(TimeStamp::from(70), TimeStamp::from(80))] + .into_iter() + .collect(), + None, + Context::default(), + ), + expect_ok_callback(tx.clone(), 0), + ) + .unwrap(); + rx.recv().unwrap(); + assert_eq!( + storage + .sched + .get_txn_status_cache() + .get_no_promote(70.into()) + .unwrap(), + 80.into() + ); + + // Resolve lock lite + storage + .sched_txn_command( + Prewrite::new( + vec![Mutation::make_put(Key::from_raw(b"k5"), b"v5".to_vec())], + b"pk".to_vec(), + 90.into(), + 3000, + false, + 1, + 0.into(), + 0.into(), + None, + false, + AssertionLevel::Off, + Context::default(), + ), + expect_ok_callback(tx.clone(), 0), + ) + .unwrap(); + rx.recv().unwrap(); + + storage + .sched_txn_command( + commands::ResolveLockLite::new( + 90.into(), + 100.into(), + vec![Key::from_raw(b"k5")], + Context::default(), + ), + expect_ok_callback(tx.clone(), 0), + ) + .unwrap(); + rx.recv().unwrap(); + assert_eq!( + storage + .sched + .get_txn_status_cache() + .get_no_promote(90.into()) + .unwrap(), + 100.into() + ); + + // CheckTxnStatus: uncommitted transaction + storage + .sched_txn_command( + commands::CheckTxnStatus::new( + Key::from_raw(b"k1"), + 9.into(), + 110.into(), + 110.into(), + true, + false, + false, + false, + Context::default(), + ), + expect_ok_callback(tx.clone(), 0), + ) + .unwrap(); + rx.recv().unwrap(); + assert!( + storage + .sched + .get_txn_status_cache() + .get_no_promote(9.into()) + .is_none() + ); + + // CheckTxnStatus: committed transaction + storage.sched.get_txn_status_cache().remove(10.into()); + storage + .sched_txn_command( + commands::CheckTxnStatus::new( + Key::from_raw(b"k1"), + 10.into(), + 110.into(), + 110.into(), + true, + false, + false, + false, + Context::default(), + ), + expect_ok_callback(tx.clone(), 0), + ) + .unwrap(); + rx.recv().unwrap(); + assert_eq!( + storage + .sched + .get_txn_status_cache() + .get_no_promote(10.into()) + .unwrap(), + 20.into() + ); + + // CheckSecondaryLocks: uncommitted transaction + storage + .sched_txn_command( + Prewrite::new( + vec![Mutation::make_put(Key::from_raw(b"k6"), b"v6".to_vec())], + b"pk".to_vec(), + 120.into(), + 3000, + false, + 1, + 0.into(), + 0.into(), + Some(vec![]), + false, + AssertionLevel::Off, + Context::default(), + ), + expect_ok_callback(tx.clone(), 0), + ) + .unwrap(); + rx.recv().unwrap(); + + // Lock exists but the transaction status is still unknown + storage + .sched_txn_command( + commands::CheckSecondaryLocks::new( + vec![Key::from_raw(b"k6")], + 120.into(), + Context::default(), + ), + expect_ok_callback(tx.clone(), 0), + ) + .unwrap(); + rx.recv().unwrap(); + assert!( + storage + .sched + .get_txn_status_cache() + .get_no_promote(120.into()) + .is_none() + ); + + // One of the lock doesn't exist so the transaction becomes rolled-back status. 
+ storage + .sched_txn_command( + commands::CheckSecondaryLocks::new( + vec![Key::from_raw(b"k6"), Key::from_raw(b"k7")], + 120.into(), + Context::default(), + ), + expect_ok_callback(tx.clone(), 0), + ) + .unwrap(); + rx.recv().unwrap(); + assert!( + storage + .sched + .get_txn_status_cache() + .get_no_promote(120.into()) + .is_none() + ); + + // CheckSecondaryLocks: committed transaction + storage + .sched_txn_command( + Prewrite::new( + vec![ + Mutation::make_put(Key::from_raw(b"k8"), b"v8".to_vec()), + Mutation::make_put(Key::from_raw(b"k9"), b"v9".to_vec()), + ], + b"pk".to_vec(), + 130.into(), + 3000, + false, + 1, + 0.into(), + 0.into(), + Some(vec![]), + false, + AssertionLevel::Off, + Context::default(), + ), + expect_ok_callback(tx.clone(), 0), + ) + .unwrap(); + rx.recv().unwrap(); + // Commit one of the keys + storage + .sched_txn_command( + commands::Commit::new( + vec![Key::from_raw(b"k9")], + 130.into(), + 140.into(), + Context::default(), + ), + expect_ok_callback(tx.clone(), 0), + ) + .unwrap(); + rx.recv().unwrap(); + assert_eq!( + storage + .sched + .get_txn_status_cache() + .remove(130.into()) + .unwrap(), + 140.into() + ); + + storage + .sched_txn_command( + commands::CheckSecondaryLocks::new( + vec![Key::from_raw(b"k8"), Key::from_raw(b"k9")], + 130.into(), + Context::default(), + ), + expect_ok_callback(tx, 0), + ) + .unwrap(); + rx.recv().unwrap(); + assert_eq!( + storage + .sched + .get_txn_status_cache() + .get_no_promote(130.into()) + .unwrap(), + 140.into() + ); + } } diff --git a/src/storage/mvcc/metrics.rs b/src/storage/mvcc/metrics.rs index 3c4bda63f7e..22d2760a769 100644 --- a/src/storage/mvcc/metrics.rs +++ b/src/storage/mvcc/metrics.rs @@ -51,6 +51,13 @@ make_static_metric! { pub struct MvccPrewriteAssertionPerfCounterVec: IntCounter { "type" => MvccPrewriteAssertionPerfKind, } + + pub struct MvccPrewriteRequestAfterCommitCounterVec: IntCounter { + "type" => { + non_retry_req, + retry_req, + }, + } } lazy_static! { @@ -104,4 +111,13 @@ lazy_static! { ) .unwrap() }; + pub static ref MVCC_PREWRITE_REQUEST_AFTER_COMMIT_COUNTER_VEC: MvccPrewriteRequestAfterCommitCounterVec = { + register_static_int_counter_vec!( + MvccPrewriteRequestAfterCommitCounterVec, + "tikv_storage_mvcc_prewrite_request_after_commit_counter", + "Counter of prewrite requests of already-committed transactions, detected by checking the TxnStatusCache", + &["type"] + ) + .unwrap() + }; } diff --git a/src/storage/mvcc/txn.rs b/src/storage/mvcc/txn.rs index a446ef64d22..08f72bf777a 100644 --- a/src/storage/mvcc/txn.rs +++ b/src/storage/mvcc/txn.rs @@ -543,8 +543,10 @@ pub(crate) mod tests { // Rollback lock must_rollback(&mut engine, k, 15, false); - // Rollbacks of optimistic transactions needn't be protected - must_get_rollback_protected(&mut engine, k, 15, false); + // Rollbacks of optimistic transactions need to be protected + // TODO: Re-check how the test can be better written after refinement of + // `must_rollback`'s semantics. + must_get_rollback_protected(&mut engine, k, 15, true); } #[test] @@ -896,16 +898,20 @@ pub(crate) mod tests { #[test] fn test_collapse_prev_rollback() { let mut engine = TestEngineBuilder::new().build().unwrap(); - let (key, value) = (b"key", b"value"); + let (key, pk, value) = (b"key", b"pk", b"value"); + + // Work around the problem that `must_rollback` always protects the primary + // lock by setting a different PK. + // TODO: Cover primary when working on https://github.com/tikv/tikv/issues/16625 // Add a Rollback whose start ts is 1.
- must_prewrite_put(&mut engine, key, value, key, 1); + must_prewrite_put(&mut engine, key, value, pk, 1); must_rollback(&mut engine, key, 1, false); must_get_rollback_ts(&mut engine, key, 1); // Add a Rollback whose start ts is 2, the previous Rollback whose // start ts is 1 will be collapsed. - must_prewrite_put(&mut engine, key, value, key, 2); + must_prewrite_put(&mut engine, key, value, pk, 2); must_rollback(&mut engine, key, 2, false); must_get_none(&mut engine, key, 2); must_get_rollback_ts(&mut engine, key, 2); diff --git a/src/storage/txn/actions/check_txn_status.rs b/src/storage/txn/actions/check_txn_status.rs index 6e786aec5fa..8ec8f438f21 100644 --- a/src/storage/txn/actions/check_txn_status.rs +++ b/src/storage/txn/actions/check_txn_status.rs @@ -322,8 +322,8 @@ pub fn rollback_lock( txn.delete_value(key.clone(), lock.ts); } - // Only the primary key of a pessimistic transaction needs to be protected. - let protected: bool = is_pessimistic_txn && key.is_encoded_from(&lock.primary); + // The primary key of a transaction needs to be protected. + let protected: bool = key.is_encoded_from(&lock.primary); if let Some(write) = make_rollback(reader.start_ts, protected, overlapped_write) { txn.put_write(key.clone(), reader.start_ts, write.as_ref().to_bytes()); } diff --git a/src/storage/txn/actions/cleanup.rs b/src/storage/txn/actions/cleanup.rs index 5ed77d4fab3..d28368aa1bf 100644 --- a/src/storage/txn/actions/cleanup.rs +++ b/src/storage/txn/actions/cleanup.rs @@ -223,8 +223,9 @@ pub mod tests { // TTL expired. The lock should be removed. must_succeed(&mut engine, k, ts(10, 0), ts(120, 0)); must_unlocked(&mut engine, k); - // Rollbacks of optimistic transactions needn't be protected - must_get_rollback_protected(&mut engine, k, ts(10, 0), false); + // Rollbacks of optimistic transactions need to be protected + // See: https://github.com/tikv/tikv/issues/16620 + must_get_rollback_protected(&mut engine, k, ts(10, 0), true); must_get_rollback_ts(&mut engine, k, ts(10, 0)); // Rollbacks of primary keys in pessimistic transactions should be protected diff --git a/src/storage/txn/actions/prewrite.rs b/src/storage/txn/actions/prewrite.rs index 90f739b8705..64e22a13585 100644 --- a/src/storage/txn/actions/prewrite.rs +++ b/src/storage/txn/actions/prewrite.rs @@ -239,6 +239,7 @@ impl LockStatus { } /// A single mutation to be prewritten. 
+#[derive(Debug)] struct PrewriteMutation<'a> { key: Key, value: Option, @@ -677,6 +678,12 @@ impl<'a> PrewriteMutation<'a> { if self.skip_constraint_check() { self.check_for_newer_version(reader)?; } + let (write, commit_ts) = write + .as_ref() + .map(|(w, ts)| (Some(w), Some(ts))) + .unwrap_or((None, None)); + error!("assertion failure"; "assertion" => ?self.assertion, "write" => ?write, + "commit_ts" => commit_ts, "mutation" => ?self); assertion_err?; } diff --git a/src/storage/txn/commands/acquire_pessimistic_lock.rs b/src/storage/txn/commands/acquire_pessimistic_lock.rs index 58c33706bbc..ceb7957c926 100644 --- a/src/storage/txn/commands/acquire_pessimistic_lock.rs +++ b/src/storage/txn/commands/acquire_pessimistic_lock.rs @@ -183,6 +183,7 @@ impl WriteCommand for AcquirePessimisticLock new_acquired_locks, lock_guards: vec![], response_policy: ResponsePolicy::OnProposed, + known_txn_status: vec![], }) } } diff --git a/src/storage/txn/commands/acquire_pessimistic_lock_resumed.rs b/src/storage/txn/commands/acquire_pessimistic_lock_resumed.rs index 7640edd7c0c..a1e2e6fc119 100644 --- a/src/storage/txn/commands/acquire_pessimistic_lock_resumed.rs +++ b/src/storage/txn/commands/acquire_pessimistic_lock_resumed.rs @@ -194,6 +194,7 @@ impl WriteCommand for AcquirePessimisticLockR new_acquired_locks, lock_guards: vec![], response_policy: ResponsePolicy::OnProposed, + known_txn_status: vec![], }) } } @@ -239,6 +240,7 @@ mod tests { txn::{ commands::pessimistic_rollback::tests::must_success as must_pessimistic_rollback, tests::{must_commit, must_pessimistic_locked, must_prewrite_put, must_rollback}, + txn_status_cache::TxnStatusCache, }, TestEngineBuilder, }; @@ -275,6 +277,7 @@ mod tests { statistics: &mut Default::default(), async_apply_prewrite: false, raw_ext: None, + txn_status_cache: &TxnStatusCache::new_for_test(), }, ) .unwrap(); diff --git a/src/storage/txn/commands/atomic_store.rs b/src/storage/txn/commands/atomic_store.rs index 9a54895e7e2..4bca5d514c5 100644 --- a/src/storage/txn/commands/atomic_store.rs +++ b/src/storage/txn/commands/atomic_store.rs @@ -63,6 +63,7 @@ impl WriteCommand for RawAtomicStore { new_acquired_locks: vec![], lock_guards: raw_ext.into_iter().map(|r| r.key_guard).collect(), response_policy: ResponsePolicy::OnApplied, + known_txn_status: vec![], }) } } @@ -77,7 +78,9 @@ mod tests { use super::*; use crate::storage::{ - lock_manager::MockLockManager, txn::scheduler::get_raw_ext, Statistics, TestEngineBuilder, + lock_manager::MockLockManager, + txn::{scheduler::get_raw_ext, txn_status_cache::TxnStatusCache}, + Statistics, TestEngineBuilder, }; #[test] @@ -116,6 +119,7 @@ mod tests { statistics: &mut statistic, async_apply_prewrite: false, raw_ext, + txn_status_cache: &TxnStatusCache::new_for_test(), }; let cmd: Command = cmd.into(); let write_result = cmd.process_write(snap, context).unwrap(); diff --git a/src/storage/txn/commands/check_secondary_locks.rs b/src/storage/txn/commands/check_secondary_locks.rs index 92985c4d90d..ceb169f79b2 100644 --- a/src/storage/txn/commands/check_secondary_locks.rs +++ b/src/storage/txn/commands/check_secondary_locks.rs @@ -201,6 +201,12 @@ impl WriteCommand for CheckSecondaryLocks { } } + let write_result_known_txn_status = + if let SecondaryLocksStatus::Committed(commit_ts) = &result { + vec![(self.start_ts, *commit_ts)] + } else { + vec![] + }; let mut rows = 0; if let SecondaryLocksStatus::RolledBack = &result { // One row is mutated only when a secondary lock is rolled back. 
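`known_txn_status` threads freshly learned `(start_ts, commit_ts)` pairs from each successful write back to the scheduler, which uses them to populate the transaction status cache. The cache implementation itself is not part of this diff; the sketch below is a deliberately simplified model of the idea (a sharded start_ts to commit_ts map), not TiKV's actual `TxnStatusCache`:

```rust
use std::{
    collections::{hash_map::DefaultHasher, HashMap},
    hash::{Hash, Hasher},
    sync::Mutex,
};

/// Simplified model: independently locked slots, each mapping a
/// transaction's start_ts to its commit_ts. Sharding keeps lock
/// contention low when many scheduler threads report statuses.
struct TxnStatusCacheModel {
    slots: Vec<Mutex<HashMap<u64, u64>>>,
}

impl TxnStatusCacheModel {
    fn new(n: usize) -> Self {
        Self {
            slots: (0..n).map(|_| Mutex::new(HashMap::new())).collect(),
        }
    }

    fn slot(&self, start_ts: u64) -> &Mutex<HashMap<u64, u64>> {
        let mut h = DefaultHasher::new();
        start_ts.hash(&mut h);
        &self.slots[(h.finish() as usize) % self.slots.len()]
    }

    /// What the scheduler would do with WriteResult::known_txn_status.
    fn insert_all(&self, known_txn_status: &[(u64, u64)]) {
        for &(start_ts, commit_ts) in known_txn_status {
            self.slot(start_ts).lock().unwrap().insert(start_ts, commit_ts);
        }
    }

    /// Read-only lookup, akin to `get_no_promote` in the tests above.
    fn get(&self, start_ts: u64) -> Option<u64> {
        self.slot(start_ts).lock().unwrap().get(&start_ts).copied()
    }
}

fn main() {
    let cache = TxnStatusCacheModel::new(128);
    cache.insert_all(&[(10, 20), (50, 60)]); // e.g. from a Commit command
    assert_eq!(cache.get(10), Some(20));
    assert_eq!(cache.get(30), None); // unknown or uncommitted
}
```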
@@ -220,6 +226,7 @@ impl WriteCommand for CheckSecondaryLocks { new_acquired_locks, lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, + known_txn_status: write_result_known_txn_status, }) } } @@ -235,7 +242,10 @@ pub mod tests { kv::TestEngineBuilder, lock_manager::MockLockManager, mvcc::tests::*, - txn::{commands::WriteCommand, scheduler::DEFAULT_EXECUTION_DURATION_LIMIT, tests::*}, + txn::{ + commands::WriteCommand, scheduler::DEFAULT_EXECUTION_DURATION_LIMIT, tests::*, + txn_status_cache::TxnStatusCache, + }, Engine, }; @@ -265,6 +275,7 @@ pub mod tests { statistics: &mut Default::default(), async_apply_prewrite: false, raw_ext: None, + txn_status_cache: &TxnStatusCache::new_for_test(), }, ) .unwrap(); @@ -303,6 +314,7 @@ pub mod tests { statistics: &mut Default::default(), async_apply_prewrite: false, raw_ext: None, + txn_status_cache: &TxnStatusCache::new_for_test(), }, ) .unwrap(); diff --git a/src/storage/txn/commands/check_txn_status.rs b/src/storage/txn/commands/check_txn_status.rs index dc99ebf3b01..ce5774c0db6 100644 --- a/src/storage/txn/commands/check_txn_status.rs +++ b/src/storage/txn/commands/check_txn_status.rs @@ -131,6 +131,12 @@ impl WriteCommand for CheckTxnStatus { let mut released_locks = ReleasedLocks::new(); released_locks.push(released); + let write_result_known_txn_status = if let TxnStatus::Committed { commit_ts } = &txn_status + { + vec![(self.lock_ts, *commit_ts)] + } else { + vec![] + }; let pr = ProcessResult::TxnStatus { txn_status }; let new_acquired_locks = txn.take_new_locks(); let mut write_data = WriteData::from_modifies(txn.into_modifies()); @@ -145,6 +151,7 @@ impl WriteCommand for CheckTxnStatus { new_acquired_locks, lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, + known_txn_status: write_result_known_txn_status, }) } } @@ -152,7 +159,9 @@ impl WriteCommand for CheckTxnStatus { #[cfg(test)] pub mod tests { use concurrency_manager::ConcurrencyManager; - use kvproto::kvrpcpb::{self, Context, LockInfo, PrewriteRequestPessimisticAction::*}; + use kvproto::kvrpcpb::{ + self, Context, LockInfo, PrewriteRequestPessimisticAction::*, WriteConflictReason, + }; use tikv_util::deadline::Deadline; use txn_types::{Key, LastChange, WriteType}; @@ -161,13 +170,14 @@ pub mod tests { kv::Engine, lock_manager::MockLockManager, mvcc, - mvcc::tests::*, + mvcc::{tests::*, ErrorInner}, txn::{ self, actions::acquire_pessimistic_lock::tests::acquire_pessimistic_lock_allow_lock_with_conflict, commands::{pessimistic_rollback, WriteCommand, WriteContext}, scheduler::DEFAULT_EXECUTION_DURATION_LIMIT, tests::*, + txn_status_cache::TxnStatusCache, }, types::TxnStatus, ProcessResult, TestEngineBuilder, @@ -211,11 +221,17 @@ pub mod tests { statistics: &mut Default::default(), async_apply_prewrite: false, raw_ext: None, + txn_status_cache: &TxnStatusCache::new_for_test(), }, ) .unwrap(); if let ProcessResult::TxnStatus { txn_status } = result.pr { - assert!(status_pred(txn_status)); + let formatted_txn_status = format!("{:?}", txn_status); + assert!( + status_pred(txn_status), + "txn_status returned by check_txn_status ({}) doesn't pass the check", + formatted_txn_status + ); } else { unreachable!(); } @@ -259,6 +275,7 @@ pub mod tests { statistics: &mut Default::default(), async_apply_prewrite: false, raw_ext: None, + txn_status_cache: &TxnStatusCache::new_for_test(), }, ) .map(|r| { @@ -404,7 +421,7 @@ pub mod tests { |s| s == TtlExpire, ); must_unlocked(&mut engine, b"k1"); - must_get_rollback_protected(&mut engine, b"k1", 1, false); + 
must_get_rollback_protected(&mut engine, b"k1", 1, true); // case 2: primary is prewritten (pessimistic) must_acquire_pessimistic_lock(&mut engine, b"k2", b"k2", 15, 15); @@ -819,6 +836,7 @@ pub mod tests { ts(20, 0), WriteType::Rollback, ); + must_get_rollback_protected(&mut engine, k, ts(20, 0), true); // Push the min_commit_ts of pessimistic locks. must_acquire_pessimistic_lock_for_large_txn(&mut engine, k, k, ts(4, 0), ts(130, 0), 200); @@ -1427,4 +1445,61 @@ pub mod tests { ) .unwrap_err(); } + + #[test] + fn test_check_txn_status_rollback_optimistic() { + let mut engine = TestEngineBuilder::new().build().unwrap(); + let k = b"k1"; + let (v1, v2) = (b"v1", b"v2"); + + let ts = TimeStamp::compose; + + must_prewrite_put_async_commit(&mut engine, k, v1, k, &Some(vec![]), ts(1, 0), ts(1, 1)); + must_commit(&mut engine, k, ts(1, 0), ts(2, 0)); + + must_prewrite_put(&mut engine, k, v2, k, ts(2, 0)); + assert!(!must_have_write(&mut engine, k, ts(2, 0)).has_overlapped_rollback); + + must_success( + &mut engine, + k, + ts(2, 0), + ts(3, 0), + ts(3, 0), + true, + false, + false, + |s| s == TtlExpire, + ); + must_get_overlapped_rollback( + &mut engine, + k, + ts(2, 0), + ts(1, 0), + WriteType::Put, + Some(0.into()), + ); + + let e = must_prewrite_put_err(&mut engine, k, v2, k, ts(2, 0)); + match &*e.0 { + ErrorInner::WriteConflict { + start_ts, + conflict_start_ts, + conflict_commit_ts, + key, + primary, + reason, + } => { + assert_eq!(*start_ts, ts(2, 0)); + assert_eq!(*conflict_start_ts, ts(1, 0)); + assert_eq!(*conflict_commit_ts, ts(2, 0)); + assert_eq!(key.as_slice(), k); + assert_eq!(primary.as_slice(), k); + assert_eq!(*reason, WriteConflictReason::SelfRolledBack); + } + e => { + panic!("unexpected error: {:?}", e); + } + } + } } diff --git a/src/storage/txn/commands/cleanup.rs b/src/storage/txn/commands/cleanup.rs index 302c4fe1308..886094a7f34 100644 --- a/src/storage/txn/commands/cleanup.rs +++ b/src/storage/txn/commands/cleanup.rs @@ -80,6 +80,7 @@ impl WriteCommand for Cleanup { new_acquired_locks, lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, + known_txn_status: vec![], }) } } diff --git a/src/storage/txn/commands/commit.rs b/src/storage/txn/commands/commit.rs index 4f05df8fe83..8daff9b2aee 100644 --- a/src/storage/txn/commands/commit.rs +++ b/src/storage/txn/commands/commit.rs @@ -80,6 +80,7 @@ impl WriteCommand for Commit { new_acquired_locks, lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, + known_txn_status: vec![(self.lock_ts, self.commit_ts)], }) } } diff --git a/src/storage/txn/commands/compare_and_swap.rs b/src/storage/txn/commands/compare_and_swap.rs index ca9213b57d3..3725de47273 100644 --- a/src/storage/txn/commands/compare_and_swap.rs +++ b/src/storage/txn/commands/compare_and_swap.rs @@ -117,6 +117,7 @@ impl WriteCommand for RawCompareAndSwap { new_acquired_locks: vec![], lock_guards, response_policy: ResponsePolicy::OnApplied, + known_txn_status: vec![], }) } } @@ -134,8 +135,9 @@ mod tests { use super::*; use crate::storage::{ - lock_manager::MockLockManager, txn::scheduler::get_raw_ext, Engine, Statistics, - TestEngineBuilder, + lock_manager::MockLockManager, + txn::{scheduler::get_raw_ext, txn_status_cache::TxnStatusCache}, + Engine, Statistics, TestEngineBuilder, }; #[test] @@ -215,6 +217,7 @@ mod tests { statistics: &mut statistic, async_apply_prewrite: false, raw_ext, + txn_status_cache: &TxnStatusCache::new_for_test(), }; let ret = cmd.cmd.process_write(snap, context)?; match ret.pr { @@ -269,6 +272,7 @@ mod tests { 
statistics: &mut statistic, async_apply_prewrite: false, raw_ext, + txn_status_cache: &TxnStatusCache::new_for_test(), }; let cmd: Command = cmd.into(); let write_result = cmd.process_write(snap, context).unwrap(); diff --git a/src/storage/txn/commands/flashback_to_version.rs b/src/storage/txn/commands/flashback_to_version.rs index 37d288fa266..efbeefa2494 100644 --- a/src/storage/txn/commands/flashback_to_version.rs +++ b/src/storage/txn/commands/flashback_to_version.rs @@ -185,6 +185,7 @@ impl WriteCommand for FlashbackToVersion { new_acquired_locks: vec![], lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, + known_txn_status: vec![], }) } } diff --git a/src/storage/txn/commands/mod.rs b/src/storage/txn/commands/mod.rs index 5896d6562f1..dabef707e61 100644 --- a/src/storage/txn/commands/mod.rs +++ b/src/storage/txn/commands/mod.rs @@ -70,7 +70,7 @@ use crate::storage::{ }, metrics, mvcc::{Lock as MvccLock, MvccReader, ReleasedLock, SnapshotReader}, - txn::{latch, ProcessResult, Result}, + txn::{latch, txn_status_cache::TxnStatusCache, ProcessResult, Result}, types::{ MvccInfo, PessimisticLockParameters, PessimisticLockResults, PrewriteResult, SecondaryLocksStatus, StorageCallbackType, TxnStatus, @@ -422,6 +422,12 @@ pub struct WriteResult { pub new_acquired_locks: Vec, pub lock_guards: Vec, pub response_policy: ResponsePolicy, + /// The txn status that can be inferred by the successful writing. This will + /// be used to update the cache. + /// + /// Currently only commit_ts of committed transactions will be collected. + /// Rolled-back transactions may also be collected in the future. + pub known_txn_status: Vec<(TimeStamp, TimeStamp)>, } pub struct WriteResultLockInfo { @@ -573,6 +579,7 @@ pub struct WriteContext<'a, L: LockManager> { pub statistics: &'a mut Statistics, pub async_apply_prewrite: bool, pub raw_ext: Option, // use for apiv2 + pub txn_status_cache: &'a TxnStatusCache, } pub struct ReaderWithStats<'a, S: Snapshot> { @@ -823,6 +830,7 @@ pub mod test_util { statistics, async_apply_prewrite: false, raw_ext: None, + txn_status_cache: &TxnStatusCache::new_for_test(), }; let ret = cmd.cmd.process_write(snap, context)?; let res = match ret.pr { @@ -983,6 +991,7 @@ pub mod test_util { statistics, async_apply_prewrite: false, raw_ext: None, + txn_status_cache: &TxnStatusCache::new_for_test(), }; let ret = cmd.cmd.process_write(snap, context)?; @@ -1008,6 +1017,7 @@ pub mod test_util { statistics, async_apply_prewrite: false, raw_ext: None, + txn_status_cache: &TxnStatusCache::new_for_test(), }; let ret = cmd.cmd.process_write(snap, context)?; diff --git a/src/storage/txn/commands/pause.rs b/src/storage/txn/commands/pause.rs index 5d3aa7f6d2f..1f5d40b2d4e 100644 --- a/src/storage/txn/commands/pause.rs +++ b/src/storage/txn/commands/pause.rs @@ -53,6 +53,7 @@ impl WriteCommand for Pause { new_acquired_locks: vec![], lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, + known_txn_status: vec![], }) } } diff --git a/src/storage/txn/commands/pessimistic_rollback.rs b/src/storage/txn/commands/pessimistic_rollback.rs index 4e0bf8c8c56..531eb256c40 100644 --- a/src/storage/txn/commands/pessimistic_rollback.rs +++ b/src/storage/txn/commands/pessimistic_rollback.rs @@ -96,6 +96,7 @@ impl WriteCommand for PessimisticRollback { new_acquired_locks, lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, + known_txn_status: vec![], }) } } @@ -116,6 +117,7 @@ pub mod tests { commands::{WriteCommand, WriteContext}, scheduler::DEFAULT_EXECUTION_DURATION_LIMIT, 
tests::*, + txn_status_cache::TxnStatusCache, }, TestEngineBuilder, }; @@ -146,6 +148,7 @@ pub mod tests { statistics: &mut Default::default(), async_apply_prewrite: false, raw_ext: None, + txn_status_cache: &TxnStatusCache::new_for_test(), }; let result = command.process_write(snapshot, write_context).unwrap(); write(engine, &ctx, result.to_be_write.modifies); diff --git a/src/storage/txn/commands/prewrite.rs b/src/storage/txn/commands/prewrite.rs index 10446db6292..34c98dab156 100644 --- a/src/storage/txn/commands/prewrite.rs +++ b/src/storage/txn/commands/prewrite.rs @@ -24,7 +24,7 @@ use crate::storage::{ kv::WriteData, lock_manager::LockManager, mvcc::{ - has_data_in_range, Error as MvccError, ErrorInner as MvccErrorInner, MvccTxn, + has_data_in_range, metrics::*, Error as MvccError, ErrorInner as MvccErrorInner, MvccTxn, Result as MvccResult, SnapshotReader, TxnCommitRecord, }, txn::{ @@ -489,6 +489,36 @@ impl Prewriter { snapshot: impl Snapshot, mut context: WriteContext<'_, impl LockManager>, ) -> Result<WriteResult> { + // Handle special cases of retried prewrite requests for pessimistic + // transactions. + if let TransactionKind::Pessimistic(_) = self.kind.txn_kind() { + if let Some(commit_ts) = context.txn_status_cache.get_no_promote(self.start_ts) { + fail_point!("before_prewrite_txn_status_cache_hit"); + if self.ctx.is_retry_request { + MVCC_PREWRITE_REQUEST_AFTER_COMMIT_COUNTER_VEC + .retry_req + .inc(); + } else { + MVCC_PREWRITE_REQUEST_AFTER_COMMIT_COUNTER_VEC + .non_retry_req + .inc(); + } + warn!("prewrite request received for a transaction that is known to be already committed"; "start_ts" => %self.start_ts, "commit_ts" => %commit_ts); + // In normal cases, if the transaction is committed, the key should + // already have been prewritten successfully. But to simplify the code + // and to prevent possible corner cases or special cases in the future, + // we disallow skipping the constraint check in this case. + // We treat this request as a retried request whether or not it really + // is one (the original request may arrive later than the retried + // request due to network latency, in which case it's better to handle + // it like a retry anyway). + self.ctx.is_retry_request = true; + } else { + fail_point!("before_prewrite_txn_status_cache_miss"); + } + } + self.kind .can_skip_constraint_check(&mut self.mutations, &snapshot, &mut context)?; self.check_max_ts_synced(&snapshot)?; @@ -748,6 +778,11 @@ impl Prewriter { new_acquired_locks, lock_guards, response_policy: ResponsePolicy::OnApplied, + known_txn_status: if !one_pc_commit_ts.is_zero() { + vec![(self.start_ts, one_pc_commit_ts)] + } else { + vec![] + }, } } else { // Skip write stage if some keys are locked.
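The control flow added above is worth restating: on a pessimistic prewrite, a cache hit for `start_ts` means the transaction already committed, so the incoming request must be a duplicate or a late-arriving original. The code counts it in the new metric, logs it, and forces the retry path so that constraint checks cannot be skipped. A condensed, self-contained rendering of that decision; the `Option` argument stands in for the `context.txn_status_cache.get_no_promote(start_ts)` lookup:

```rust
#[derive(Default)]
struct Ctx {
    is_retry_request: bool,
}

/// Returns true when the transaction is already known to be committed.
fn handle_possible_committed_retry(
    ctx: &mut Ctx,
    is_pessimistic: bool,
    cached_commit_ts: Option<u64>, // fake of the TxnStatusCache lookup
) -> bool {
    if !is_pessimistic {
        return false; // the check only applies to pessimistic transactions
    }
    let Some(_commit_ts) = cached_commit_ts else {
        return false; // cache miss: the normal prewrite path
    };
    // Already committed: whether or not the request is marked as a retry,
    // treat it as one, so later prewrite stages refuse to skip the
    // constraint check.
    ctx.is_retry_request = true;
    true
}

fn main() {
    let mut ctx = Ctx::default();
    // Cache hit: even an "original" request is downgraded to a retry.
    assert!(handle_possible_committed_retry(&mut ctx, true, Some(21)));
    assert!(ctx.is_retry_request);

    let mut ctx = Ctx::default();
    // Cache miss: nothing changes.
    assert!(!handle_possible_committed_retry(&mut ctx, true, None));
    assert!(!ctx.is_retry_request);
}
```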
@@ -768,6 +803,7 @@ impl Prewriter { new_acquired_locks: vec![], lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, + known_txn_status: vec![], } }; @@ -1002,6 +1038,7 @@ mod tests { must_acquire_pessimistic_lock, must_acquire_pessimistic_lock_err, must_commit, must_prewrite_put_err_impl, must_prewrite_put_impl, must_rollback, }, + txn_status_cache::TxnStatusCache, Error, ErrorInner, }, types::TxnStatus, @@ -1647,6 +1684,7 @@ mod tests { statistics: &mut Statistics::default(), async_apply_prewrite: false, raw_ext: None, + txn_status_cache: &TxnStatusCache::new_for_test(), } }; } @@ -1818,6 +1856,7 @@ mod tests { statistics: &mut statistics, async_apply_prewrite: case.async_apply_prewrite, raw_ext: None, + txn_status_cache: &TxnStatusCache::new_for_test(), }; let mut engine = TestEngineBuilder::new().build().unwrap(); let snap = engine.snapshot(Default::default()).unwrap(); @@ -1932,6 +1971,7 @@ mod tests { statistics: &mut statistics, async_apply_prewrite: false, raw_ext: None, + txn_status_cache: &TxnStatusCache::new_for_test(), }; let snap = engine.snapshot(Default::default()).unwrap(); let result = cmd.cmd.process_write(snap, context).unwrap(); @@ -1960,6 +2000,7 @@ mod tests { statistics: &mut statistics, async_apply_prewrite: false, raw_ext: None, + txn_status_cache: &TxnStatusCache::new_for_test(), }; let snap = engine.snapshot(Default::default()).unwrap(); let result = cmd.cmd.process_write(snap, context).unwrap(); @@ -2043,6 +2084,7 @@ mod tests { statistics: &mut statistics, async_apply_prewrite: false, raw_ext: None, + txn_status_cache: &TxnStatusCache::new_for_test(), }; let snap = engine.snapshot(Default::default()).unwrap(); let result = cmd.cmd.process_write(snap, context).unwrap(); @@ -2075,6 +2117,7 @@ mod tests { statistics: &mut statistics, async_apply_prewrite: false, raw_ext: None, + txn_status_cache: &TxnStatusCache::new_for_test(), }; let snap = engine.snapshot(Default::default()).unwrap(); let result = cmd.cmd.process_write(snap, context).unwrap(); @@ -2345,6 +2388,7 @@ mod tests { statistics: &mut statistics, async_apply_prewrite: false, raw_ext: None, + txn_status_cache: &TxnStatusCache::new_for_test(), }; let snap = engine.snapshot(Default::default()).unwrap(); assert!(prewrite_cmd.cmd.process_write(snap, context).is_err()); @@ -2369,6 +2413,7 @@ mod tests { statistics: &mut statistics, async_apply_prewrite: false, raw_ext: None, + txn_status_cache: &TxnStatusCache::new_for_test(), }; let snap = engine.snapshot(Default::default()).unwrap(); assert!(prewrite_cmd.cmd.process_write(snap, context).is_err()); @@ -2575,6 +2620,7 @@ mod tests { statistics: &mut statistics, async_apply_prewrite: false, raw_ext: None, + txn_status_cache: &TxnStatusCache::new_for_test(), }; let snap = engine.snapshot(Default::default()).unwrap(); let res = prewrite_cmd.cmd.process_write(snap, context).unwrap(); diff --git a/src/storage/txn/commands/resolve_lock.rs b/src/storage/txn/commands/resolve_lock.rs index f3d141807e8..cd01fc60475 100644 --- a/src/storage/txn/commands/resolve_lock.rs +++ b/src/storage/txn/commands/resolve_lock.rs @@ -83,6 +83,7 @@ impl WriteCommand for ResolveLock { let mut scan_key = self.scan_key.take(); let rows = key_locks.len(); let mut released_locks = ReleasedLocks::new(); + let mut known_txn_status = vec![]; for (current_key, current_lock) in key_locks { txn.start_ts = current_lock.ts; reader.start_ts = current_lock.ts; @@ -103,7 +104,10 @@ impl WriteCommand for ResolveLock { // type. 
They could be left if the transaction is finally committed and // pessimistic conflict retry happens during execution. match commit(&mut txn, &mut reader, current_key.clone(), commit_ts) { - Ok(res) => res, + Ok(res) => { + known_txn_status.push((current_lock.ts, commit_ts)); + res + } Err(MvccError(box MvccErrorInner::TxnLockNotFound { .. })) if current_lock.is_pessimistic_lock() => { @@ -125,6 +129,9 @@ impl WriteCommand for ResolveLock { } } + known_txn_status.sort(); + known_txn_status.dedup(); + let pr = if scan_key.is_none() { ProcessResult::Res } else { @@ -151,6 +158,7 @@ impl WriteCommand for ResolveLock { new_acquired_locks, lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, + known_txn_status, }) } } diff --git a/src/storage/txn/commands/resolve_lock_lite.rs b/src/storage/txn/commands/resolve_lock_lite.rs index 63fe201596d..318e5d57313 100644 --- a/src/storage/txn/commands/resolve_lock_lite.rs +++ b/src/storage/txn/commands/resolve_lock_lite.rs @@ -63,6 +63,11 @@ impl WriteCommand for ResolveLockLite { }); } + let known_txn_status = if !self.commit_ts.is_zero() { + vec![(self.start_ts, self.commit_ts)] + } else { + vec![] + }; let new_acquired_locks = txn.take_new_locks(); let mut write_data = WriteData::from_modifies(txn.into_modifies()); write_data.set_allowed_on_disk_almost_full(); @@ -76,6 +81,7 @@ impl WriteCommand for ResolveLockLite { new_acquired_locks, lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, + known_txn_status, }) } } diff --git a/src/storage/txn/commands/rollback.rs b/src/storage/txn/commands/rollback.rs index f3b674f4916..df60767e716 100644 --- a/src/storage/txn/commands/rollback.rs +++ b/src/storage/txn/commands/rollback.rs @@ -71,6 +71,7 @@ impl WriteCommand for Rollback { new_acquired_locks, lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, + known_txn_status: vec![], }) } } diff --git a/src/storage/txn/commands/txn_heart_beat.rs b/src/storage/txn/commands/txn_heart_beat.rs index 448395fc436..c900464099a 100644 --- a/src/storage/txn/commands/txn_heart_beat.rs +++ b/src/storage/txn/commands/txn_heart_beat.rs @@ -96,6 +96,7 @@ impl WriteCommand for TxnHeartBeat { new_acquired_locks, lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, + known_txn_status: vec![], }) } } @@ -111,7 +112,10 @@ pub mod tests { kv::TestEngineBuilder, lock_manager::MockLockManager, mvcc::tests::*, - txn::{commands::WriteCommand, scheduler::DEFAULT_EXECUTION_DURATION_LIMIT, tests::*}, + txn::{ + commands::WriteCommand, scheduler::DEFAULT_EXECUTION_DURATION_LIMIT, tests::*, + txn_status_cache::TxnStatusCache, + }, Engine, }; @@ -143,6 +147,7 @@ pub mod tests { statistics: &mut Default::default(), async_apply_prewrite: false, raw_ext: None, + txn_status_cache: &TxnStatusCache::new_for_test(), }, ) .unwrap(); @@ -185,6 +190,7 @@ pub mod tests { statistics: &mut Default::default(), async_apply_prewrite: false, raw_ext: None, + txn_status_cache: &TxnStatusCache::new_for_test(), }, ) .is_err() diff --git a/src/storage/txn/flow_controller/singleton_flow_controller.rs b/src/storage/txn/flow_controller/singleton_flow_controller.rs index 5d52c272db6..c5ac88c9f22 100644 --- a/src/storage/txn/flow_controller/singleton_flow_controller.rs +++ b/src/storage/txn/flow_controller/singleton_flow_controller.rs @@ -598,14 +598,24 @@ impl FlowChecker { if !enabled { return; } + if self.wait_for_destroy_range_finish { + // Concurrent unsafe destroy range, ignore the second one + info!("concurrent unsafe destroy range, ignore"); + return; + } 
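+ // Note (added for clarity; inferred from the code below): pending
+ // compaction bytes here are tracked and compared in log2 space. `soft`
+ // is the log2 of the soft limit, so with, e.g., a 192 GiB limit,
+ // `soft` = log2(192 << 30) ≈ 37.6, and `v <= soft` compares log-scaled
+ // long-term averages rather than raw byte counts.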
self.wait_for_destroy_range_finish = true; let soft = (self.soft_pending_compaction_bytes_limit as f64).log2(); - for cf_checker in self.cf_checkers.values_mut() { + for (cf, cf_checker) in &mut self.cf_checkers { if let Some(long_term_pending_bytes) = cf_checker.long_term_pending_bytes.as_ref() { let v = long_term_pending_bytes.get_avg(); if v <= soft { + info!( + "before unsafe destroy range"; + "cf" => cf, + "pending_bytes" => v + ); cf_checker.pending_bytes_before_unsafe_destroy_range = Some(v); } } @@ -629,9 +639,13 @@ impl FlowChecker { SCHED_THROTTLE_ACTION_COUNTER .with_label_values(&[cf, "pending_bytes_jump"]) .inc(); - } else { - cf_checker.pending_bytes_before_unsafe_destroy_range = None; } + info!( + "after unsafe destroy range"; + "cf" => cf, + "before" => before, + "after" => after + ); } } } @@ -778,7 +792,17 @@ impl FlowChecker { let pending_compaction_bytes = long_term_pending_bytes.get_avg(); let ignore = if let Some(before) = checker.pending_bytes_before_unsafe_destroy_range { + // It assumes that the long term average will eventually come down below the + // soft limit. If the general traffic flow increases during destroy, the long + // term average may never come down and the flow control will be turned off for + // a long time, which would be a rather rare case, so just ignore it. if pending_compaction_bytes <= before && !self.wait_for_destroy_range_finish { + info!( + "pending compaction bytes is back to normal"; + "cf" => &cf, + "pending_compaction_bytes" => pending_compaction_bytes, + "before" => before + ); checker.pending_bytes_before_unsafe_destroy_range = None; } true @@ -1299,6 +1323,14 @@ pub(super) mod tests { stub.0 .pending_compaction_bytes .store(10000000 * 1024 * 1024 * 1024, Ordering::Relaxed); + send_flow_info(tx, region_id); + assert!(flow_controller.discard_ratio(region_id) < f64::EPSILON); + + // after unsafe destroy range, pending compaction bytes may jump back to a lower + // value + stub.0 + .pending_compaction_bytes + .store(100 * 1024 * 1024 * 1024, Ordering::Relaxed); tx.send(FlowInfo::Compaction("default".to_string(), region_id)) .unwrap(); tx.send(FlowInfo::AfterUnsafeDestroyRange(region_id)) @@ -1311,13 +1343,23 @@ pub(super) mod tests { flow_controller.discard_ratio(region_id) ); - // unfreeze the control + // the long term average pending compaction bytes is still high, shouldn't + // unfreeze the jump control + stub.0 + .pending_compaction_bytes + .store(100 * 1024 * 1024 * 1024, Ordering::Relaxed); + send_flow_info(tx, region_id); + assert!(flow_controller.discard_ratio(region_id) < f64::EPSILON); + + // the long term average pending compaction bytes falls below the threshold, + // should unfreeze the jump control stub.0 .pending_compaction_bytes .store(1024 * 1024, Ordering::Relaxed); send_flow_info(tx, region_id); assert!(flow_controller.discard_ratio(region_id) < f64::EPSILON); + // exceeds the threshold, should perform throttle stub.0 .pending_compaction_bytes .store(1000000000 * 1024 * 1024 * 1024, Ordering::Relaxed); diff --git a/src/storage/txn/mod.rs b/src/storage/txn/mod.rs index 640c534fc86..ca11fd6db30 100644 --- a/src/storage/txn/mod.rs +++ b/src/storage/txn/mod.rs @@ -6,6 +6,7 @@ pub mod commands; pub mod flow_controller; pub mod sched_pool; pub mod scheduler; +pub mod txn_status_cache; mod actions; mod latch; @@ -142,6 +143,9 @@ pub enum ErrorInner { )] MaxTimestampNotSynced { region_id: u64, start_ts: TimeStamp }, + #[error("RawKV write fails due to potentially stale max timestamp, region_id: {region_id}")] + 
RawKvMaxTimestampNotSynced { region_id: u64 }, + #[error("region {0} not prepared the flashback")] FlashbackNotPrepared(u64), } @@ -177,6 +181,9 @@ impl ErrorInner { region_id, start_ts, }), + ErrorInner::RawKvMaxTimestampNotSynced { region_id } => { + Some(ErrorInner::RawKvMaxTimestampNotSynced { region_id }) + } ErrorInner::FlashbackNotPrepared(region_id) => { Some(ErrorInner::FlashbackNotPrepared(region_id)) } @@ -230,6 +237,9 @@ impl ErrorCodeExt for Error { ErrorInner::MaxTimestampNotSynced { .. } => { error_code::storage::MAX_TIMESTAMP_NOT_SYNCED } + ErrorInner::RawKvMaxTimestampNotSynced { .. } => { + error_code::storage::MAX_TIMESTAMP_NOT_SYNCED + } ErrorInner::FlashbackNotPrepared(_) => error_code::storage::FLASHBACK_NOT_PREPARED, } } diff --git a/src/storage/txn/sched_pool.rs b/src/storage/txn/sched_pool.rs index 19736304373..70d54c24cfa 100644 --- a/src/storage/txn/sched_pool.rs +++ b/src/storage/txn/sched_pool.rs @@ -12,7 +12,10 @@ use kvproto::{kvrpcpb::CommandPri, pdpb::QueryKind}; use pd_client::{Feature, FeatureGate}; use prometheus::local::*; use raftstore::store::WriteStats; -use resource_control::{ControlledFuture, ResourceController, TaskMetadata}; +use resource_control::{ + priority_from_task_meta, with_resource_limiter, ControlledFuture, ResourceController, + ResourceGroupManager, TaskMetadata, +}; use tikv_util::{ sys::SysQuota, yatp_pool::{Full, FuturePool, PoolTicker, YatpPoolBuilder}, @@ -101,6 +104,7 @@ impl VanillaQueue { struct PriorityQueue { worker_pool: FuturePool, resource_ctl: Arc<ResourceController>, + resource_mgr: Arc<ResourceGroupManager>, } impl PriorityQueue { @@ -118,15 +122,17 @@ impl PriorityQueue { // TODO: maybe use a better way to generate task_id let task_id = rand::random::<u64>(); let group_name = metadata.group_name().to_owned(); + let resource_limiter = self.resource_mgr.get_resource_limiter( + unsafe { std::str::from_utf8_unchecked(&group_name) }, + "", + metadata.override_priority() as u64, + ); let mut extras = Extras::new_multilevel(task_id, fixed_level); extras.set_metadata(metadata.to_vec()); self.worker_pool.spawn_with_extras( - ControlledFuture::new( - async move { - f.await; - }, - self.resource_ctl.clone(), - group_name, + with_resource_limiter( + ControlledFuture::new(f, self.resource_ctl.clone(), group_name), + resource_limiter, ), extras, ) @@ -155,6 +161,7 @@ impl SchedPool { reporter: R, feature_gate: FeatureGate, resource_ctl: Option<Arc<ResourceController>>, + resource_mgr: Option<Arc<ResourceGroupManager>>, ) -> Self { let builder = |pool_size: usize, name_prefix: &str| { let engine = Arc::new(Mutex::new(engine.clone())); @@ -181,6 +188,8 @@ impl SchedPool { destroy_tls_engine::<E>(); tls_flush(&reporter); }) + .enable_task_wait_metrics() + .metric_idx_from_task_meta(Arc::new(priority_from_task_meta)) }; let vanilla = VanillaQueue { worker_pool: builder(pool_size, "sched-worker-pool").build_future_pool(), @@ -191,6 +200,7 @@ impl SchedPool { worker_pool: builder(pool_size, "sched-worker-priority") .build_priority_future_pool(r.clone()), resource_ctl: r.clone(), + resource_mgr: resource_mgr.unwrap(), }); let queue_type = if resource_ctl.is_some() { QueueType::Dynamic
TxnSchedulerInner { quota_limiter: Arc<QuotaLimiter>, resource_manager: Option<Arc<ResourceGroupManager>>, feature_gate: FeatureGate, + + txn_status_cache: TxnStatusCache, } #[inline] @@ -469,6 +472,7 @@ impl TxnScheduler { reporter, feature_gate.clone(), resource_ctl, + resource_manager.clone(), ), control_mutex: Arc::new(tokio::sync::Mutex::new(false)), lock_mgr, @@ -484,6 +488,7 @@ impl TxnScheduler { quota_limiter, resource_manager, feature_gate, + txn_status_cache: TxnStatusCache::new(config.txn_status_cache_capacity), }); slow_log!( @@ -815,6 +820,7 @@ impl TxnScheduler { pipelined: bool, async_apply_prewrite: bool, new_acquired_locks: Vec<LockInfo>, + known_txn_status: Vec<(TimeStamp, TimeStamp)>, tag: CommandKind, metadata: TaskMetadata<'_>, sched_details: &SchedulerDetails, @@ -837,6 +843,17 @@ impl TxnScheduler { debug!("write command finished"; "cid" => cid, "pipelined" => pipelined, "async_apply_prewrite" => async_apply_prewrite); drop(lock_guards); + + if result.is_ok() && !known_txn_status.is_empty() { + // Update the cache before invoking the callback. + // Reversing the order can lead to test failures, as the cache may + // still not be updated when the signal from the callback is received. + let now = std::time::SystemTime::now(); + for (start_ts, commit_ts) in known_txn_status { + self.inner.txn_status_cache.insert(start_ts, commit_ts, now); + } + } + let tctx = self.inner.dequeue_task_context(cid); let mut do_wake_up = !tctx.woken_up_resumable_lock_requests.is_empty(); @@ -923,6 +940,7 @@ impl TxnScheduler { let start_ts = lock_info.parameters.start_ts; let is_first_lock = lock_info.parameters.is_first_lock; let wait_timeout = lock_info.parameters.wait_timeout; + let allow_lock_with_conflict = lock_info.parameters.allow_lock_with_conflict; let diag_ctx = DiagnosticContext { key: lock_info.key.to_raw().unwrap(), @@ -950,6 +968,7 @@ impl TxnScheduler { key, lock_digest, lock_info: lock_info_pb, + allow_lock_with_conflict, }; self.inner.lock_mgr.wait_for( wait_token, @@ -1220,6 +1239,10 @@ impl TxnScheduler { .get_resource_control_context() .get_resource_group_name(), task.cmd.ctx().get_request_source(), + task.cmd + .ctx() + .get_resource_control_context() + .get_override_priority(), ) }); let mut sample = quota_limiter.new_sample(true); @@ -1258,6 +1281,7 @@ impl TxnScheduler { statistics: &mut sched_details.stat, async_apply_prewrite: self.inner.enable_async_apply_prewrite, raw_ext, + txn_status_cache: &self.inner.txn_status_cache, }; let begin_instant = Instant::now(); let res = unsafe { @@ -1279,10 +1303,14 @@ impl TxnScheduler { // TODO: write bytes can be a bit inaccurate due to error requests or in-memory // pessimistic locks.
sample.add_write_bytes(write_bytes); - // estimate the cpu time for write by the schdule cpu time and write bytes - let expected_dur = (sample.cpu_time() + Duration::from_micros(write_bytes as u64)) - * SCHEDULER_CPU_TIME_FACTOR; if let Some(limiter) = resource_limiter { + let expected_dur = if limiter.is_background() { + // estimate the cpu time for the write by the scheduling cpu time and write bytes + (sample.cpu_time() + Duration::from_micros(write_bytes as u64)) + * SCHEDULER_CPU_TIME_FACTOR + } else { + sample.cpu_time() + }; limiter .async_consume( expected_dur, @@ -1328,6 +1356,7 @@ impl TxnScheduler { new_acquired_locks, lock_guards, response_policy, + known_txn_status, } = match deadline .check() .map_err(StorageError::from) @@ -1406,6 +1435,7 @@ impl TxnScheduler { false, false, new_acquired_locks, + known_txn_status, tag, metadata, sched_details, @@ -1441,6 +1471,7 @@ impl TxnScheduler { false, false, new_acquired_locks, + known_txn_status, tag, metadata, sched_details, @@ -1636,6 +1667,7 @@ impl TxnScheduler { pipelined, is_async_apply_prewrite, new_acquired_locks, + known_txn_status, tag, metadata, sched_details, @@ -1665,10 +1697,15 @@ impl TxnScheduler { // it may break correctness. // However, not release latch will cause deadlock which may ultimately block all // following txns, so we panic here. - panic!( - "response channel is unexpectedly dropped, tag {:?}, cid {}", - tag, cid - ); + // + // todo(spadea): for now, we only panic if TiKV is not shutting down, + // although this behavior is not really acceptable even on shutdown. + if !tikv_util::thread_group::is_shutdown(!cfg!(test)) { + panic!( + "response channel is unexpectedly dropped, tag {:?}, cid {}", + tag, cid + ); + } } /// Returns whether it succeeds to write pessimistic locks to the in-memory @@ -1874,6 +1911,11 @@ impl TxnScheduler { .push_lock_wait(entry, Default::default()); } } + + #[cfg(test)] + pub fn get_txn_status_cache(&self) -> &TxnStatusCache { + &self.inner.txn_status_cache + } } pub async fn get_raw_ext( match cmd { Command::RawCompareAndSwap(_) | Command::RawAtomicStore(_) => { if !max_ts_synced { - return Err(ErrorInner::MaxTimestampNotSynced { + return Err(ErrorInner::RawKvMaxTimestampNotSynced { region_id: cmd.ctx().get_region_id(), - start_ts: TimeStamp::zero(), } .into()); } @@ -1997,6 +2038,8 @@ mod tests { enable_async_apply_prewrite: false, ..Default::default() }; + let resource_manager = Arc::new(ResourceGroupManager::default()); + let controller = resource_manager.derive_controller("test".into(), false); ( TxnScheduler::new( engine.clone(), @@ -2014,11 +2057,8 @@ mod tests { ResourceTagFactory::new_for_test(), Arc::new(QuotaLimiter::default()), latest_feature_gate(), - Some(Arc::new(ResourceController::new_for_test( - "test".to_owned(), - true, - ))), - None, + Some(controller), + Some(resource_manager), ), engine, ) @@ -2353,6 +2393,8 @@ mod tests { }; let feature_gate = FeatureGate::default(); feature_gate.set_version("6.0.0").unwrap(); + let resource_manager = Arc::new(ResourceGroupManager::default()); + let controller = resource_manager.derive_controller("test".into(), false); let scheduler = TxnScheduler::new( engine, @@ -2370,11 +2412,8 @@ mod tests { ResourceTagFactory::new_for_test(), Arc::new(QuotaLimiter::default()), feature_gate.clone(), - Some(Arc::new(ResourceController::new_for_test( - "test".to_owned(), - true, - ))), - None, + Some(controller), + Some(resource_manager), ); // Use sync mode if pipelined_pessimistic_lock is false.
assert_eq!(scheduler.pessimistic_lock_mode(), PessimisticLockMode::Sync); diff --git a/src/storage/txn/txn_status_cache.rs b/src/storage/txn/txn_status_cache.rs new file mode 100644 index 00000000000..c9b231c60ec --- /dev/null +++ b/src/storage/txn/txn_status_cache.rs @@ -0,0 +1,980 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. + +//! This module implements a cache for the status of recently finished +//! transactions. When a transaction is committed or rolled back, we store the +//! information in the cache for a while, so that in some cases the transaction +//! status can later be found out quickly without accessing the physical +//! storage. +//! +//! > **Note:** +//! > * Currently, only committed transactions are cached. We may also cache +//! > rolled-back transactions in the future. +//! > * Currently, the cache is only used to filter out unnecessary stale +//! > prewrite requests. We may also consider using the cache for other +//! > purposes in the future. +//! +//! ## Why do we need this? +//! +//! ### For filtering out unwanted late-arrived stale prewrite requests +//! +//! This solves a problem with a complicated background. +//! +//! There is an optimization for pessimistic transactions when TiKV runs +//! accompanied by TiDB: non-unique index keys don't need to be pessimistic- +//! locked, and WRITE CF doesn't need to be checked either when prewriting. +//! Correctness in case of any kind of conflict is protected by the +//! corresponding row key, as an index key is never written without +//! writing the corresponding row key. +//! +//! However, this was later found to be problematic, especially with async +//! commit and 1PC, as prewrite requests on these index keys lose their +//! idempotency. See [this issue](https://github.com/tikv/tikv/issues/11187) +//! for how this causes problems, including some that affect transaction +//! correctness. +//! +//! The problem happens when the prewrite request for the same index key is +//! sent more than once. Our first solution was to add an `is_retry_request` +//! flag to the second (and subsequent) requests, which are sent due to +//! client-side retries. But it's still imperfect, considering that it's +//! theoretically possible for the original request to arrive at TiKV later +//! than the retried one. In fact, we once observed this happening in an +//! environment where the network was terribly unstable. +//! +//! Our second solution, in addition to the previous one, is to use this cache. +//! Each committed transaction should be guaranteed to be kept in the cache for +//! [a long-enough time](CACHE_ITEMS_REQUIRED_KEEP_TIME). When a prewrite +//! request is received, it should check the cache before executing. If it +//! finds that its transaction is already committed, it won't skip the +//! constraint check in WRITE CF. Note that if the index key is already +//! committed but the transaction info is not cached, a late-arrived prewrite +//! request cannot be protected by this mechanism. This means we shouldn't miss +//! any cacheable transactions, and it is the reason why committed transactions +//! should be cached for *a long-enough time*. +//! +//! Unfortunately, the solution is still imperfect. As already known, it may +//! still be problematic for the following reasons: +//! +//! 1. We don't have a mechanism to refuse requests that were sent more than +//! [CACHE_ITEMS_REQUIRED_KEEP_TIME] ago. +//!
2. To prevent the cache from consuming much more memory than expected, +//! we put a limit on the capacity (though the limit is very large), and it's +//! configurable (so the cache can be disabled; see how the `capacity` parameter +//! of the function [TxnStatusCache::new] is used) as a way to escape from +//! potential faults. +//! 3. The cache can't be synced across different TiKV instances. +//! +//! The third case above needs a detailed explanation. Here is an example of +//! the problem: +//! +//! 1. The client tries to send a prewrite request to TiKV A, which has the +//! leader of the region containing an index key. The request is not received +//! by TiKV and the client retries. +//! 2. The leader is transferred to TiKV B, and the retried prewrite request +//! is sent to it and processed successfully. +//! 3. The transaction is committed on TiKV B, without TiKV A knowing it. +//! 4. The leader is transferred back to TiKV A. +//! 5. The original request arrives at TiKV A and is executed. As the +//! status of the transaction is not in TiKV A's cache, the prewrite +//! request will be handled in the normal way, skipping constraint checks. +//! +//! As of the time this module was written, the above remaining cases had +//! not yet been handled, considering their extremely low probability of +//! happening and the high complexity of fixing them. +//! +//! The perfect and most elegant way to fix all of these problems is to never +//! skip constraint checks or pessimistic locks for index keys; that is, to +//! totally remove the optimization mentioned above on index keys. +//! But for historical reasons, this may lead to significant performance +//! regression in existing clusters. +//! +//! ### For reading data locked by large transactions more efficiently +//! +//! * Note: the `TxnStatusCache` is designed with this usage in mind, but it +//! is not used for this purpose yet. +//! +//! Consider the case where a very large transaction has locked a lot of keys +//! after prewriting, while many simple reads and writes execute frequently; +//! these simple transactions then frequently meet the locks left by the large +//! transaction. It is very inefficient for these small transactions to +//! go back to the client and start the resolve-lock procedure. Even if the +//! client side caches the status of that transaction, it still wastes an RTT. +//! +//! There would be more possibilities if we had such a cache on the TiKV side: +//! a read request could check the cache to know whether it can read from the +//! lock; and a write request, if it finds that the transaction of that lock is +//! already committed, could merge the lock-resolving (committing) step with +//! the write operation that the request needs to perform. + +use std::{ + sync::{atomic::AtomicU64, Arc}, + time::{Duration, SystemTime, UNIX_EPOCH}, +}; + +use crossbeam::utils::CachePadded; +use parking_lot::Mutex; +use tikv_util::{ + lru, + lru::{GetTailEntry, LruCache}, +}; +use txn_types::TimeStamp; + +use crate::storage::metrics::*; + +const TXN_STATUS_CACHE_SLOTS: usize = 128; + +/// A cache item should be kept for at least this time. +/// Actually this should be guaranteed only for committed transactions. See +/// [this section](# +/// for-filtering-out-unwanted-late-arrived-stale-prewrite-requests) for details +/// about why this is needed.
+const CACHE_ITEMS_REQUIRED_KEEP_TIME: Duration = Duration::from_secs(30); + +struct CacheEntry { + commit_ts: TimeStamp, + /// The system timestamp in milliseconds when the entry is inserted to the + /// cache. + insert_time: u64, +} + +/// Defines the policy for evicting expired entries from the cache. +/// [`TxnStatusCache`] needs to keep entries for a while, so the common +/// policy of only limiting the capacity is not suitable here. +struct TxnStatusCacheEvictPolicy { + required_keep_time_millis: u64, + #[cfg(test)] + simulated_system_time: Option<Arc<AtomicU64>>, +} + +impl TxnStatusCacheEvictPolicy { + fn new( + required_keep_time: Duration, + #[allow(unused_variables)] simulated_system_time: Option<Arc<AtomicU64>>, + ) -> Self { + Self { + required_keep_time_millis: required_keep_time.as_millis() as u64, + #[cfg(test)] + simulated_system_time, + } + } + + #[inline] + #[cfg(not(test))] + fn now(&self) -> SystemTime { + SystemTime::now() + } + + /// When used in tests, the system time can be simulated by controlling the + /// field `simulated_system_time`. + #[inline] + #[cfg(test)] + fn now(&self) -> SystemTime { + // Always get the system time to simulate the latency. + let now = SystemTime::now(); + if let Some(pseudo_system_time) = &self.simulated_system_time { + UNIX_EPOCH + + std::time::Duration::from_millis( + pseudo_system_time.load(std::sync::atomic::Ordering::Acquire), + ) + } else { + now + } + } +} + +impl lru::EvictPolicy<TimeStamp, CacheEntry> for TxnStatusCacheEvictPolicy { + fn should_evict( + &self, + current_size: usize, + capacity: usize, + get_tail_entry: &impl GetTailEntry<TimeStamp, CacheEntry>, + ) -> bool { + // See how much time has elapsed since the tail entry was inserted. + // If it's long enough, remove it. + if let Some((_, v)) = get_tail_entry.get_tail_entry() { + if self.now().duration_since(UNIX_EPOCH).unwrap().as_millis() as u64 + > self.required_keep_time_millis + v.insert_time + { + return true; + } + } + + // If the capacity limit is exceeded, remove it. + current_size > capacity + } +} + +type TxnStatusCacheSlot = + LruCache<TimeStamp, CacheEntry, lru::CountTracker, TxnStatusCacheEvictPolicy>; + +/// The cache for storing transaction status. It holds recent +/// `start_ts` -> `commit_ts` pairs for a while, which can be useful for +/// quickly, but not strictly, determining transaction status. +/// +/// `TxnStatusCache` is divided into several slots +/// to make locking more fine-grained. Each slot uses an [`LruCache`] as the +/// internal implementation, with a customized evict policy. However, we do not +/// always adopt the LRU behavior: some operations on an existing entry in the +/// cache won't promote it to the most-recent place. +/// +/// Note that the `TxnStatusCache` updates metrics in some operations assuming +/// there's at most one instance of `TxnStatusCache` in a process. +pub struct TxnStatusCache { + slots: Vec<CachePadded<Mutex<TxnStatusCacheSlot>>>, + is_enabled: bool, +} + +unsafe impl Sync for TxnStatusCache {} + +impl TxnStatusCache { + fn new_impl( + slots: usize, + required_keep_time: Duration, + capacity: usize, + simulated_system_time: Option<Arc<AtomicU64>>, + ) -> Self { + if capacity == 0 { + return Self { + slots: vec![], + is_enabled: false, + }; + } + + // The limit of the LruCache of each slot. + let allowed_capacity_per_slot = capacity / slots; + // The total memory allocated initially by the LruCache's internal data + // structure for all slots.
+ + let mut initial_allocated_capacity_total = 0; + let res = Self { + slots: (0..slots) + .map(|_| { + let cache = LruCache::new( + allowed_capacity_per_slot, + 0, + lru::CountTracker::default(), + TxnStatusCacheEvictPolicy::new( + required_keep_time, + simulated_system_time.clone(), + ), + ); + let allocated_capacity = cache.internal_allocated_capacity(); + initial_allocated_capacity_total += allocated_capacity; + Mutex::new(cache).into() + }) + .collect(), + is_enabled: true, + }; + SCHED_TXN_STATUS_CACHE_SIZE + .allocated + .set(initial_allocated_capacity_total as i64); + res + } + + pub fn new(capacity: usize) -> Self { + Self::with_slots_and_time_limit( + TXN_STATUS_CACHE_SLOTS, + CACHE_ITEMS_REQUIRED_KEEP_TIME, + capacity, + ) + } + + #[cfg(test)] + pub fn new_for_test() -> Self { + // 1M capacity should be enough for tests. + Self::with_slots_and_time_limit(16, CACHE_ITEMS_REQUIRED_KEEP_TIME, 1 << 20) + } + + pub fn with_slots_and_time_limit( + slots: usize, + required_keep_time: Duration, + capacity: usize, + ) -> Self { + Self::new_impl(slots, required_keep_time, capacity, None) + } + + /// Create a `TxnStatusCache` instance for test purposes, with simulated + /// system time enabled. This helps when testing functionality that is + /// related to the system time. + /// + /// An `AtomicU64` will be returned. Store timestamps + /// in milliseconds in it to control the time. + #[cfg(test)] + fn with_simulated_system_time( + slots: usize, + required_keep_time: Duration, + capacity: usize, + ) -> (Self, Arc<AtomicU64>) { + let system_time = Arc::new(AtomicU64::new(0)); + let res = Self::new_impl( + slots, + required_keep_time, + capacity, + Some(system_time.clone()), + ); + (res, system_time) + } + + fn slot_index(&self, start_ts: TimeStamp) -> usize { + fxhash::hash(&start_ts) % self.slots.len() + } + + /// Insert a transaction status into the cache. The current system time + /// should be passed from outside to avoid getting the system time + /// repeatedly when multiple items are being inserted. + /// + /// If the transaction's information is already in the cache, it will + /// **NOT** be promoted to the most-recent place of the internal LRU. + pub fn insert(&self, start_ts: TimeStamp, commit_ts: TimeStamp, now: SystemTime) { + if !self.is_enabled { + return; + } + + let insert_time = now.duration_since(UNIX_EPOCH).unwrap().as_millis() as u64; + let mut slot = self.slots[self.slot_index(start_ts)].lock(); + let previous_size = slot.size(); + let previous_allocated = slot.internal_allocated_capacity(); + slot.insert_if_not_exist( + start_ts, + CacheEntry { + commit_ts, + insert_time, + }, + ); + let size = slot.size(); + let allocated = slot.internal_allocated_capacity(); + drop(slot); + + // Update statistics. + // CAUTION: Assuming that only one TxnStatusCache instance is in a TiKV process. + SCHED_TXN_STATUS_CACHE_SIZE + .used + .add(size as i64 - previous_size as i64); + SCHED_TXN_STATUS_CACHE_SIZE + .allocated + .add(allocated as i64 - previous_allocated as i64); + } + + /// Try to get an item from the cache, without promoting the item (if it + /// exists) to the most recent place.
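+ ///
+ /// A minimal usage sketch (illustrative; `new_for_test` is the test-only
+ /// constructor above, so this is not compiled as a doctest):
+ ///
+ /// ```ignore
+ /// let cache = TxnStatusCache::new_for_test();
+ /// cache.insert(5.into(), 6.into(), std::time::SystemTime::now());
+ /// assert_eq!(cache.get_no_promote(5.into()), Some(6.into()));
+ /// ```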
+ pub fn get_no_promote(&self, start_ts: TimeStamp) -> Option<TimeStamp> { + if !self.is_enabled { + return None; + } + + let slot = self.slots[self.slot_index(start_ts)].lock(); + slot.get_no_promote(&start_ts).map(|entry| entry.commit_ts) + } + + pub fn get(&self, start_ts: TimeStamp) -> Option<TimeStamp> { + if !self.is_enabled { + return None; + } + + let mut slot = self.slots[self.slot_index(start_ts)].lock(); + slot.get(&start_ts).map(|entry| entry.commit_ts) + } + + /// Remove an entry from the cache. We usually don't need to remove anything + /// from the `TxnStatusCache`, but it's useful in tests to construct + /// cache-miss cases. + #[cfg(test)] + pub fn remove(&self, start_ts: TimeStamp) -> Option<TimeStamp> { + if !self.is_enabled { + return None; + } + + let res = { + let mut slot = self.slots[self.slot_index(start_ts)].lock(); + slot.remove(&start_ts).map(|e| e.commit_ts) + }; + debug_assert!(self.get_no_promote(start_ts).is_none()); + res + } +} + +#[cfg(test)] +mod tests { + use std::{ + sync::{ + atomic::{AtomicU64, Ordering}, + Arc, + }, + time::{Duration, Instant, SystemTime}, + }; + + use rand::{prelude::SliceRandom, Rng}; + + use super::*; + + fn bench_insert_impl(b: &mut test::Bencher, init_size: usize) { + let (c, time) = TxnStatusCache::with_simulated_system_time( + TXN_STATUS_CACHE_SLOTS, + Duration::from_millis(init_size as u64), + 1 << 20, + ); + let start_time = SystemTime::now(); + // Spread these items evenly across a specific time span, so that every time + // a new item is inserted, an item will be popped out. + for i in 1..=init_size { + c.insert( + (i as u64).into(), + (i as u64 + 1).into(), + start_time + Duration::from_millis(i as u64), + ); + } + let mut current_time_shift = (init_size + 1) as u64; + b.iter(|| { + let simulated_now = start_time + Duration::from_millis(current_time_shift); + // Simulate the system time advancing. + time.store( + simulated_now + .duration_since(UNIX_EPOCH) + .unwrap() + .as_millis() as u64, + Ordering::Release, + ); + c.insert( + current_time_shift.into(), + (current_time_shift + 1).into(), + simulated_now, + ); + current_time_shift += 1; + }); + test::black_box(&c); + } + + fn bench_get_impl(b: &mut test::Bencher, init_size: usize) { + let c = TxnStatusCache::with_slots_and_time_limit( + TXN_STATUS_CACHE_SLOTS, + CACHE_ITEMS_REQUIRED_KEEP_TIME, + 1 << 20, + ); + let now = SystemTime::now(); + for i in 1..=init_size { + c.insert( + (i as u64).into(), + (i as u64 + 1).into(), + now + Duration::from_millis(i as u64), + ); + } + let rand_range = if init_size == 0 { 10000 } else { init_size } as u64; + b.iter(|| { + let ts = rand::thread_rng().gen_range(0u64, rand_range); + let res = c.get_no_promote(ts.into()); + test::black_box(&res); + }) + } + + #[bench] + fn bench_insert_empty(b: &mut test::Bencher) { + bench_insert_impl(b, 0); + } + + #[bench] + fn bench_insert_100000(b: &mut test::Bencher) { + bench_insert_impl(b, 100000); + } + + #[bench] + fn bench_get_empty(b: &mut test::Bencher) { + bench_get_impl(b, 0); + } + + #[bench] + fn bench_get_100000(b: &mut test::Bencher) { + bench_get_impl(b, 100000); + } + + /// A simple statistics tool for collecting a set of data and calculating the + /// average, stddev, and percentiles (by using a linear histogram). + /// Data is collected in u128, and results are given in f64.
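+ ///
+ /// Stddev is derived from the accumulated sums via the identity
+ /// sum((x - avg)^2) = sum(x^2) - 2 * avg * sum(x) + n * avg^2, with the
+ /// sample variance dividing by n - 1.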
+ struct SimpleStatistics { + sum: u128, + sum_square: u128, + count: usize, + bucket_width: u128, + buckets: Vec<usize>, + } + + impl SimpleStatistics { + fn new(bucket_width: u128) -> Self { + Self { + sum: 0, + sum_square: 0, + count: 0, + bucket_width, + buckets: vec![], + } + } + + /// Merge another instance into the current one. + fn add(&mut self, other: Self) { + self.sum += other.sum; + self.sum_square += other.sum_square; + self.count += other.count; + assert_eq!(self.bucket_width, other.bucket_width); + if self.buckets.len() < other.buckets.len() { + self.buckets.resize(other.buckets.len(), 0); + } + for (count, other_count) in self.buckets.iter_mut().zip(other.buckets.iter()) { + *count += *other_count + } + } + + fn avg(&self) -> f64 { + self.sum as f64 / (self.count as f64) + } + + fn stddev(&self) -> f64 { + let avg = self.avg(); + let sum_sqr_diff: f64 = (self.sum_square as f64) - (self.sum as f64 * avg * 2.0) + + avg * avg * self.count as f64; + (sum_sqr_diff / (self.count - 1) as f64).sqrt() + } + + /// Calculate the percentile value at the specified position (should be in + /// the range [0, 1]) + fn percentile(&self, position: f64) -> f64 { + let mut bucket = self.buckets.len(); + let mut prefix_sum = self.count; + while bucket > 0 { + bucket -= 1; + prefix_sum -= self.buckets[bucket]; + let prefix_percentile = prefix_sum as f64 / self.count as f64; + if prefix_percentile <= position { + assert_le!(prefix_sum as f64, position * self.count as f64); + assert_lt!( + position * self.count as f64, + (prefix_sum + self.buckets[bucket]) as f64 + ); + break; + } + } + + bucket as f64 * self.bucket_width as f64 + + (position * self.count as f64 - prefix_sum as f64) * self.bucket_width as f64 + / self.buckets[bucket] as f64 + } + + fn observe(&mut self, value: u128) { + self.sum += value; + self.sum_square += value * value; + self.count += 1; + let bucket = (value / self.bucket_width) as usize; + if self.buckets.len() <= bucket { + self.buckets.resize(bucket + 1, 0); + } + self.buckets[bucket] += 1; + } + } + + fn bench_concurrent_impl<T>( + name: &str, + threads: usize, + function: impl Fn(u64) -> T + Send + Sync + 'static, + ) { + let start_time = Instant::now(); + // Run the benchmark code repeatedly for 10 seconds. + const TIME_LIMIT: Duration = Duration::from_secs(10); + let iteration = Arc::new(AtomicU64::new(0)); + + // Make the lifetime checker happy.
+ let function = Arc::new(function); + + let mut handles = Vec::with_capacity(threads); + for _ in 0..threads { + let f = function.clone(); + let iteration = iteration.clone(); + let handle = std::thread::spawn(move || { + let mut stats = SimpleStatistics::new(20); + loop { + if start_time.elapsed() > TIME_LIMIT { + break; + } + let i = iteration.fetch_add(1, Ordering::SeqCst); + let iter_start_time = Instant::now(); + test::black_box(f(i)); + let duration = iter_start_time.elapsed(); + stats.observe(duration.as_nanos()); + } + stats + }); + handles.push(handle); + } + + let mut total_stats = SimpleStatistics::new(20); + for h in handles { + total_stats.add(h.join().unwrap()); + } + + println!( + "benchmark {}: duration per iter: avg: {:?}, stddev: {:?}, percentile .99: {:?}, percentile .999: {:?}", + name, + Duration::from_nanos(total_stats.avg() as u64), + Duration::from_nanos(total_stats.stddev() as u64), + Duration::from_nanos(total_stats.percentile(0.99) as u64), + Duration::from_nanos(total_stats.percentile(0.999) as u64), + ); + } + + fn bench_txn_status_cache_concurrent_impl( + threads: usize, + init_size: usize, + simulate_contention: bool, + get_before_insert: bool, + ) { + let slots = if simulate_contention { + 1 + } else { + TXN_STATUS_CACHE_SLOTS + }; + let (c, time) = TxnStatusCache::with_simulated_system_time( + slots, + Duration::from_millis(init_size as u64), + 1 << 20, + ); + let start_time = SystemTime::now(); + for i in 1..=init_size { + c.insert( + (i as u64).into(), + (i as u64 + 1).into(), + start_time + Duration::from_millis(i as u64), + ); + } + + let name = format!( + "bench_concurrent_{}_{}_size{}{}", + if get_before_insert { + "get_and_insert" + } else { + "insert" + }, + threads, + init_size, + if simulate_contention { + "_contention" + } else { + "" + }, + ); + + bench_concurrent_impl(&name, threads, move |iter| { + let time_shift = init_size as u64 + iter; + let now = start_time + Duration::from_millis(time_shift); + time.store( + now.duration_since(UNIX_EPOCH).unwrap().as_millis() as u64, + Ordering::Release, + ); + + if get_before_insert { + test::black_box(c.get_no_promote(time_shift.into())); + } + c.insert(time_shift.into(), (time_shift + 1).into(), now); + test::black_box(&c); + }); + } + + #[bench] + #[ignore] + fn bench_txn_status_cache_concurrent(_b: &mut test::Bencher) { + // This case is implemented to run the concurrent benchmark in a handy way, + // just like running other normal benchmarks. However, it doesn't seem + // to be possible to benchmark an operation in a concurrent way using + // either the built-in bencher or criterion. + // Here we test it in our own way without using the built-in bencher, + // and output the result to stdout. + // When you need to run this benchmark, comment out the `#[ignore]` and + // add --nocapture to your benchmark command line to get the result.
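+ // (An unverified example: something along the lines of
+ // `cargo bench <bench-name-filter> -- --nocapture`; the exact package
+ // and filter arguments depend on the workspace layout.)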
+ bench_txn_status_cache_concurrent_impl(16, 10000, false, false); + bench_txn_status_cache_concurrent_impl(16, 10000, true, false); + bench_txn_status_cache_concurrent_impl(16, 10000, false, true); + bench_txn_status_cache_concurrent_impl(16, 10000, true, true); + bench_txn_status_cache_concurrent_impl(64, 10000, false, false); + bench_txn_status_cache_concurrent_impl(64, 10000, true, false); + bench_txn_status_cache_concurrent_impl(64, 10000, false, true); + bench_txn_status_cache_concurrent_impl(64, 10000, true, true); + } + + #[test] + fn test_insert_and_get() { + let c = TxnStatusCache::new_for_test(); + assert!(c.get_no_promote(1.into()).is_none()); + + let now = SystemTime::now(); + + c.insert(1.into(), 2.into(), now); + assert_eq!(c.get_no_promote(1.into()).unwrap(), 2.into()); + c.insert(3.into(), 4.into(), now); + assert_eq!(c.get_no_promote(3.into()).unwrap(), 4.into()); + + // This won't actually happen, since a transaction will never have commit info + // with two different commit_ts. We just use this to check that replacement + // won't happen. + c.insert(1.into(), 4.into(), now); + assert_eq!(c.get_no_promote(1.into()).unwrap(), 2.into()); + + let mut start_ts_list: Vec<_> = (1..100).step_by(2).map(TimeStamp::from).collect(); + start_ts_list.shuffle(&mut rand::thread_rng()); + for &start_ts in &start_ts_list { + let commit_ts = start_ts.next(); + c.insert(start_ts, commit_ts, now); + } + start_ts_list.shuffle(&mut rand::thread_rng()); + for &start_ts in &start_ts_list { + let commit_ts = start_ts.next(); + assert_eq!(c.get_no_promote(start_ts).unwrap(), commit_ts); + } + } + + #[test] + fn test_evicting_expired() { + let (c, time) = + TxnStatusCache::with_simulated_system_time(1, Duration::from_millis(1000), 1000); + let time_base = SystemTime::now(); + let set_time = |offset_millis: u64| { + time.store( + time_base.duration_since(UNIX_EPOCH).unwrap().as_millis() as u64 + offset_millis, + Ordering::Release, + ) + }; + let now = || UNIX_EPOCH + Duration::from_millis(time.load(Ordering::Acquire)); + + set_time(0); + assert_lt!( + time_base.duration_since(now()).unwrap(), + Duration::from_millis(1) + ); + + c.insert(1.into(), 2.into(), now()); + set_time(1); + c.insert(3.into(), 4.into(), now()); + set_time(2); + c.insert(5.into(), 6.into(), now()); + // Size should be calculated by count. + assert_eq!(c.slots[0].lock().size(), 3); + + // Insert entry 1 again. So if entry 1 is the first one to be popped out, it + // verifies that inserting an existing key won't promote it. + c.insert(1.into(), 2.into(), now()); + + // All 3 entries are kept. + assert_eq!(c.get_no_promote(1.into()).unwrap(), 2.into()); + assert_eq!(c.get_no_promote(3.into()).unwrap(), 4.into()); + assert_eq!(c.get_no_promote(5.into()).unwrap(), 6.into()); + + set_time(1001); + c.insert(7.into(), 8.into(), now()); + // Entry 1 will be popped out. + assert!(c.get_no_promote(1.into()).is_none()); + assert_eq!(c.get_no_promote(3.into()).unwrap(), 4.into()); + assert_eq!(c.get_no_promote(5.into()).unwrap(), 6.into()); + set_time(1004); + c.insert(9.into(), 10.into(), now()); + // It pops more than one entry if there are many expired items at the tail. + // Entries 3 and 5 will be popped out.
+ assert!(c.get_no_promote(1.into()).is_none()); + assert!(c.get_no_promote(3.into()).is_none()); + assert!(c.get_no_promote(5.into()).is_none()); + assert_eq!(c.get_no_promote(7.into()).unwrap(), 8.into()); + assert_eq!(c.get_no_promote(9.into()).unwrap(), 10.into()); + + // Now the cache's contents are: + // 7@1001, 9@1004 + // Test that `get` promotes an entry, so entries are no longer ordered by + // insert time. + assert_eq!(c.get(7.into()).unwrap(), 8.into()); + set_time(2003); + c.insert(11.into(), 12.into(), now()); + assert_eq!(c.get_no_promote(7.into()).unwrap(), 8.into()); + assert_eq!(c.get_no_promote(9.into()).unwrap(), 10.into()); + assert_eq!(c.get_no_promote(11.into()).unwrap(), 12.into()); + + set_time(2005); + c.insert(13.into(), 14.into(), now()); + assert!(c.get_no_promote(7.into()).is_none()); + assert!(c.get_no_promote(9.into()).is_none()); + assert_eq!(c.get_no_promote(11.into()).unwrap(), 12.into()); + + // Now the cache's contents are: + // 11@2003, 13@2005 + // Test inserting existing entries. + // According to the implementation of LruCache, though it won't update the + // content, it still checks the tail to see if anything can be + // evicted. + set_time(3004); + c.insert(13.into(), 14.into(), now()); + assert!(c.get_no_promote(11.into()).is_none()); + assert_eq!(c.get_no_promote(13.into()).unwrap(), 14.into()); + + set_time(3006); + c.insert(13.into(), 14.into(), now()); + assert!(c.get_no_promote(13.into()).is_none()); + + // Now the cache is empty. + c.insert(15.into(), 16.into(), now()); + set_time(3008); + c.insert(17.into(), 18.into(), now()); + // Test that inserting an existing entry doesn't promote it. + // Re-insert 15. + set_time(3009); + c.insert(15.into(), 16.into(), now()); + set_time(4007); + c.insert(19.into(), 20.into(), now()); + // 15's insert time is not updated, and it is at the tail of the LRU, so it + // should be popped. + assert!(c.get_no_promote(15.into()).is_none()); + assert_eq!(c.get_no_promote(17.into()).unwrap(), 18.into()); + + // Now the cache's contents are: + // 17@3008, 19@4007 + // Test the system time being changed, which can lead to the current time being + // less than entries' insert time. + set_time(2000); + c.insert(21.into(), 22.into(), now()); + assert_eq!(c.get_no_promote(17.into()).unwrap(), 18.into()); + assert_eq!(c.get_no_promote(19.into()).unwrap(), 20.into()); + assert_eq!(c.get_no_promote(21.into()).unwrap(), 22.into()); + set_time(3500); + c.insert(23.into(), 24.into(), now()); + assert_eq!(c.get_no_promote(21.into()).unwrap(), 22.into()); + assert_eq!(c.get(17.into()).unwrap(), 18.into()); + assert_eq!(c.get(19.into()).unwrap(), 20.into()); + assert_eq!(c.get(23.into()).unwrap(), 24.into()); + // `get` promotes the entries, and entry 21 is put to the tail. + c.insert(23.into(), 24.into(), now()); + assert_eq!(c.get_no_promote(17.into()).unwrap(), 18.into()); + assert_eq!(c.get_no_promote(19.into()).unwrap(), 20.into()); + assert!(c.get_no_promote(21.into()).is_none()); + assert_eq!(c.get_no_promote(23.into()).unwrap(), 24.into()); + + // Now the cache's contents are: + // 17@3008, 19@4007, 23@3500 + // The time passed to `insert` may differ from the time fetched in + // the `TxnStatusCacheEvictPolicy`, as they are fetched at different times.
+ set_time(4009); + // Insert with time 4007, but check with time 4009 + c.insert(25.into(), 26.into(), now() - Duration::from_millis(2)); + assert!(c.get_no_promote(17.into()).is_none()); + assert_eq!(c.get_no_promote(19.into()).unwrap(), 20.into()); + + // The cache's contents: + // 19@4007, 23@3500, 25@4007 + set_time(4010); + c.insert(27.into(), 28.into(), now()); + // The cache's contents: + // 19@4007, 23@3500, 25@4007, 27@4010 + + // It's also possible to check with a lower time, considering that the system + // time may be changed. Insert with time 5018, but check with time 5008 + set_time(5008); + c.insert(29.into(), 30.into(), now() + Duration::from_millis(10)); + assert!(c.get_no_promote(19.into()).is_none()); + assert!(c.get_no_promote(23.into()).is_none()); + assert!(c.get_no_promote(25.into()).is_none()); + assert_eq!(c.get_no_promote(27.into()).unwrap(), 28.into()); + assert_eq!(c.get_no_promote(29.into()).unwrap(), 30.into()); + + // Now the cache's contents are: + // 27@4010, 29@5018 + // Considering the case where the system time is changed, it's even + // possible that the entry being inserted is already expired + // compared to the current time. It doesn't matter whether the + // entry will be dropped immediately or not. We just ensure it won't + // cause further trouble. + set_time(7000); + c.insert(31.into(), 32.into(), now() - Duration::from_millis(1001)); + assert!(c.get_no_promote(27.into()).is_none()); + assert!(c.get_no_promote(29.into()).is_none()); + assert!(c.get_no_promote(31.into()).is_none()); + assert_eq!(c.slots[0].lock().size(), 0); + } + + #[test] + fn test_setting_capacity() { + let c = TxnStatusCache::new_impl(2, Duration::from_millis(1000), 10, None); + assert!(c.is_enabled); + assert_eq!(c.slots.len(), 2); + assert_eq!(c.slots[0].lock().capacity(), 5); + assert_eq!(c.slots[1].lock().capacity(), 5); + + let c = TxnStatusCache::new_impl(2, Duration::from_millis(1000), 0, None); + assert!(!c.is_enabled); + assert_eq!(c.slots.len(), 0); + // All operations are noops and won't cause panic or return any incorrect + // result. + c.insert(1.into(), 2.into(), SystemTime::now()); + assert!(c.get_no_promote(1.into()).is_none()); + assert!(c.get(1.into()).is_none()); + } + + #[test] + fn test_evicting_by_capacity() { + let (c, time) = + TxnStatusCache::with_simulated_system_time(1, Duration::from_millis(1000), 5); + let time_base = SystemTime::now(); + let set_time = |offset_millis: u64| { + time.store( + time_base.duration_since(UNIX_EPOCH).unwrap().as_millis() as u64 + offset_millis, + Ordering::Release, + ) + }; + let now = || UNIX_EPOCH + Duration::from_millis(time.load(Ordering::Acquire)); + + set_time(0); + c.insert(1.into(), 2.into(), now()); + set_time(2); + c.insert(3.into(), 4.into(), now()); + set_time(4); + c.insert(5.into(), 6.into(), now()); + set_time(6); + c.insert(7.into(), 8.into(), now()); + + // The cache can keep at most 5 entries. + set_time(8); + c.insert(9.into(), 10.into(), now()); + // Entry 1 is not evicted; 5 entries are in the cache currently. + assert_eq!(c.slots[0].lock().len(), 5); + assert_eq!(c.get_no_promote(1.into()).unwrap(), 2.into()); + set_time(10); + c.insert(11.into(), 12.into(), now()); + // Entry 1 evicted. Still 5 entries in the cache. + assert_eq!(c.slots[0].lock().len(), 5); + assert!(c.get_no_promote(1.into()).is_none()); + assert_eq!(c.get_no_promote(3.into()).unwrap(), 4.into()); + + // Nothing will be evicted after trying to insert an existing key.
+ c.insert(11.into(), 12.into(), now()); + assert_eq!(c.slots[0].lock().len(), 5); + assert_eq!(c.get_no_promote(3.into()).unwrap(), 4.into()); + + // Current contents (key@time): + // 3@2, 5@4, 7@6, 9@8, 11@10 + // Evicting by time works as well. + set_time(1005); + c.insert(13.into(), 14.into(), now()); + assert_eq!(c.slots[0].lock().len(), 4); + assert!(c.get_no_promote(3.into()).is_none()); + assert!(c.get_no_promote(5.into()).is_none()); + assert_eq!(c.get_no_promote(7.into()).unwrap(), 8.into()); + + // Reorder the entries by `get` to prepare for testing the next case. + assert_eq!(c.get(7.into()).unwrap(), 8.into()); + assert_eq!(c.get(9.into()).unwrap(), 10.into()); + assert_eq!(c.get(11.into()).unwrap(), 12.into()); + + c.insert(15.into(), 16.into(), now()); + // Current contents: + // 13@1005, 7@6, 9@8, 11@10, 15@1005 + assert_eq!(c.slots[0].lock().len(), 5); + // Expired entries that are not the tail can be evicted after the tail + // is evicted due to the capacity being exceeded. + set_time(1011); + c.insert(17.into(), 18.into(), now()); + assert_eq!(c.slots[0].lock().len(), 2); + assert!(c.get_no_promote(13.into()).is_none()); + assert!(c.get_no_promote(7.into()).is_none()); + assert!(c.get_no_promote(9.into()).is_none()); + assert!(c.get_no_promote(11.into()).is_none()); + assert_eq!(c.get(15.into()).unwrap(), 16.into()); + assert_eq!(c.get(17.into()).unwrap(), 18.into()); + } +} diff --git a/tests/Cargo.toml b/tests/Cargo.toml index 158e56abcb1..fcf228de924 100644 --- a/tests/Cargo.toml +++ b/tests/Cargo.toml @@ -39,6 +39,11 @@ name = "deadlock_detector" harness = false path = "benches/deadlock_detector/mod.rs" +[[bench]] +name = "memory" +harness = false +path = "benches/memory/mod.rs" + [features] default = ["failpoints", "testexport", "test-engine-kv-rocksdb", "test-engine-raft-raft-engine", "cloud-aws", "cloud-gcp", "cloud-azure"] failpoints = ["fail/failpoints", "tikv/failpoints", "pd_client/failpoints"] @@ -55,10 +60,10 @@ test-engine-raft-raft-engine = [ "raftstore/test-engine-raft-raft-engine" ] test-engines-rocksdb = [ - "raftstore/test-engines-rocksdb", + "raftstore/test-engines-rocksdb" ] test-engines-panic = [ - "raftstore/test-engines-panic", + "raftstore/test-engines-panic" ] jemalloc = ["tikv/jemalloc"] mimalloc = ["tikv/mimalloc"] @@ -120,7 +125,6 @@ uuid = { version = "0.8.1", features = ["serde", "v4"] } procinfo = { git = "https://github.com/tikv/procinfo-rs", rev = "7693954bd1dd86eb1709572fd7b62fd5f7ff2ea1" } [dev-dependencies] -arrow = "13.0" byteorder = "1.2" # See https://bheisler.github.io/criterion.rs/book/user_guide/known_limitations.html for the usage # of `real_blackbox` feature. resource_metering = { workspace = true } security = { workspace = true } serde_json = "1.0" sst_importer = { workspace = true } -test_backup = { workspace = true } +test_backup = { workspace = true, default-features = false } test_coprocessor = { workspace = true } test_pd = { workspace = true } test_pd_client = { workspace = true } diff --git a/tests/benches/memory/mod.rs b/tests/benches/memory/mod.rs new file mode 100644 index 00000000000..33b7b66f80b --- /dev/null +++ b/tests/benches/memory/mod.rs @@ -0,0 +1,122 @@ +// Copyright 2024 TiKV Project Authors. Licensed under Apache-2.0.
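+ +//! Benchmarks for `MemoryQuota`: allocation success vs. failure, alloc/free +//! round-trips (directly and via `OwnedAllocated`), and the same round-trips +//! under multi-threaded contention.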
+ +use std::{ + sync::{ + atomic::{AtomicBool, Ordering}, + Arc, + }, + thread, + time::Duration, +}; + +use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion}; +use tikv_util::memory::{MemoryQuota, OwnedAllocated}; + +fn bench_memory_quota_alloc(c: &mut Criterion) { + let mut group = c.benchmark_group("Alloc Only"); + + let bytes = 0b1010100; + let quota = Arc::new(MemoryQuota::new(bytes - 1)); + let max_quota = Arc::new(MemoryQuota::new(usize::MAX)); + + group.bench_function(BenchmarkId::new("Alloc", "ok"), |b| { + b.iter(|| { + let _ = black_box(max_quota.alloc(bytes)); + }) + }); + group.bench_function(BenchmarkId::new("Alloc", "fail"), |b| { + b.iter(|| { + let _ = black_box(quota.alloc(bytes)); + }) + }); + + group.finish(); +} + +fn bench_memory_quota_alloc_free(c: &mut Criterion) { + let mut group = c.benchmark_group("Alloc Free"); + + let bytes = 0b1010100; + let quota = Arc::new(MemoryQuota::new(10 * bytes)); + let quota_ = quota.clone(); + + group.bench_function(BenchmarkId::new("MemoryQuota", "alloc free"), |b| { + b.iter(|| { + let _ = black_box(quota.alloc(bytes)); + quota.free(bytes); + }) + }); + group.bench_function(BenchmarkId::new("OwnedAllocated", "alloc free"), |b| { + b.iter(|| { + let mut owned_quota = OwnedAllocated::new(quota_.clone()); + let _ = black_box(owned_quota.alloc(bytes)); + drop(owned_quota); + }) + }); + + group.finish(); +} + +fn bench_memory_quota_multi_threads(c: &mut Criterion) { + memory_quota_multi_threads(c, 32); + memory_quota_multi_threads(c, 64); +} + +fn memory_quota_multi_threads(c: &mut Criterion, total_threads: usize) { + let threads = total_threads - 1; + let mut group = c.benchmark_group(format!("{} Threads", total_threads)); + + let bytes = 0b1010100; + let quota = Arc::new(MemoryQuota::new(2 * threads * bytes)); + + // Alloc and free by multiple threads. + let mut handles = Vec::with_capacity(threads); + let done = Arc::new(AtomicBool::default()); + // Alloc and free take about 20ns on Intel(R) Xeon(R) Gold 6240 CPU @ 2.60GHz. + let duration = Duration::from_nanos(20); + let check_interval = Duration::from_millis(500); + let batch_work_count = check_interval.as_nanos() / duration.as_nanos(); + for _ in 0..threads { + let quota_ = quota.clone(); + let done_ = done.clone(); + handles.push(thread::spawn(move || { + loop { + if done_.load(Ordering::Relaxed) { + return; + } + for _ in 0..batch_work_count { + let _ = black_box(quota_.alloc(bytes)); + quota_.free(bytes); + } + } + })); + } + + let quota_ = quota.clone(); + group.bench_function(BenchmarkId::new("MemoryQuota", "alloc free"), |b| { + b.iter(|| { + let _ = black_box(quota.alloc(bytes)); + quota.free(bytes); + }) + }); + group.bench_function(BenchmarkId::new("OwnedAllocated", "alloc free"), |b| { + b.iter(|| { + let mut owned_quota = OwnedAllocated::new(quota_.clone()); + let _ = black_box(owned_quota.alloc(bytes)); + drop(owned_quota); + }) + }); + + done.store(true, Ordering::Relaxed); + // `map` is lazy, so join the worker threads explicitly. + for h in handles { + h.join().unwrap(); + } + group.finish(); +} + +criterion_group!( + benches, + bench_memory_quota_alloc, + bench_memory_quota_alloc_free, + bench_memory_quota_multi_threads, +); + +criterion_main!(benches); diff --git a/tests/benches/misc/coprocessor/codec/chunk/chunk.rs b/tests/benches/misc/coprocessor/codec/chunk/chunk.rs deleted file mode 100644 index 4c033f2a80d..00000000000 --- a/tests/benches/misc/coprocessor/codec/chunk/chunk.rs +++ /dev/null @@ -1,176 +0,0 @@ -// Copyright 2018 TiKV Project Authors. Licensed under Apache-2.0.
- -use std::sync::Arc; - -use arrow::{ - array, - datatypes::{self, DataType, Field}, - record_batch::RecordBatch, -}; -use tidb_query_datatype::{codec::Datum, prelude::*, FieldTypeFlag, FieldTypeTp}; -use tipb::FieldType; - -pub struct Chunk { - pub data: RecordBatch, -} - -impl Chunk { - pub fn get_datum(&self, col_id: usize, row_id: usize, field_type: &FieldType) -> Datum { - if self.data.column(col_id).is_null(row_id) { - return Datum::Null; - } - - match field_type.as_accessor().tp() { - FieldTypeTp::Tiny - | FieldTypeTp::Short - | FieldTypeTp::Int24 - | FieldTypeTp::Long - | FieldTypeTp::LongLong - | FieldTypeTp::Year => { - if field_type - .as_accessor() - .flag() - .contains(FieldTypeFlag::UNSIGNED) - { - let data = self - .data - .column(col_id) - .as_any() - .downcast_ref::<array::UInt64Array>() - .unwrap(); - - Datum::U64(data.value(row_id)) - } else { - let data = self - .data - .column(col_id) - .as_any() - .downcast_ref::<array::Int64Array>() - .unwrap(); - - Datum::I64(data.value(row_id)) - } - } - FieldTypeTp::Float | FieldTypeTp::Double => { - let data = self - .data - .column(col_id) - .as_any() - .downcast_ref::<array::Float64Array>() - .unwrap(); - Datum::F64(data.value(row_id)) - } - _ => unreachable!(), - } - } -} - -pub struct ChunkBuilder { - columns: Vec<ColumnsBuilder>, -} - -impl ChunkBuilder { - pub fn new(cols: usize, rows: usize) -> ChunkBuilder { - ChunkBuilder { - columns: vec![ColumnsBuilder::new(rows); cols], - } - } - - pub fn build(self, tps: &[FieldType]) -> Chunk { - let mut fields = Vec::with_capacity(tps.len()); - let mut arrays: Vec<Arc<dyn array::Array>> = Vec::with_capacity(tps.len()); - for (field_type, column) in tps.iter().zip(self.columns.into_iter()) { - match field_type.as_accessor().tp() { - FieldTypeTp::Tiny - | FieldTypeTp::Short - | FieldTypeTp::Int24 - | FieldTypeTp::Long - | FieldTypeTp::LongLong - | FieldTypeTp::Year => { - if field_type - .as_accessor() - .flag() - .contains(FieldTypeFlag::UNSIGNED) - { - let (f, d) = column.into_u64_array(); - fields.push(f); - arrays.push(d); - } else { - let (f, d) = column.into_i64_array(); - fields.push(f); - arrays.push(d); - } - } - FieldTypeTp::Float | FieldTypeTp::Double => { - let (f, d) = column.into_f64_array(); - fields.push(f); - arrays.push(d); - } - _ => unreachable!(), - }; - } - let schema = datatypes::Schema::new(fields); - let batch = RecordBatch::try_new(Arc::new(schema), arrays).unwrap(); - Chunk { data: batch } - } - - pub fn append_datum(&mut self, col_id: usize, data: Datum) { - self.columns[col_id].append_datum(data) - } -} - -#[derive(Clone)] -pub struct ColumnsBuilder { - data: Vec<Datum>, -} - -impl ColumnsBuilder { - fn new(rows: usize) -> ColumnsBuilder { - ColumnsBuilder { - data: Vec::with_capacity(rows), - } - } - - fn append_datum(&mut self, data: Datum) { - self.data.push(data) - } - - fn into_i64_array(self) -> (Field, Arc<dyn array::Array>) { - let field = Field::new("", DataType::Int64, true); - let mut data: Vec<Option<i64>> = Vec::with_capacity(self.data.len()); - for v in self.data { - match v { - Datum::Null => data.push(None), - Datum::I64(v) => data.push(Some(v)), - _ => unreachable!(), - } - } - (field, Arc::new(array::PrimitiveArray::from(data))) - } - - fn into_u64_array(self) -> (Field, Arc<dyn array::Array>) { - let field = Field::new("", DataType::UInt64, true); - let mut data: Vec<Option<u64>> = Vec::with_capacity(self.data.len()); - for v in self.data { - match v { - Datum::Null => data.push(None), - Datum::U64(v) => data.push(Some(v)), - _ => unreachable!(), - } - } - (field, Arc::new(array::PrimitiveArray::from(data))) - } - - fn into_f64_array(self) -> (Field, Arc<dyn array::Array>) { - let field = Field::new("",
DataType::Float64, true); - let mut data: Vec> = Vec::with_capacity(self.data.len()); - for v in self.data { - match v { - Datum::Null => data.push(None), - Datum::F64(v) => data.push(Some(v)), - _ => unreachable!(), - } - } - (field, Arc::new(array::PrimitiveArray::from(data))) - } -} diff --git a/tests/benches/misc/coprocessor/codec/chunk/mod.rs b/tests/benches/misc/coprocessor/codec/chunk/mod.rs deleted file mode 100644 index f956e2cb14e..00000000000 --- a/tests/benches/misc/coprocessor/codec/chunk/mod.rs +++ /dev/null @@ -1,140 +0,0 @@ -// Copyright 2018 TiKV Project Authors. Licensed under Apache-2.0. - -mod chunk; - -use test::Bencher; -use tidb_query_datatype::{ - codec::{ - chunk::{Chunk, ChunkEncoder}, - datum::Datum, - mysql::*, - }, - FieldTypeTp, -}; -use tipb::FieldType; - -#[bench] -fn bench_encode_chunk(b: &mut Bencher) { - let rows = 1024; - let fields: Vec = vec![ - FieldTypeTp::LongLong.into(), - FieldTypeTp::LongLong.into(), - FieldTypeTp::VarChar.into(), - FieldTypeTp::VarChar.into(), - FieldTypeTp::NewDecimal.into(), - FieldTypeTp::Json.into(), - ]; - let mut chunk = Chunk::new(&fields, rows); - for row_id in 0..rows { - let s = format!("{}.123435", row_id); - let bs = Datum::Bytes(s.as_bytes().to_vec()); - let dec = Datum::Dec(s.parse().unwrap()); - let json = Datum::Json(Json::from_string(s).unwrap()); - chunk.append_datum(0, &Datum::Null).unwrap(); - chunk.append_datum(1, &Datum::I64(row_id as i64)).unwrap(); - chunk.append_datum(2, &bs).unwrap(); - chunk.append_datum(3, &bs).unwrap(); - chunk.append_datum(4, &dec).unwrap(); - chunk.append_datum(5, &json).unwrap(); - } - - b.iter(|| { - let mut buf = vec![]; - buf.write_chunk(&chunk).unwrap(); - }); -} - -#[bench] -fn bench_chunk_build_tidb(b: &mut Bencher) { - let rows = 1024; - let fields: Vec = vec![FieldTypeTp::LongLong.into(), FieldTypeTp::LongLong.into()]; - - b.iter(|| { - let mut chunk = Chunk::new(&fields, rows); - for row_id in 0..rows { - chunk.append_datum(0, &Datum::Null).unwrap(); - chunk.append_datum(1, &Datum::I64(row_id as i64)).unwrap(); - } - }); -} - -#[bench] -fn bench_chunk_build_official(b: &mut Bencher) { - let rows = 1024; - let fields: Vec = vec![FieldTypeTp::LongLong.into(), FieldTypeTp::LongLong.into()]; - - b.iter(|| { - let mut chunk = chunk::ChunkBuilder::new(fields.len(), rows); - for row_id in 0..rows { - chunk.append_datum(0, Datum::Null); - chunk.append_datum(1, Datum::I64(row_id as i64)); - } - chunk.build(&fields); - }); -} - -#[bench] -fn bench_chunk_iter_tidb(b: &mut Bencher) { - let rows = 1024; - let fields: Vec = vec![FieldTypeTp::LongLong.into(), FieldTypeTp::Double.into()]; - let mut chunk = Chunk::new(&fields, rows); - for row_id in 0..rows { - if row_id & 1 == 0 { - chunk.append_datum(0, &Datum::Null).unwrap(); - } else { - chunk.append_datum(0, &Datum::I64(row_id as i64)).unwrap(); - } - chunk.append_datum(1, &Datum::F64(row_id as f64)).unwrap(); - } - - b.iter(|| { - let mut col1 = 0; - let mut col2 = 0.0; - for row in chunk.iter() { - col1 += match row.get_datum(0, &fields[0]).unwrap() { - Datum::I64(v) => v, - Datum::Null => 0, - _ => unreachable!(), - }; - col2 += match row.get_datum(1, &fields[1]).unwrap() { - Datum::F64(v) => v, - _ => unreachable!(), - }; - } - assert_eq!(col1, 262_144); - assert!(!(523_776.0 - col2).is_normal()); - }); -} - -#[bench] -fn bench_chunk_iter_official(b: &mut Bencher) { - let rows = 1024; - let fields: Vec = vec![FieldTypeTp::LongLong.into(), FieldTypeTp::Double.into()]; - let mut chunk = chunk::ChunkBuilder::new(fields.len(), rows); 
-    for row_id in 0..rows {
-        if row_id & 1 == 0 {
-            chunk.append_datum(0, Datum::Null);
-        } else {
-            chunk.append_datum(0, Datum::I64(row_id as i64));
-        }
-
-        chunk.append_datum(1, Datum::F64(row_id as f64));
-    }
-    let chunk = chunk.build(&fields);
-    b.iter(|| {
-        let (mut col1, mut col2) = (0, 0.0);
-        for row_id in 0..chunk.data.num_rows() {
-            col1 += match chunk.get_datum(0, row_id, &fields[0]) {
-                Datum::I64(v) => v,
-                Datum::Null => 0,
-                _ => unreachable!(),
-            };
-            col2 += match chunk.get_datum(1, row_id, &fields[1]) {
-                Datum::F64(v) => v,
-                _ => unreachable!(),
-            };
-        }
-        assert_eq!(col1, 262_144);
-        assert!(!(523_776.0 - col2).is_normal());
-    });
-}
diff --git a/tests/benches/misc/coprocessor/codec/mod.rs b/tests/benches/misc/coprocessor/codec/mod.rs
index 274ec362377..082f1c55894 100644
--- a/tests/benches/misc/coprocessor/codec/mod.rs
+++ b/tests/benches/misc/coprocessor/codec/mod.rs
@@ -1,6 +1,5 @@
 // Copyright 2018 TiKV Project Authors. Licensed under Apache-2.0.
-mod chunk;
 mod mysql;
 use byteorder::{BigEndian, ByteOrder, LittleEndian};
diff --git a/tests/failpoints/cases/mod.rs b/tests/failpoints/cases/mod.rs
index 9c90211c073..caf994fc1cd 100644
--- a/tests/failpoints/cases/mod.rs
+++ b/tests/failpoints/cases/mod.rs
@@ -7,14 +7,18 @@ mod test_bootstrap;
 mod test_cmd_epoch_checker;
 mod test_conf_change;
 mod test_coprocessor;
+mod test_debugger;
 mod test_disk_full;
+mod test_disk_snap_br;
 mod test_early_apply;
 mod test_encryption;
+mod test_engine;
 mod test_gc_metrics;
 mod test_gc_worker;
 mod test_hibernate;
 mod test_import_service;
 mod test_kv_service;
+mod test_life;
 mod test_local_read;
 mod test_memory_usage_limit;
 mod test_merge;
diff --git a/tests/failpoints/cases/test_async_io.rs b/tests/failpoints/cases/test_async_io.rs
index 3d53b9c5f14..8ce349805b0 100644
--- a/tests/failpoints/cases/test_async_io.rs
+++ b/tests/failpoints/cases/test_async_io.rs
@@ -8,13 +8,15 @@ use std::{
 use pd_client::PdClient;
 use raft::eraftpb::MessageType;
 use test_raftstore::*;
+use test_raftstore_macro::test_case;
 use tikv_util::HandyRwLock;
 
 // Test if the entries can be committed and applied on followers even when
 // leader's io is paused.
-#[test]
+#[test_case(test_raftstore::new_node_cluster)]
+#[test_case(test_raftstore_v2::new_node_cluster)]
 fn test_async_io_commit_without_leader_persist() {
-    let mut cluster = new_node_cluster(0, 3);
+    let mut cluster = new_cluster(0, 3);
     cluster.cfg.raft_store.cmd_batch_concurrent_ready_max_count = 0;
     cluster.cfg.raft_store.store_io_pool_size = 2;
     let pd_client = Arc::clone(&cluster.pd_client);
@@ -49,9 +51,10 @@ fn test_async_io_commit_without_leader_persist() {
 
 /// Test if the leader delays its destroy after applying conf change to
 /// remove itself.
-#[test]
+#[test_case(test_raftstore::new_node_cluster)]
+#[test_case(test_raftstore_v2::new_node_cluster)]
 fn test_async_io_delay_destroy_after_conf_change() {
-    let mut cluster = new_node_cluster(0, 3);
+    let mut cluster = new_cluster(0, 3);
     cluster.cfg.raft_store.store_io_pool_size = 2;
     let pd_client = Arc::clone(&cluster.pd_client);
     pd_client.disable_default_operator();
@@ -93,6 +96,9 @@ fn test_async_io_delay_destroy_after_conf_change() {
 
 /// Test if the peer can be destroyed when it receives a tombstone msg and
 /// its snapshot is persisting.
+///
+/// Note: snapshot flow is changed, so partitioned-raft-kv does not support this
+/// test.
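[Editor's note: the `#[test_case(...)]` conversions above repeat throughout this PR: one test body is stamped out once per cluster constructor (raftstore v1 and v2), with `new_cluster` bound to that constructor. Because the two cluster types expose the same methods without sharing a trait, shared helpers are also rewritten as `macro_rules!` macros further down, which duck-type at expansion time. A minimal sketch of that second pattern, using simplified stand-in types rather than the real `test_raftstore` ones:]

```rust
// Two cluster flavors with the same method surface but no common trait.
struct V1Cluster;
struct V2Cluster;

impl V1Cluster {
    fn query_leader(&self, store: u64, region: u64) -> u64 {
        store + region
    }
}
impl V2Cluster {
    fn query_leader(&self, store: u64, region: u64) -> u64 {
        store * region
    }
}

// A generic fn would need a trait bound; a macro only needs the method to
// exist on whatever expression it is handed.
macro_rules! leader_of {
    ($cluster:expr, $store:expr, $region:expr) => {
        $cluster.query_leader($store, $region)
    };
}

fn main() {
    assert_eq!(leader_of!(V1Cluster, 1, 1), 2);
    assert_eq!(leader_of!(V2Cluster, 2, 3), 6);
}
```

The same reasoning explains `assert_region_leader_changed!` and `ensure_disk_usage_is_reported!` in the disk-full tests below.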
 #[test]
 fn test_async_io_cannot_destroy_when_persist_snapshot() {
     let mut cluster = new_node_cluster(0, 3);
@@ -176,6 +182,9 @@ fn test_async_io_cannot_destroy_when_persist_snapshot() {
 }
 
 /// Test if the peer can handle ready when its snapshot is persisting.
+///
+/// Note: snapshot flow is changed, so partitioned-raft-kv does not support this
+/// test.
 #[test]
 fn test_async_io_cannot_handle_ready_when_persist_snapshot() {
     let mut cluster = new_node_cluster(0, 3);
diff --git a/tests/failpoints/cases/test_cmd_epoch_checker.rs b/tests/failpoints/cases/test_cmd_epoch_checker.rs
index 73bc741d9bb..8af8e29f3ac 100644
--- a/tests/failpoints/cases/test_cmd_epoch_checker.rs
+++ b/tests/failpoints/cases/test_cmd_epoch_checker.rs
@@ -10,7 +10,7 @@ use kvproto::raft_cmdpb::{RaftCmdRequest, RaftCmdResponse};
 use raft::eraftpb::MessageType;
 use raftstore::store::msg::*;
 use test_raftstore::*;
-use tikv_util::{mpsc::future, HandyRwLock};
+use tikv_util::{future::block_on_timeout, mpsc::future, HandyRwLock};
 
 struct CbReceivers {
     proposed: mpsc::Receiver<()>,
@@ -399,9 +399,7 @@ fn test_accept_proposal_during_conf_change() {
     let conf_change_fp = "apply_on_conf_change_all_1";
     fail::cfg(conf_change_fp, "pause").unwrap();
     let mut add_peer_rx = cluster.async_add_peer(r, new_peer(2, 2)).unwrap();
-    add_peer_rx
-        .recv_timeout(Duration::from_millis(100))
-        .unwrap_err();
+    block_on_timeout(add_peer_rx.as_mut(), Duration::from_millis(100)).unwrap_err();
 
     // Conf change doesn't affect proposals.
     let write_req = make_write_req(&mut cluster, b"k");
@@ -419,8 +417,7 @@ fn test_accept_proposal_during_conf_change() {
     fail::remove(conf_change_fp);
 
     assert!(
-        !add_peer_rx
-            .recv_timeout(Duration::from_secs(1))
+        !block_on_timeout(add_peer_rx, Duration::from_secs(1))
             .unwrap()
             .get_header()
             .has_error()
diff --git a/tests/failpoints/cases/test_coprocessor.rs b/tests/failpoints/cases/test_coprocessor.rs
index 0710f778aa7..be9d978b23a 100644
--- a/tests/failpoints/cases/test_coprocessor.rs
+++ b/tests/failpoints/cases/test_coprocessor.rs
@@ -31,8 +31,15 @@ fn test_deadline() {
     fail::cfg("deadline_check_fail", "return()").unwrap();
 
     let resp = handle_request(&endpoint, req);
-
-    assert!(resp.get_other_error().contains("exceeding the deadline"));
+    let region_err = resp.get_region_error();
+    assert_eq!(
+        region_err.get_server_is_busy().reason,
+        "deadline is exceeded".to_string()
+    );
+    assert_eq!(
+        region_err.get_message(),
+        "Coprocessor task terminated due to exceeding the deadline"
+    );
 }
 
 #[test]
@@ -46,8 +53,15 @@ fn test_deadline_2() {
     fail::cfg("rockskv_async_snapshot", "panic").unwrap();
     fail::cfg("deadline_check_fail", "return()").unwrap();
     let resp = handle_request(&endpoint, req);
-
-    assert!(resp.get_other_error().contains("exceeding the deadline"));
+    let region_err = resp.get_region_error();
+    assert_eq!(
+        region_err.get_server_is_busy().reason,
+        "deadline is exceeded".to_string()
+    );
+    assert_eq!(
+        region_err.get_message(),
+        "Coprocessor task terminated due to exceeding the deadline"
+    );
 }
 
 /// Test deadline exceeded when request is handling
@@ -80,12 +94,14 @@ fn test_deadline_3() {
     let mut resp = SelectResponse::default();
     resp.merge_from_bytes(cop_resp.get_data()).unwrap();
 
-    assert!(
-        cop_resp.other_error.contains("exceeding the deadline")
-            || resp
-                .get_error()
-                .get_msg()
-                .contains("exceeding the deadline")
+    let region_err = cop_resp.get_region_error();
+    assert_eq!(
+        region_err.get_server_is_busy().reason,
+        "deadline is exceeded".to_string()
+    );
+    assert_eq!(
+        region_err.get_message(),
+ "Coprocessor task terminated due to exceeding the deadline" ); } diff --git a/tests/failpoints/cases/test_debugger.rs b/tests/failpoints/cases/test_debugger.rs new file mode 100644 index 00000000000..f70ebcb6d32 --- /dev/null +++ b/tests/failpoints/cases/test_debugger.rs @@ -0,0 +1,147 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. + +use std::path::Path; + +use engine_traits::{RaftEngine, RaftLogBatch, TabletRegistry}; +use kvproto::{ + kvrpcpb::MvccInfo, + metapb, + raft_serverpb::{PeerState, RegionLocalState}, +}; +use raft_log_engine::RaftLogEngine; +use test_raftstore::new_peer; +use tikv::{ + config::TikvConfig, + server::{debug::Debugger, debug2::new_debugger, KvEngineFactoryBuilder}, + storage::{txn::tests::must_prewrite_put, TestEngineBuilder}, +}; + +const INITIAL_TABLET_INDEX: u64 = 5; +const INITIAL_APPLY_INDEX: u64 = 5; + +// Prepare some data +// Region meta range and rocksdb range of each region: +// Region 1: k01 .. k04 rocksdb: zk00 .. zk04 +// Region 2: k05 .. k09 rocksdb: zk05 .. zk09 +// Region 3: k10 .. k14 rocksdb: zk10 .. zk14 +// Region 4: k15 .. k19 rocksdb: zk15 .. zk19 +// Region 5: k20 .. k24 rocksdb: zk20 .. zk24 +// Region 6: k26 .. k27 rocksdb: zk25 .. zk29 +fn prepare_data_on_disk(path: &Path) { + let mut cfg = TikvConfig::default(); + cfg.storage.data_dir = path.to_str().unwrap().to_string(); + cfg.raft_store.raftdb_path = cfg.infer_raft_db_path(None).unwrap(); + cfg.raft_engine.mut_config().dir = cfg.infer_raft_engine_path(None).unwrap(); + cfg.gc.enable_compaction_filter = false; + let cache = cfg.storage.block_cache.build_shared_cache(); + let env = cfg.build_shared_rocks_env(None, None).unwrap(); + + let factory = KvEngineFactoryBuilder::new(env, &cfg, cache, None).build(); + let reg = TabletRegistry::new(Box::new(factory), path).unwrap(); + + let raft_engine = RaftLogEngine::new(cfg.raft_engine.config(), None, None).unwrap(); + let mut wb = raft_engine.log_batch(5); + for i in 0..6 { + let mut region = metapb::Region::default(); + let start_key = if i != 0 { + format!("k{:02}", i * 5) + } else { + String::from("k01") + }; + let end_key = format!("k{:02}", (i + 1) * 5); + region.set_id(i + 1); + region.set_start_key(start_key.into_bytes()); + region.set_end_key(end_key.into_bytes()); + let mut region_state = RegionLocalState::default(); + region_state.set_tablet_index(INITIAL_TABLET_INDEX); + if region.get_id() == 4 { + region_state.set_state(PeerState::Tombstone); + } else if region.get_id() == 6 { + region.set_start_key(b"k26".to_vec()); + region.set_end_key(b"k28".to_vec()); + } + // add dummy peer to pass verification + region.mut_peers().push(new_peer(0, 0)); + region_state.set_region(region); + + let tablet_path = reg.tablet_path(i + 1, INITIAL_TABLET_INDEX); + // Use tikv_kv::RocksEngine instead of loading tablet from registry in order to + // use prewrite method to prepare mvcc data + let mut engine = TestEngineBuilder::new().path(tablet_path).build().unwrap(); + for i in i * 5..(i + 1) * 5 { + let key = format!("zk{:02}", i); + let val = format!("val{:02}", i); + // Use prewrite only is enough for preparing mvcc data + must_prewrite_put( + &mut engine, + key.as_bytes(), + val.as_bytes(), + key.as_bytes(), + 10, + ); + } + + wb.put_region_state(i + 1, INITIAL_APPLY_INDEX, ®ion_state) + .unwrap(); + } + raft_engine.consume(&mut wb, true).unwrap(); +} + +// For simplicity, the format of the key is inline with data in +// prepare_data_on_disk +fn extract_key(key: &[u8]) -> &[u8] { + &key[1..4] +} + +#[test] +fn 
test_scan_mvcc() { + // We deliberately make region meta not match with rocksdb, set unlimited range + // compaction filter to avoid trim operation. + fail::cfg("unlimited_range_compaction_filter", "return").unwrap(); + + let dir = test_util::temp_dir("test-debugger", false); + prepare_data_on_disk(dir.path()); + let debugger = new_debugger(dir.path()); + // Test scan with bad start, end or limit. + assert!(debugger.scan_mvcc(b"z", b"", 0).is_err()); + assert!(debugger.scan_mvcc(b"z", b"x", 3).is_err()); + + let verify_scanner = + |range, scanner: &mut dyn Iterator, MvccInfo)>>| { + for i in range { + let key = format!("k{:02}", i).into_bytes(); + assert_eq!(key, extract_key(&scanner.next().unwrap().unwrap().0)); + } + }; + + // full scan + let mut scanner = debugger.scan_mvcc(b"", b"", 100).unwrap(); + verify_scanner(1..15, &mut scanner); + verify_scanner(20..25, &mut scanner); + verify_scanner(26..28, &mut scanner); + assert!(scanner.next().is_none()); + + // Range has more elements than limit + let mut scanner = debugger.scan_mvcc(b"zk01", b"zk09", 5).unwrap(); + verify_scanner(1..6, &mut scanner); + assert!(scanner.next().is_none()); + + // Range has less elements than limit + let mut scanner = debugger.scan_mvcc(b"zk07", b"zk10", 10).unwrap(); + verify_scanner(7..10, &mut scanner); + assert!(scanner.next().is_none()); + + // Start from the key where no region contains it + let mut scanner = debugger.scan_mvcc(b"zk16", b"", 100).unwrap(); + verify_scanner(20..25, &mut scanner); + verify_scanner(26..28, &mut scanner); + assert!(scanner.next().is_none()); + + // Scan a range not existed in the cluster + let mut scanner = debugger.scan_mvcc(b"zk16", b"zk19", 100).unwrap(); + assert!(scanner.next().is_none()); + + // The end key is less than the start_key of the first region + let mut scanner = debugger.scan_mvcc(b"", b"zj", 100).unwrap(); + assert!(scanner.next().is_none()); +} diff --git a/tests/failpoints/cases/test_disk_full.rs b/tests/failpoints/cases/test_disk_full.rs index bd4271be12d..d8b3fadb054 100644 --- a/tests/failpoints/cases/test_disk_full.rs +++ b/tests/failpoints/cases/test_disk_full.rs @@ -5,12 +5,12 @@ use std::{thread, time::Duration}; use kvproto::{ disk_usage::DiskUsage, kvrpcpb::{DiskFullOpt, Op}, - metapb::Region, raft_cmdpb::*, }; use raft::eraftpb::MessageType; use raftstore::store::msg::*; use test_raftstore::*; +use test_raftstore_macro::test_case; use tikv_util::{config::ReadableDuration, future::block_on_timeout, time::Instant}; fn assert_disk_full(resp: &RaftCmdResponse) { @@ -34,148 +34,147 @@ fn get_fp(usage: DiskUsage, store_id: u64) -> String { } // check the region new leader is elected. -fn assert_region_leader_changed( - cluster: &mut Cluster, - region_id: u64, - original_leader: u64, -) { - let timer = Instant::now(); - loop { - if timer.saturating_elapsed() > Duration::from_secs(5) { - panic!("Leader cannot change when the only disk full node is leader"); +macro_rules! 
assert_region_leader_changed { + ($cluster:expr, $region_id:expr, $original_leader:expr) => {{ + let timer = Instant::now(); + loop { + if timer.saturating_elapsed() > Duration::from_secs(5) { + panic!("Leader cannot change when the only disk full node is leader"); + } + let new_leader = $cluster.query_leader(1, $region_id, Duration::from_secs(1)); + if new_leader.is_none() { + sleep_ms(10); + continue; + } + if new_leader.unwrap().get_id() == $original_leader { + sleep_ms(10); + continue; + } else { + break; + } } - let new_leader = cluster.query_leader(1, region_id, Duration::from_secs(1)); - if new_leader.is_none() { - sleep_ms(10); - continue; - } - if new_leader.unwrap().get_id() == original_leader { - sleep_ms(10); - continue; - } else { - break; - } - } + }}; } -fn ensure_disk_usage_is_reported( - cluster: &mut Cluster, - peer_id: u64, - store_id: u64, - region: &Region, -) { - let peer = new_peer(store_id, peer_id); - let key = region.get_start_key(); - let ch = async_read_on_peer(cluster, peer, region.clone(), key, true, true); - block_on_timeout(ch, Duration::from_secs(1)).unwrap(); +macro_rules! ensure_disk_usage_is_reported { + ($cluster:expr, $peer_id:expr, $store_id:expr, $region:expr) => {{ + let peer = new_peer($store_id, $peer_id); + let key = $region.get_start_key(); + let ch = async_read_on_peer($cluster, peer, $region.clone(), key, true, true); + block_on_timeout(ch, Duration::from_secs(1)).unwrap(); + }}; } -fn test_disk_full_leader_behaviors(usage: DiskUsage) { - let mut cluster = new_node_cluster(0, 3); - cluster.pd_client.disable_default_operator(); - cluster.run(); - - // To ensure all replicas are not pending. - cluster.must_put(b"k1", b"v1"); - must_get_equal(&cluster.get_engine(1), b"k1", b"v1"); - must_get_equal(&cluster.get_engine(2), b"k1", b"v1"); - must_get_equal(&cluster.get_engine(3), b"k1", b"v1"); - - cluster.must_transfer_leader(1, new_peer(1, 1)); - fail::cfg(get_fp(usage, 1), "return").unwrap(); - - // Test new normal proposals won't be allowed when disk is full. - let old_last_index = cluster.raft_local_state(1, 1).last_index; - let mut rx = cluster.async_put(b"k2", b"v2").unwrap(); - assert_disk_full(&rx.recv_timeout(Duration::from_secs(2)).unwrap()); - let new_last_index = cluster.raft_local_state(1, 1).last_index; - assert_eq!(old_last_index, new_last_index); - - assert_region_leader_changed(&mut cluster, 1, 1); - fail::remove(get_fp(usage, 1)); - cluster.must_transfer_leader(1, new_peer(1, 1)); - fail::cfg(get_fp(usage, 1), "return").unwrap(); - - // merge/split is only allowed on disk almost full. - if usage != DiskUsage::AlreadyFull { - // Test split must be allowed when disk is full. - let region = cluster.get_region(b"k1"); - cluster.must_split(®ion, b"k1"); +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] +fn test_disk_full_leader_behaviors() { + for usage in [DiskUsage::AlmostFull, DiskUsage::AlreadyFull] { + let mut cluster = new_cluster(0, 3); + cluster.cfg.raft_store.gc_peer_check_interval = ReadableDuration::millis(500); // set gc duration for v2 + cluster.pd_client.disable_default_operator(); + cluster.run(); + + // To ensure all replicas are not pending. 
+        cluster.must_put(b"k1", b"v1");
+        must_get_equal(&cluster.get_engine(1), b"k1", b"v1");
+        must_get_equal(&cluster.get_engine(2), b"k1", b"v1");
+        must_get_equal(&cluster.get_engine(3), b"k1", b"v1");
+
+        cluster.must_transfer_leader(1, new_peer(1, 1));
+        fail::cfg(get_fp(usage, 1), "return").unwrap();
+
+        // Test new normal proposals won't be allowed when disk is full.
+        let old_last_index = cluster.raft_local_state(1, 1).last_index;
+        let rx = cluster.async_put(b"k2", b"v2").unwrap();
+        assert_disk_full(&block_on_timeout(rx, Duration::from_secs(2)).unwrap());
+        let new_last_index = cluster.raft_local_state(1, 1).last_index;
+        assert_eq!(old_last_index, new_last_index);
+
+        assert_region_leader_changed!(&cluster, 1, 1);
+        fail::remove(get_fp(usage, 1));
+        cluster.must_transfer_leader(1, new_peer(1, 1));
+        fail::cfg(get_fp(usage, 1), "return").unwrap();
+
+        // merge/split is only allowed when the disk is almost full.
+        if usage != DiskUsage::AlreadyFull {
+            // Test split must be allowed when disk is full.
+            let region = cluster.get_region(b"k1");
+            cluster.must_split(&region, b"k1");
+        }
+        // Test transfer leader should be allowed.
+        cluster.must_transfer_leader(1, new_peer(2, 2));
+
+        // Transfer the leadership back to store 1.
+        fail::remove(get_fp(usage, 1));
+        cluster.must_transfer_leader(1, new_peer(1, 1));
+        fail::cfg(get_fp(usage, 1), "return").unwrap();
+
+        // Test remove peer should be allowed.
+        cluster.pd_client.must_remove_peer(1, new_peer(3, 3));
+        // Sleep for a while until the disk usage and peer changes have been synced.
+        thread::sleep(Duration::from_secs(1));
+        must_get_none(&cluster.get_engine(3), b"k1");
+
+        // Test add peer should be allowed. It must use a higher peer id in v2.
+        cluster.pd_client.must_add_peer(1, new_peer(3, 4));
+        must_get_equal(&cluster.get_engine(3), b"k1", b"v1");
+
+        fail::remove(get_fp(usage, 1));
+        // Sleep for a while before the next case so that it starts from a clean state.
+        thread::sleep(Duration::from_secs(1));
     }
-    // Test transfer leader should be allowed.
-    cluster.must_transfer_leader(1, new_peer(2, 2));
-
-    // Transfer the leadership back to store 1.
-    fail::remove(get_fp(usage, 1));
-    cluster.must_transfer_leader(1, new_peer(1, 1));
-    fail::cfg(get_fp(usage, 1), "return").unwrap();
-
-    // Test remove peer should be allowed.
-    cluster.pd_client.must_remove_peer(1, new_peer(3, 3));
-    must_get_none(&cluster.get_engine(3), b"k1");
-
-    // Test add peer should be allowed.
-    cluster.pd_client.must_add_peer(1, new_peer(3, 3));
-    must_get_equal(&cluster.get_engine(3), b"k1", b"v1");
-
-    fail::remove(get_fp(usage, 1));
-}
-
-#[test]
-fn test_disk_full_for_region_leader() {
-    test_disk_full_leader_behaviors(DiskUsage::AlmostFull);
-    test_disk_full_leader_behaviors(DiskUsage::AlreadyFull);
-}
-
-fn test_disk_full_follower_behaviors(usage: DiskUsage) {
-    let mut cluster = new_node_cluster(0, 3);
-    cluster.pd_client.disable_default_operator();
-    cluster.run();
-
-    // To ensure all replicas are not pending.
-    cluster.must_put(b"k1", b"v1");
-    must_get_equal(&cluster.get_engine(1), b"k1", b"v1");
-    must_get_equal(&cluster.get_engine(2), b"k1", b"v1");
-    must_get_equal(&cluster.get_engine(3), b"k1", b"v1");
-
-    cluster.must_transfer_leader(1, new_peer(1, 1));
-    fail::cfg(get_fp(usage, 2), "return").unwrap();
-
-    // Test followers will reject pre-transfer-leader command.
-    let epoch = cluster.get_region_epoch(1);
-    let transfer = new_admin_request(1, &epoch, new_transfer_leader_cmd(new_peer(2, 2)));
-    cluster
-        .call_command_on_leader(transfer, Duration::from_secs(3))
-        .unwrap();
-    assert_eq!(cluster.leader_of_region(1).unwrap(), new_peer(1, 1));
-    cluster.must_put(b"k2", b"v2");
-
-    // Test leader shouldn't append entries to disk full followers.
-    let old_last_index = cluster.raft_local_state(1, 2).last_index;
-    cluster.must_put(b"k3", b"v3");
-    let new_last_index = cluster.raft_local_state(1, 2).last_index;
-    assert_eq!(old_last_index, new_last_index);
-    must_get_none(&cluster.get_engine(2), b"k3");
-
-    // Test followers will response votes when disk is full.
-    cluster.add_send_filter(CloneFilterFactory(
-        RegionPacketFilter::new(1, 1)
-            .direction(Direction::Send)
-            .msg_type(MessageType::MsgRequestVoteResponse),
-    ));
-    cluster.must_transfer_leader(1, new_peer(3, 3));
-
-    fail::remove(get_fp(usage, 2));
-}
-
-#[test]
-fn test_disk_full_for_region_follower() {
-    test_disk_full_follower_behaviors(DiskUsage::AlmostFull);
-    test_disk_full_follower_behaviors(DiskUsage::AlreadyFull);
+#[test_case(test_raftstore::new_node_cluster)]
+#[test_case(test_raftstore_v2::new_node_cluster)]
+fn test_disk_full_follower_behaviors() {
+    for usage in [DiskUsage::AlmostFull, DiskUsage::AlreadyFull] {
+        let mut cluster = new_cluster(0, 3);
+        cluster.pd_client.disable_default_operator();
+        cluster.run();
+
+        // To ensure all replicas are not pending.
+        cluster.must_put(b"k1", b"v1");
+        must_get_equal(&cluster.get_engine(1), b"k1", b"v1");
+        must_get_equal(&cluster.get_engine(2), b"k1", b"v1");
+        must_get_equal(&cluster.get_engine(3), b"k1", b"v1");
+
+        cluster.must_transfer_leader(1, new_peer(1, 1));
+        fail::cfg(get_fp(usage, 2), "return").unwrap();
+
+        // Test followers will reject pre-transfer-leader command.
+        let epoch = cluster.get_region_epoch(1);
+        let transfer = new_admin_request(1, &epoch, new_transfer_leader_cmd(new_peer(2, 2)));
+        cluster
+            .call_command_on_leader(transfer, Duration::from_secs(3))
+            .unwrap();
+        assert_eq!(cluster.leader_of_region(1).unwrap(), new_peer(1, 1));
+        cluster.must_put(b"k2", b"v2");
+
+        // Test leader shouldn't append entries to disk full followers.
+        let old_last_index = cluster.raft_local_state(1, 2).last_index;
+        cluster.must_put(b"k3", b"v3");
+        let new_last_index = cluster.raft_local_state(1, 2).last_index;
+        assert_eq!(old_last_index, new_last_index);
+        must_get_none(&cluster.get_engine(2), b"k3");
+
+        // Test followers will respond to votes when disk is full.
+        cluster.add_send_filter(CloneFilterFactory(
+            RegionPacketFilter::new(1, 1)
+                .direction(Direction::Send)
+                .msg_type(MessageType::MsgRequestVoteResponse),
+        ));
+        cluster.must_transfer_leader(1, new_peer(3, 3));
+
+        fail::remove(get_fp(usage, 2));
+    }
 }
 
-fn test_disk_full_txn_behaviors(usage: DiskUsage) {
-    let mut cluster = new_server_cluster(0, 3);
+#[test_case(test_raftstore::new_server_cluster)]
+#[test_case(test_raftstore_v2::new_server_cluster)]
+fn test_disk_full_txn_behaviors() {
+    let usage = DiskUsage::AlmostFull;
+    let mut cluster = new_cluster(0, 3);
     cluster.pd_client.disable_default_operator();
     cluster.run();
 
@@ -199,7 +198,7 @@ fn test_disk_full_txn_behaviors(usage: DiskUsage) {
         DiskFullOpt::NotAllowedOnFull,
     );
     assert!(res.get_region_error().has_disk_full());
-    assert_region_leader_changed(&mut cluster, 1, 1);
+    assert_region_leader_changed!(&cluster, 1, 1);
 
     fail::remove(get_fp(usage, 1));
     cluster.must_transfer_leader(1, new_peer(1, 1));
@@ -269,16 +268,13 @@ fn test_disk_full_txn_behaviors(usage: DiskUsage) {
     fail::remove(get_fp(usage, 1));
 }
 
-#[test]
-fn test_disk_full_for_txn_operations() {
-    test_disk_full_txn_behaviors(DiskUsage::AlmostFull);
-}
-
-#[test]
+#[test_case(test_raftstore::new_node_cluster)]
+#[test_case(test_raftstore_v2::new_node_cluster)]
 fn test_majority_disk_full() {
-    let mut cluster = new_node_cluster(0, 3);
+    let mut cluster = new_cluster(0, 3);
     // To ensure the thread has full store disk usage infomation.
     cluster.cfg.raft_store.store_batch_system.pool_size = 1;
+    cluster.cfg.raft_store.gc_peer_check_interval = ReadableDuration::millis(500); // set gc duration for v2
     cluster.pd_client.disable_default_operator();
     cluster.run();
 
@@ -295,12 +291,12 @@ fn test_majority_disk_full() {
     // To ensure followers have reported disk usages to the leader.
     for i in 1..3 {
         fail::cfg(get_fp(DiskUsage::AlmostFull, i + 1), "return").unwrap();
-        ensure_disk_usage_is_reported(&mut cluster, i + 1, i + 1, &region);
+        ensure_disk_usage_is_reported!(&mut cluster, i + 1, i + 1, &region);
     }
 
     // Normal proposals will be rejected because of majority peers' disk full.
-    let mut ch = cluster.async_put(b"k2", b"v2").unwrap();
-    let resp = ch.recv_timeout(Duration::from_secs(1)).unwrap();
+    let ch = cluster.async_put(b"k2", b"v2").unwrap();
+    let resp = block_on_timeout(ch, Duration::from_secs(1)).unwrap();
     assert_eq!(disk_full_stores(&resp), vec![2, 3]);
 
     // Proposals with special `DiskFullOpt`s can be accepted even if all peers are
@@ -310,8 +306,8 @@ fn test_majority_disk_full() {
     let put = new_request(1, epoch.clone(), reqs, false);
     let mut opts = RaftCmdExtraOpts::default();
     opts.disk_full_opt = DiskFullOpt::AllowedOnAlmostFull;
-    let mut ch = cluster.async_request_with_opts(put, opts).unwrap();
-    let resp = ch.recv_timeout(Duration::from_secs(1)).unwrap();
+    let ch = cluster.async_request_with_opts(put, opts).unwrap();
+    let resp = block_on_timeout(ch, Duration::from_secs(1)).unwrap();
     assert!(!resp.get_header().has_error());
 
     // Reset disk full status for peer 2 and 3. 2 follower reads must success
     // new disk usages are reported.
     for i in 1..3 {
         fail::remove(get_fp(DiskUsage::AlmostFull, i + 1));
-        ensure_disk_usage_is_reported(&mut cluster, i + 1, i + 1, &region);
+        ensure_disk_usage_is_reported!(&mut cluster, i + 1, i + 1, &region);
         must_get_equal(&cluster.get_engine(i + 1), b"k3", b"v3");
     }
 
     // To ensure followers have reported disk usages to the leader.
     for i in 1..3 {
         fail::cfg(get_fp(DiskUsage::AlreadyFull, i + 1), "return").unwrap();
-        ensure_disk_usage_is_reported(&mut cluster, i + 1, i + 1, &region);
+        ensure_disk_usage_is_reported!(&mut cluster, i + 1, i + 1, &region);
     }
 
     // Proposals with special `DiskFullOpt`s will still be rejected if majority
@@ -335,17 +331,19 @@ fn test_majority_disk_full() {
     let put = new_request(1, epoch.clone(), reqs, false);
     let mut opts = RaftCmdExtraOpts::default();
     opts.disk_full_opt = DiskFullOpt::AllowedOnAlmostFull;
-    let mut ch = cluster.async_request_with_opts(put, opts).unwrap();
-    let resp = ch.recv_timeout(Duration::from_secs(10)).unwrap();
+    let ch = cluster.async_request_with_opts(put, opts).unwrap();
+    let resp = block_on_timeout(ch, Duration::from_secs(10)).unwrap();
     assert_eq!(disk_full_stores(&resp), vec![2, 3]);
 
     // Peer 2 disk usage changes from already full to almost full.
     fail::remove(get_fp(DiskUsage::AlreadyFull, 2));
     fail::cfg(get_fp(DiskUsage::AlmostFull, 2), "return").unwrap();
-    ensure_disk_usage_is_reported(&mut cluster, 2, 2, &region);
+    ensure_disk_usage_is_reported!(&mut cluster, 2, 2, &region);
 
-    // Configuration change should be alloed.
+    // Configuration change should be allowed.
     cluster.pd_client.must_remove_peer(1, new_peer(2, 2));
+    // Sleep for a while until the disk usage and peer changes have been synced.
+    thread::sleep(Duration::from_secs(1));
 
     // After the last configuration change is applied, the raft group will be like
     // `[(1, DiskUsage::AlmostFull), (3, DiskUsage::AlreadyFull)]`. So no more
@@ -354,8 +352,8 @@ fn test_majority_disk_full() {
     let put = new_request(1, epoch, reqs, false);
     let mut opts = RaftCmdExtraOpts::default();
     opts.disk_full_opt = DiskFullOpt::AllowedOnAlmostFull;
-    let mut ch = cluster.async_request_with_opts(put, opts).unwrap();
-    let resp = ch.recv_timeout(Duration::from_secs(1)).unwrap();
+    let ch = cluster.async_request_with_opts(put, opts).unwrap();
+    let resp = block_on_timeout(ch, Duration::from_secs(1)).unwrap();
     assert_eq!(disk_full_stores(&resp), vec![3]);
 
     for i in 0..3 {
@@ -364,9 +362,10 @@ fn test_majority_disk_full() {
     }
 }
 
-#[test]
+#[test_case(test_raftstore::new_node_cluster)]
+#[test_case(test_raftstore_v2::new_node_cluster)]
 fn test_disk_full_followers_with_hibernate_regions() {
-    let mut cluster = new_node_cluster(0, 2);
+    let mut cluster = new_cluster(0, 2);
     // To ensure the thread has full store disk usage infomation.
     cluster.cfg.raft_store.store_batch_system.pool_size = 1;
     cluster.pd_client.disable_default_operator();
@@ -391,31 +390,13 @@ fn test_disk_full_followers_with_hibernate_regions() {
     must_get_equal(&cluster.get_engine(2), b"k2", b"v2");
 }
 
-// check the region new leader is elected.
-fn assert_region_merged<T: Simulator>(
-    cluster: &mut Cluster<T>,
-    left_region_key: &[u8],
-    right_region_key: &[u8],
-) {
-    let timer = Instant::now();
-    loop {
-        if timer.saturating_elapsed() > Duration::from_secs(5) {
-            panic!("region merge failed");
-        }
-        let region_left = cluster.get_region(left_region_key);
-        let region_right = cluster.get_region(right_region_key);
-        if region_left.get_id() != region_right.get_id() {
-            sleep_ms(10);
-            continue;
-        } else {
-            break;
-        }
-    }
-}
-
-#[test]
+// #[test_case(test_raftstore_v2::new_server_cluster)]
+// FIXME: #[test_case(test_raftstore_v2::new_server_cluster)]
+// In v2 `must_try_merge` always returns an error. Also the last `must_merge`
+// sometimes cannot get an updated min_matched.
+#[test_case(test_raftstore::new_server_cluster)]
 fn test_merge_on_majority_disk_full() {
-    let mut cluster = new_server_cluster(0, 3);
+    let mut cluster = new_cluster(0, 3);
     // To ensure the thread has full store disk usage infomation.
     cluster.cfg.raft_store.store_batch_system.pool_size = 1;
     cluster.pd_client.disable_default_operator();
@@ -448,23 +429,42 @@ fn test_merge_on_majority_disk_full() {
         fail::cfg(get_fp(DiskUsage::AlmostFull, i), "return").unwrap();
     }
     for peer in region1.get_peers().iter() {
-        ensure_disk_usage_is_reported(&mut cluster, peer.get_id(), peer.get_store_id(), &region1);
+        ensure_disk_usage_is_reported!(&mut cluster, peer.get_id(), peer.get_store_id(), &region1);
     }
     for peer in region2.get_peers().iter() {
-        ensure_disk_usage_is_reported(&mut cluster, peer.get_id(), peer.get_store_id(), &region2);
+        ensure_disk_usage_is_reported!(&mut cluster, peer.get_id(), peer.get_store_id(), &region2);
     }
 
     cluster.must_try_merge(region1.get_id(), region2.get_id());
+
+    // Check that the two regions have been merged.
+    let assert_region_merged = |left_region_key: &[u8], right_region_key: &[u8]| {
+        let timer = Instant::now();
+        loop {
+            if timer.saturating_elapsed() > Duration::from_secs(5) {
+                panic!("region merge failed");
+            }
+            let region_left = cluster.get_region(left_region_key);
+            let region_right = cluster.get_region(right_region_key);
+            if region_left.get_id() != region_right.get_id() {
+                sleep_ms(10);
+                continue;
+            } else {
+                break;
+            }
+        }
+    };
-    assert_region_merged(&mut cluster, b"k1", b"k3");
+    assert_region_merged(b"k1", b"k3");
 
     for i in 1..3 {
         fail::remove(get_fp(DiskUsage::AlmostFull, i));
     }
 }
 
-#[test]
+#[test_case(test_raftstore::new_server_cluster)]
+#[test_case(test_raftstore_v2::new_server_cluster)]
 fn test_almost_and_already_full_behavior() {
-    let mut cluster = new_server_cluster(0, 5);
+    let mut cluster = new_cluster(0, 5);
     // To ensure the thread has full store disk usage infomation.
     cluster.cfg.raft_store.store_batch_system.pool_size = 1;
     cluster.pd_client.disable_default_operator();
@@ -481,7 +481,7 @@ fn test_almost_and_already_full_behavior() {
         fail::cfg(get_fp(DiskUsage::AlreadyFull, i), "return").unwrap();
     }
     for i in 1..5 {
-        ensure_disk_usage_is_reported(&mut cluster, i + 1, i + 1, &region);
+        ensure_disk_usage_is_reported!(&mut cluster, i + 1, i + 1, &region);
     }
 
     let lead_client = PeerClient::new(&cluster, 1, new_peer(1, 1));
@@ -521,29 +521,10 @@ fn test_almost_and_already_full_behavior() {
     }
 }
 
-fn wait_down_peers_reported<T: Simulator>(
-    cluster: &Cluster<T>,
-    total_down_count: u64,
-    target_report_peer: u64,
-) {
-    let mut peers = cluster.get_down_peers();
-    let timer = Instant::now();
-    loop {
-        if timer.saturating_elapsed() > Duration::from_secs(5) {
-            panic!("Leader cannot change when the only disk full node is leader");
-        }
-
-        if peers.len() == total_down_count as usize && peers.contains_key(&target_report_peer) {
-            return;
-        }
-        sleep_ms(10);
-        peers = cluster.get_down_peers();
-    }
-}
-
-#[test]
+#[test_case(test_raftstore::new_server_cluster)]
+#[test_case(test_raftstore_v2::new_server_cluster)]
 fn test_down_node_when_disk_full() {
-    let mut cluster = new_server_cluster(0, 5);
+    let mut cluster = new_cluster(0, 5);
     // To ensure the thread has full store disk usage infomation.
     cluster.cfg.raft_store.store_batch_system.pool_size = 1;
     cluster.cfg.raft_store.max_peer_down_duration = ReadableDuration::secs(1);
@@ -555,7 +536,7 @@ fn test_down_node_when_disk_full() {
     let region = cluster.get_region(b"k1");
     for i in 3..6 {
         fail::cfg(get_fp(DiskUsage::AlmostFull, i), "return").unwrap();
-        ensure_disk_usage_is_reported(&mut cluster, i, i, &region);
+        ensure_disk_usage_is_reported!(&mut cluster, i, i, &region);
     }
 
     let lead_client = PeerClient::new(&cluster, 1, new_peer(1, 1));
@@ -574,7 +555,23 @@ fn test_down_node_when_disk_full() {
     );
 
     cluster.stop_node(2);
-    wait_down_peers_reported(&cluster, 1, 2u64);
+
+    let wait_down_peers_reported = |total_down_count: u64, target_report_peer: u64| {
+        let mut peers = cluster.get_down_peers();
+        let timer = Instant::now();
+        loop {
+            if timer.saturating_elapsed() > Duration::from_secs(5) {
+                panic!("down peers were not reported in time");
+            }
+
+            if peers.len() == total_down_count as usize && peers.contains_key(&target_report_peer) {
+                return;
+            }
+            sleep_ms(10);
+            peers = cluster.get_down_peers();
+        }
+    };
+    wait_down_peers_reported(1u64, 2u64);
 
     let prewrite_ts = get_tso(&cluster.pd_client);
     let res = lead_client.try_kv_prewrite(
diff --git a/tests/failpoints/cases/test_disk_snap_br.rs b/tests/failpoints/cases/test_disk_snap_br.rs
new file mode 100644
index 00000000000..83956aa9367
--- /dev/null
+++ b/tests/failpoints/cases/test_disk_snap_br.rs
@@ -0,0 +1,42 @@
+// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0.
+
+// FIXME: Now, for making sure there isn't a regression after the advanced
+// prepare patch (anyway it is just a patch...), we won't reject the
+// `CommitMerge` command, or the client may wait forever for the pending
+// admin command to finish.
+//
+// Not rejecting the command won't break consistency (at least won't
+// make things worse), but will break the case: this case itself wants to prove
+// that the `CommitMerge` won't be proposed.
+#[test]
+#[ignore = "See the comment of `test_merge`"]
+fn test_merge() {
+    use std::time::Duration;
+
+    use test_backup::disk_snap::{assert_success, Suite};
+
+    let mut suite = Suite::new(1);
+    suite.split(b"k");
+    let mut source = suite.cluster.get_region(b"a");
+    let target = suite.cluster.get_region(b"z");
+    assert_ne!(source.id, target.id);
+    fail::cfg("on_schedule_merge", "pause").unwrap();
+    let resp = suite.cluster.try_merge(source.id, target.id);
+    assert_success(&resp);
+    let mut call = suite.prepare_backup(1);
+    call.prepare(60);
+    fail::remove("on_schedule_merge");
+    // Manually "apply" the prepare merge on region epoch.
+    source.mut_region_epoch().set_conf_ver(2);
+    source.mut_region_epoch().set_version(3);
+    call.wait_apply([&source, &target].into_iter().cloned());
+    let source = suite.cluster.get_region(b"a");
+    let target = suite.cluster.get_region(b"z");
+    assert_ne!(source.id, target.id);
+    suite.nodes[&1].rejector.reset();
+    test_util::eventually(Duration::from_secs(1), Duration::from_secs(10), || {
+        let source = suite.cluster.get_region(b"a");
+        let target = suite.cluster.get_region(b"z");
+        source.id == target.id
+    })
+}
diff --git a/tests/failpoints/cases/test_early_apply.rs b/tests/failpoints/cases/test_early_apply.rs
index a194ef74d8f..bf403fb4668 100644
--- a/tests/failpoints/cases/test_early_apply.rs
+++ b/tests/failpoints/cases/test_early_apply.rs
@@ -7,14 +7,16 @@ use std::sync::{
 
 use raft::eraftpb::MessageType;
 use test_raftstore::*;
+use test_raftstore_macro::test_case;
 
 // Test if a singleton can apply a log before persisting it.
-#[test]
+#[test_case(test_raftstore::new_node_cluster)]
+#[test_case(test_raftstore_v2::new_node_cluster)]
 fn test_singleton_cannot_early_apply() {
-    let mut cluster = new_node_cluster(0, 1);
+    let mut cluster = new_cluster(0, 1);
     cluster.pd_client.disable_default_operator();
     // So compact log will not be triggered automatically.
-    configure_for_request_snapshot(&mut cluster);
+    configure_for_request_snapshot(&mut cluster.cfg);
     cluster.run();
 
     // Put one key first to cache leader.
@@ -33,13 +35,14 @@ fn test_singleton_cannot_early_apply() {
     must_get_equal(&cluster.get_engine(1), b"k1", b"v1");
 }
 
-#[test]
+#[test_case(test_raftstore::new_node_cluster)]
+#[test_case(test_raftstore_v2::new_node_cluster)]
 fn test_multi_early_apply() {
-    let mut cluster = new_node_cluster(0, 3);
+    let mut cluster = new_cluster(0, 3);
     cluster.pd_client.disable_default_operator();
     cluster.cfg.raft_store.store_batch_system.pool_size = 1;
     // So compact log will not be triggered automatically.
-    configure_for_request_snapshot(&mut cluster);
+    configure_for_request_snapshot(&mut cluster.cfg);
     cluster.run_conf_change();
 
     // Check mixed regions can be scheduled correctly.
@@ -68,9 +71,11 @@ fn test_multi_early_apply() {
         })),
     ));
     cluster.async_put(b"k4", b"v4").unwrap();
-    // Sleep a while so that follower will send append response.
+    // Sleep a while so that follower will send append response
     sleep_ms(100);
     cluster.async_put(b"k11", b"v22").unwrap();
+    // Sleep a while so that follower will send append response.
+    sleep_ms(100);
     // Now the store thread of store 1 pauses on `store_1_fp`.
     // Set `store_1_fp` again to make this store thread does not pause on it.
     // Then leader 1 will receive the append response and commit the log.
@@ -92,6 +97,9 @@ fn test_multi_early_apply() {
 /// the peer to fix this issue.
 /// For simplicity, this test uses region merge to ensure that the apply state
 /// will be written to kv db before crash.
+///
+/// Note: partitioned-raft-kv does not need this due to a change in the disk
+/// persistence logic.
 #[test]
 fn test_early_apply_yield_followed_with_many_entries() {
     let mut cluster = new_node_cluster(0, 3);
diff --git a/tests/failpoints/cases/test_engine.rs b/tests/failpoints/cases/test_engine.rs
new file mode 100644
index 00000000000..073f7276419
--- /dev/null
+++ b/tests/failpoints/cases/test_engine.rs
@@ -0,0 +1,139 @@
+// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0.
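[Editor's note: the new `test_engine.rs` below, like most files in this section, sequences internal engine events with the `fail` crate: "pause" parks a thread at a failpoint, and `cfg_callback` signals a channel when one is reached. A minimal, self-contained sketch of that pattern, under the assumption that the crate's `failpoints` feature is enabled; the failpoint names here are made up, not TiKV's:]

```rust
use std::{sync::mpsc::channel, sync::Mutex, thread, time::Duration};

use fail::fail_point;

fn flush() {
    fail_point!("my_flush_begin"); // blocks while configured as "pause"
    // ... real flush work would happen here ...
    fail_point!("my_flush_completed");
}

fn main() {
    let scenario = fail::FailScenario::setup();

    // Park the worker at the first failpoint.
    fail::cfg("my_flush_begin", "pause").unwrap();

    // Get notified when the second failpoint is reached. The callback must be
    // Fn + Send + Sync, hence the Mutex around the sender, mirroring the
    // `let tx = Mutex::new(tx);` trick in test_rocksdb_listener below.
    let (tx, rx) = channel();
    let tx = Mutex::new(tx);
    fail::cfg_callback("my_flush_completed", move || {
        let _ = tx.lock().unwrap().send(());
    })
    .unwrap();

    let handle = thread::spawn(flush);
    thread::sleep(Duration::from_millis(100)); // the worker is paused now

    fail::remove("my_flush_begin"); // release it
    rx.recv().unwrap(); // wait for the completion signal
    handle.join().unwrap();

    scenario.teardown();
}
```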
+
+use std::{
+    sync::{mpsc::channel, Mutex},
+    time::Duration,
+};
+
+use engine_traits::{MiscExt, CF_DEFAULT, CF_LOCK, CF_WRITE};
+use tikv_util::config::ReadableSize;
+
+fn dummy_string(len: usize) -> String {
+    String::from_utf8(vec![0; len]).unwrap()
+}
+
+#[test]
+fn test_write_buffer_manager() {
+    use test_raftstore_v2::*;
+    let count = 1;
+    let mut cluster = new_node_cluster(0, count);
+    cluster.cfg.rocksdb.lockcf.write_buffer_limit = Some(ReadableSize::kb(10));
+    cluster.cfg.rocksdb.defaultcf.write_buffer_limit = Some(ReadableSize::kb(10));
+    cluster.cfg.rocksdb.write_buffer_limit = Some(ReadableSize::kb(30));
+
+    // Keep the write buffer size small so that a memtable requests less memory.
+    // Otherwise, a single memory request can exceed the write buffer limit set
+    // above.
+    cluster.cfg.rocksdb.lockcf.write_buffer_size = Some(ReadableSize::kb(64));
+    cluster.cfg.rocksdb.writecf.write_buffer_size = Some(ReadableSize::kb(64));
+    cluster.cfg.rocksdb.defaultcf.write_buffer_size = Some(ReadableSize::kb(64));
+    cluster.run();
+
+    let dummy = dummy_string(500);
+    let fp = "on_memtable_sealed";
+    fail::cfg(fp, "return(lock)").unwrap();
+
+    for i in 0..10 {
+        let key = format!("key-{:03}", i);
+        for cf in &[CF_WRITE, CF_LOCK] {
+            cluster.must_put_cf(cf, key.as_bytes(), dummy.as_bytes());
+        }
+    }
+
+    fail::cfg(fp, "return(default)").unwrap();
+
+    for i in 0..10 {
+        let key = format!("key-{:03}", i);
+        for cf in &[CF_WRITE, CF_DEFAULT] {
+            cluster.must_put_cf(cf, key.as_bytes(), dummy.as_bytes());
+        }
+    }
+
+    fail::cfg(fp, "return(write)").unwrap();
+    let dummy = dummy_string(1000);
+    for i in 0..10 {
+        let key = format!("key-{:03}", i);
+        cluster.must_put_cf(CF_WRITE, key.as_bytes(), dummy.as_bytes());
+    }
+}
+
+// The test mocks the scenario before https://github.com/tikv/rocksdb/pull/347:
+// note: before rocksdb/pull/347, lock is called before on_memtable_sealed.
+// Case:
+// Assume FlushMemtable cf1 (schedule flush task) and BackgroundCallFlush cf1
+// (execute flush task) are performed concurrently.
+// t    FlushMemtable cf1    BackgroundCallFlush cf1
+// 1. lock
+// 2. convert memtable t2(seqno. 10-20)
+//    to immutable memtable
+// 3. unlock
+// 4. lock
+// 5. pick memtables to flush:
+//    t1(0-10), t2(10-20)
+//    flush job(0-20)
+// 6. finish flush
+// 7. unlock
+// 8. on_flush_completed:
+//    update last_flushed to 20
+// 9. on_memtable_sealed
+// 10   > 20 *panic*
+#[test]
+fn test_rocksdb_listener() {
+    use test_raftstore_v2::*;
+    let count = 1;
+    let mut cluster = new_node_cluster(0, count);
+    // Set the flush thread num to 1 to make the case easy to construct.
+    cluster.cfg.rocksdb.max_background_flushes = 1;
+    cluster.run();
+
+    let r = cluster.get_region(b"k10");
+    cluster.must_split(&r, b"k10");
+
+    for i in 0..20 {
+        let k = format!("k{:02}", i);
+        cluster.must_put(k.as_bytes(), b"val");
+    }
+
+    let r1 = cluster.get_region(b"k00").get_id();
+    let r2 = cluster.get_region(b"k15").get_id();
+
+    let engine = cluster.get_engine(1);
+    let tablet1 = engine.get_tablet_by_id(r1).unwrap();
+    let tablet2 = engine.get_tablet_by_id(r2).unwrap();
+
+    fail::cfg("on_flush_begin", "1*pause").unwrap();
+    tablet1.flush_cf("default", false).unwrap(); // call flush 1
+    std::thread::sleep(Duration::from_secs(1));
+
+    tablet2.flush_cf("default", false).unwrap(); // call flush 2
+    for i in 20..30 {
+        let k = format!("k{:02}", i);
+        cluster.must_put(k.as_bytes(), b"val");
+    }
+    fail::cfg("on_memtable_sealed", "pause").unwrap();
+
+    let h = std::thread::spawn(move || {
+        tablet2.flush_cf("default", true).unwrap(); // call flush 3
+    });
+
+    let (tx, rx) = channel();
+    let tx = Mutex::new(tx);
+    fail::cfg_callback("on_flush_completed", move || {
+        let _ = tx.lock().unwrap().send(true);
+    })
+    .unwrap();
+    fail::remove("on_flush_begin");
+
+    let _ = rx.recv(); // flush 1 done
+    // Now, flush 1 is done and flush 3 is blocked at on_memtable_sealed.
+    // Before https://github.com/tikv/rocksdb/pull/347, unlock would be called
+    // before calling on_memtable_sealed, so flush 2 could pick the memtable sealed
+    // by flush 3 and thus break the flush order.
+    // Now, unlock will not be called, so we have to remove the failpoint to avoid
+    // a deadlock. 2 seconds is long enough to make the test fail before
+    // rocksdb/pull/347.
+    std::thread::sleep(Duration::from_secs(2));
+    fail::remove("on_memtable_sealed");
+
+    h.join().unwrap();
+}
diff --git a/tests/failpoints/cases/test_gc_worker.rs b/tests/failpoints/cases/test_gc_worker.rs
index d24ec85f040..50b71b59f47 100644
--- a/tests/failpoints/cases/test_gc_worker.rs
+++ b/tests/failpoints/cases/test_gc_worker.rs
@@ -14,9 +14,10 @@ use raftstore::coprocessor::{
     RegionInfo, RegionInfoCallback, RegionInfoProvider, Result as CopResult, SeekRegionCallback,
 };
 use test_raftstore::*;
+use test_raftstore_macro::test_case;
 use tikv::{
     server::gc_worker::{
-        AutoGcConfig, GcSafePointProvider, GcTask, Result as GcWorkerResult, TestGcRunner,
+        sync_gc, AutoGcConfig, GcSafePointProvider, GcTask, Result as GcWorkerResult, TestGcRunner,
     },
     storage::{
         kv::TestEngineBuilder,
@@ -61,11 +62,38 @@ fn test_error_in_compaction_filter() {
     fail::remove(fp);
 }
 
+#[derive(Clone)]
+struct MockSafePointProvider;
+impl GcSafePointProvider for MockSafePointProvider {
+    fn get_safe_point(&self) -> GcWorkerResult<TimeStamp> {
+        Ok(TimeStamp::from(0))
+    }
+}
+
+#[derive(Clone)]
+struct MockRegionInfoProvider;
+impl RegionInfoProvider for MockRegionInfoProvider {
+    fn seek_region(&self, _: &[u8], _: SeekRegionCallback) -> CopResult<()> {
+        Ok(())
+    }
+    fn find_region_by_id(
+        &self,
+        _: u64,
+        _: RegionInfoCallback<Option<RegionInfo>>,
+    ) -> CopResult<()> {
+        Ok(())
+    }
+    fn get_regions_in_range(&self, _start_key: &[u8], _end_key: &[u8]) -> CopResult<Vec<RegionInfo>> {
+        Ok(vec![])
+    }
+}
+
 // Test GC worker can receive and handle orphan versions emit from write CF's
 // compaction filter correctly.
-#[test]
+#[test_case(test_raftstore::must_new_and_configure_cluster)]
+#[test_case(test_raftstore_v2::must_new_and_configure_cluster)]
 fn test_orphan_versions_from_compaction_filter() {
-    let (cluster, leader, ctx) = must_new_and_configure_cluster(|cluster| {
+    let (cluster, leader, ctx) = new_cluster(|cluster| {
         cluster.cfg.gc.enable_compaction_filter = true;
         cluster.cfg.gc.compaction_filter_skip_version_check = true;
         cluster.pd_client.disable_default_operator();
@@ -76,8 +104,20 @@ fn test_orphan_versions_from_compaction_filter() {
     let channel = ChannelBuilder::new(env).connect(&cluster.sim.rl().get_addr(leader_store));
     let client = TikvClient::new(channel);
 
-    init_compaction_filter(&cluster, leader_store);
-    let engine = cluster.engines.get(&leader_store).unwrap();
+    // Call `start_auto_gc` like `cmd/src/server.rs` does. It will combine
+    // compaction filter and GC worker so that GC worker can help to process orphan
+    // versions on default CF.
+    {
+        let sim = cluster.sim.rl();
+        let gc_worker = sim.get_gc_worker(leader_store);
+        gc_worker
+            .start_auto_gc(
+                AutoGcConfig::new(MockSafePointProvider, MockRegionInfoProvider, 1),
+                Arc::new(AtomicU64::new(0)),
+            )
+            .unwrap();
+    }
+    let engine = cluster.get_engine(leader_store);
 
     let pk = b"k1".to_vec();
     let large_value = vec![b'x'; 300];
@@ -91,22 +131,23 @@ fn test_orphan_versions_from_compaction_filter() {
         if start_ts < 40 {
             let key = Key::from_raw(b"k1").append_ts(start_ts.into());
             let key = data_key(key.as_encoded());
-            assert!(engine.kv.get_value(&key).unwrap().is_some());
+            assert!(engine.get_value(&key).unwrap().is_some());
         }
     }
 
     let fp = "write_compaction_filter_flush_write_batch";
     fail::cfg(fp, "return").unwrap();
 
-    let mut gc_runner = TestGcRunner::new(100);
-    gc_runner.gc_scheduler = cluster.sim.rl().get_gc_worker(1).scheduler();
-    gc_runner.gc(&engine.kv);
+    let gc_safe_point = TimeStamp::from(100);
+    let gc_scheduler = cluster.sim.rl().get_gc_worker(1).scheduler();
+    let region = cluster.get_region(&pk);
+    sync_gc(&gc_scheduler, region, gc_safe_point).unwrap();
 
     'IterKeys: for &start_ts in &[10, 20, 30] {
         let key = Key::from_raw(b"k1").append_ts(start_ts.into());
         let key = data_key(key.as_encoded());
         for _ in 0..100 {
-            if engine.kv.get_value(&key).unwrap().is_some() {
+            if engine.get_value(&key).unwrap().is_some() {
                 thread::sleep(Duration::from_millis(20));
                 continue;
             }
@@ -117,47 +158,3 @@ fn test_orphan_versions_from_compaction_filter() {
 
     fail::remove(fp);
 }
-
-// Call `start_auto_gc` like `cmd/src/server.rs` does. It will combine
-// compaction filter and GC worker so that GC worker can help to process orphan
-// versions on default CF.
-fn init_compaction_filter(cluster: &Cluster<ServerCluster>, store_id: u64) {
-    #[derive(Clone)]
-    struct MockSafePointProvider;
-    impl GcSafePointProvider for MockSafePointProvider {
-        fn get_safe_point(&self) -> GcWorkerResult<TimeStamp> {
-            Ok(TimeStamp::from(0))
-        }
-    }
-
-    #[derive(Clone)]
-    struct MockRegionInfoProvider;
-    impl RegionInfoProvider for MockRegionInfoProvider {
-        fn seek_region(&self, _: &[u8], _: SeekRegionCallback) -> CopResult<()> {
-            Ok(())
-        }
-        fn find_region_by_id(
-            &self,
-            _: u64,
-            _: RegionInfoCallback<Option<RegionInfo>>,
-        ) -> CopResult<()> {
-            Ok(())
-        }
-        fn get_regions_in_range(
-            &self,
-            _start_key: &[u8],
-            _end_key: &[u8],
-        ) -> CopResult<Vec<RegionInfo>> {
-            Ok(vec![])
-        }
-    }
-
-    let sim = cluster.sim.rl();
-    let gc_worker = sim.get_gc_worker(store_id);
-    gc_worker
-        .start_auto_gc(
-            AutoGcConfig::new(MockSafePointProvider, MockRegionInfoProvider, 1),
-            Arc::new(AtomicU64::new(0)),
-        )
-        .unwrap();
-}
diff --git a/tests/failpoints/cases/test_hibernate.rs b/tests/failpoints/cases/test_hibernate.rs
index 616a4e5e196..4fe7406e372 100644
--- a/tests/failpoints/cases/test_hibernate.rs
+++ b/tests/failpoints/cases/test_hibernate.rs
@@ -6,7 +6,7 @@ use std::{
     time::Duration,
 };
 
-use kvproto::raft_serverpb::RaftMessage;
+use kvproto::raft_serverpb::{ExtraMessage, ExtraMessageType, RaftMessage};
 use raft::eraftpb::MessageType;
 use raftstore::store::{PeerMsg, PeerTick};
 use test_raftstore::*;
@@ -101,10 +101,10 @@ fn test_store_disconnect_with_hibernate() {
     cluster.cfg.raft_store.raft_election_timeout_ticks = 10;
     cluster.cfg.raft_store.unreachable_backoff = ReadableDuration::millis(500);
     cluster.cfg.server.raft_client_max_backoff = ReadableDuration::millis(200);
-    // So the random election timeout will always be 10, which makes the case more
-    // stable.
+    // Use a small range but still random election timeouts, which makes the case
+    // more stable.
     cluster.cfg.raft_store.raft_min_election_timeout_ticks = 10;
-    cluster.cfg.raft_store.raft_max_election_timeout_ticks = 11;
+    cluster.cfg.raft_store.raft_max_election_timeout_ticks = 13;
     configure_for_hibernate(&mut cluster.cfg);
     cluster.pd_client.disable_default_operator();
     let r = cluster.run_conf_change();
@@ -116,7 +116,7 @@ fn test_store_disconnect_with_hibernate() {
     must_get_equal(&cluster.get_engine(3), b"k1", b"v1");
 
     // Wait until all peers of region 1 hibernate.
-    thread::sleep(Duration::from_millis(base_tick_ms * 30));
+    thread::sleep(Duration::from_millis(base_tick_ms * 40));
 
     // Stop the region leader.
     fail::cfg("receive_raft_message_from_outside", "pause").unwrap();
@@ -128,7 +128,7 @@ fn test_store_disconnect_with_hibernate() {
     fail::remove("receive_raft_message_from_outside");
 
     // Wait for a while. Peers of region 1 shouldn't hibernate.
-    thread::sleep(Duration::from_millis(base_tick_ms * 30));
+    thread::sleep(Duration::from_millis(base_tick_ms * 40));
     must_get_equal(&cluster.get_engine(2), b"k2", b"v2");
     must_get_equal(&cluster.get_engine(3), b"k2", b"v2");
 }
@@ -178,3 +178,58 @@ fn test_check_long_uncommitted_proposals_while_hibernate() {
     rx.recv_timeout(2 * cluster.cfg.raft_store.long_uncommitted_base_threshold.0)
         .unwrap();
 }
+
+#[test]
+fn test_forcibly_awaken_hibernate_regions() {
+    let mut cluster = new_node_cluster(0, 3);
+    let base_tick_ms = 50;
+    cluster.cfg.raft_store.raft_base_tick_interval = ReadableDuration::millis(base_tick_ms);
+    cluster.cfg.raft_store.raft_heartbeat_ticks = 2;
+    cluster.cfg.raft_store.raft_election_timeout_ticks = 10;
+    // So the random election timeout will always be 10, which makes the case more
+    // stable.
+    cluster.cfg.raft_store.raft_min_election_timeout_ticks = 10;
+    cluster.cfg.raft_store.raft_max_election_timeout_ticks = 11;
+    configure_for_hibernate(&mut cluster.cfg);
+    cluster.pd_client.disable_default_operator();
+    let r = cluster.run_conf_change();
+    cluster.pd_client.must_add_peer(r, new_peer(2, 2));
+    cluster.pd_client.must_add_peer(r, new_peer(3, 3));
+
+    cluster.must_put(b"k1", b"v1");
+    must_get_equal(&cluster.get_engine(2), b"k1", b"v1");
+    must_get_equal(&cluster.get_engine(3), b"k1", b"v1");
+
+    // Wait until all peers of region 1 hibernate.
+    thread::sleep(Duration::from_millis(base_tick_ms * 30));
+
+    // Firstly, send `CheckPeerStaleState` message to trigger the check.
+    let router = cluster.sim.rl().get_router(3).unwrap();
+    router
+        .send(1, PeerMsg::Tick(PeerTick::CheckPeerStaleState))
+        .unwrap();
+
+    // Secondly, forcibly send a `MsgRegionWakeUp` message to awaken hibernated
+    // regions.
+    let (tx, rx) = mpsc::sync_channel(128);
+    fail::cfg_callback("on_raft_base_tick_chaos", move || {
+        tx.send(base_tick_ms).unwrap()
+    })
+    .unwrap();
+    let mut message = RaftMessage::default();
+    message.region_id = 1;
+    message.set_from_peer(new_peer(3, 3));
+    message.set_to_peer(new_peer(3, 3));
+    message.mut_region_epoch().version = 1;
+    message.mut_region_epoch().conf_ver = 3;
+    let mut msg = ExtraMessage::default();
+    msg.set_type(ExtraMessageType::MsgRegionWakeUp);
+    msg.forcely_awaken = true;
+    message.set_extra_msg(msg);
+    router.send_raft_message(message).unwrap();
+    assert_eq!(
+        rx.recv_timeout(Duration::from_secs(1)).unwrap(),
+        base_tick_ms
+    );
+    fail::remove("on_raft_base_tick_chaos");
+}
diff --git a/tests/failpoints/cases/test_import_service.rs b/tests/failpoints/cases/test_import_service.rs
index a2487456108..9aa0ad5af20 100644
--- a/tests/failpoints/cases/test_import_service.rs
+++ b/tests/failpoints/cases/test_import_service.rs
@@ -7,13 +7,13 @@ use std::{
 
 use file_system::calc_crc32;
 use futures::{executor::block_on, stream, SinkExt};
-use grpcio::{Result, WriteFlags};
-use kvproto::import_sstpb::*;
+use grpcio::{ChannelBuilder, Environment, Result, WriteFlags};
+use kvproto::{disk_usage::DiskUsage, import_sstpb::*, tikvpb_grpc::TikvClient};
 use tempfile::{Builder, TempDir};
-use test_raftstore::Simulator;
+use test_raftstore::{must_raw_put, Simulator};
 use test_sst_importer::*;
 use tikv::config::TikvConfig;
-use tikv_util::{config::ReadableSize, HandyRwLock};
+use tikv_util::{config::ReadableSize, sys::disk, HandyRwLock};
 
 #[allow(dead_code)]
 #[path = "../../integrations/import/util.rs"]
@@ -90,6 +90,43 @@ fn upload_sst(import: &ImportSstClient, meta: &SstMeta, data: &[u8]) -> Result) -> u64 { } count }
+
+#[test]
+fn test_flushed_applied_index_after_ingest() {
+    // disable data flushed
+    fail::cfg("on_flush_completed", "return()").unwrap();
+    // disable data flushed
+    let (mut cluster, ctx, _tikv, import) = open_cluster_and_tikv_import_client_v2(None);
+    let temp_dir = Builder::new().prefix("test_ingest_sst").tempdir().unwrap();
+    let sst_path = temp_dir.path().join("test.sst");
+
+    // Create clients.
+    let env = Arc::new(Environment::new(1));
+    let channel = ChannelBuilder::new(Arc::clone(&env)).connect(&cluster.sim.rl().get_addr(1));
+    let client = TikvClient::new(channel);
+
+    for i in 0..5 {
+        let sst_range = (i * 20, (i + 1) * 20);
+        let (mut meta, data) = gen_sst_file(sst_path.clone(), sst_range);
+        // No region id and epoch.
+        send_upload_sst(&import, &meta, &data).unwrap();
+        let mut ingest = IngestRequest::default();
+        ingest.set_context(ctx.clone());
+        ingest.set_sst(meta.clone());
+        meta.set_region_id(ctx.get_region_id());
+        meta.set_region_epoch(ctx.get_region_epoch().clone());
+        send_upload_sst(&import, &meta, &data).unwrap();
+        ingest.set_sst(meta.clone());
+        let resp = import.ingest(&ingest).unwrap();
+        assert!(!resp.has_error(), "{:?}", resp.get_error());
+    }
+
+    // Only 1 sst is left, because there is no more event to trigger a raft ready
+    // flush.
+    let count = sst_file_count(&cluster.paths);
+    assert_eq!(1, count);
+
+    for i in 5..8 {
+        let sst_range = (i * 20, (i + 1) * 20);
+        let (mut meta, data) = gen_sst_file(sst_path.clone(), sst_range);
+        // No region id and epoch.
+        send_upload_sst(&import, &meta, &data).unwrap();
+        let mut ingest = IngestRequest::default();
+        ingest.set_context(ctx.clone());
+        ingest.set_sst(meta.clone());
+        meta.set_region_id(ctx.get_region_id());
+        meta.set_region_epoch(ctx.get_region_epoch().clone());
+        send_upload_sst(&import, &meta, &data).unwrap();
+        ingest.set_sst(meta.clone());
+        let resp = import.ingest(&ingest).unwrap();
+        assert!(!resp.has_error(), "{:?}", resp.get_error());
+    }
+
+    // Ingest more sst files; the unflushed count should still be 1.
+    let count = sst_file_count(&cluster.paths);
+    assert_eq!(1, count);
+
+    // Fire a write to trigger a ready flush, even if the write itself is not
+    // flushed.
+    must_raw_put(&client, ctx, b"key1".to_vec(), b"value1".to_vec());
+    let count = sst_file_count(&cluster.paths);
+    assert_eq!(0, count);
+
+    // Restart the node; it should not trigger any ingest.
+    fail::cfg("on_apply_ingest", "panic").unwrap();
+    cluster.stop_node(1);
+    cluster.start().unwrap();
+    let count = sst_file_count(&cluster.paths);
+    assert_eq!(0, count);
+
+    fail::remove("on_apply_ingest");
+    fail::remove("on_flush_completed");
+}
diff --git a/tests/failpoints/cases/test_kv_service.rs b/tests/failpoints/cases/test_kv_service.rs
index f3831bb984b..2ec1109edd4 100644
--- a/tests/failpoints/cases/test_kv_service.rs
+++ b/tests/failpoints/cases/test_kv_service.rs
@@ -3,11 +3,16 @@
 use std::{sync::Arc, time::Duration};
 
 use grpcio::{ChannelBuilder, Environment};
-use kvproto::{kvrpcpb::*, tikvpb::TikvClient};
+use kvproto::{
+    kvrpcpb::{PrewriteRequestPessimisticAction::SkipPessimisticCheck, *},
+    tikvpb::TikvClient,
+};
 use test_raftstore::{
-    must_kv_prewrite, must_new_cluster_and_kv_client, must_new_cluster_mul,
-    try_kv_prewrite_with_impl,
+    configure_for_lease_read, must_kv_commit, must_kv_have_locks, must_kv_prewrite,
+    must_kv_prewrite_with, must_new_cluster_and_kv_client, must_new_cluster_mul,
+    new_server_cluster, try_kv_prewrite_with, try_kv_prewrite_with_impl,
 };
+use tikv_util::{config::ReadableDuration, HandyRwLock};
 
 #[test]
 fn test_batch_get_memory_lock() {
@@ -91,6 +96,7 @@ fn test_undetermined_write_err() {
         &client,
         ctx,
         vec![mutation],
+        vec![],
         b"k".to_vec(),
         10,
         0,
@@ -103,3 +109,157 @@ fn test_undetermined_write_err() {
     // The previous panic hasn't been captured.
     assert!(std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| drop(cluster))).is_err());
 }
+#[test]
+fn test_stale_read_on_local_leader() {
+    let mut cluster = new_server_cluster(0, 1);
+    // Increase the election tick to make this test case run reliably.
+    configure_for_lease_read(&mut cluster.cfg, Some(50), Some(10_000));
+    let max_lease = Duration::from_secs(2);
+    cluster.cfg.raft_store.raft_store_max_leader_lease = ReadableDuration(max_lease);
+    cluster.pd_client.disable_default_operator();
+    cluster.run();
+
+    let region_id = 1;
+    let leader = cluster.leader_of_region(region_id).unwrap();
+    let epoch = cluster.get_region_epoch(region_id);
+    let mut ctx = Context::default();
+    ctx.set_region_id(region_id);
+    ctx.set_peer(leader.clone());
+    ctx.set_region_epoch(epoch);
+    let env = Arc::new(Environment::new(1));
+    let channel =
+        ChannelBuilder::new(env).connect(&cluster.sim.rl().get_addr(leader.get_store_id()));
+    let client = TikvClient::new(channel);
+
+    let (k, v) = (b"key".to_vec(), b"value".to_vec());
+    let v1 = b"value1".to_vec();
+
+    // Write a record.
+    let mut mutation = Mutation::default();
+    mutation.set_op(Op::Put);
+    mutation.set_key(k.clone());
+    mutation.set_value(v.clone());
+    must_kv_prewrite(&client, ctx.clone(), vec![mutation], k.clone(), 10);
+    must_kv_commit(&client, ctx.clone(), vec![k.clone()], 10, 30, 30);
+
+    // Prewrite and leave a lock.
+    let mut mutation = Mutation::default();
+    mutation.set_op(Op::Put);
+    mutation.set_key(k.clone());
+    mutation.set_value(v1);
+    must_kv_prewrite(&client, ctx.clone(), vec![mutation], k.clone(), 50);
+
+    let mut req = GetRequest::default();
+    req.set_context(ctx);
+    req.set_key(k);
+    req.version = 40;
+    req.mut_context().set_stale_read(true);
+
+    // The stale read should fall back and succeed on the leader peer.
+    let resp = client.kv_get(&req).unwrap();
+    assert!(resp.error.is_none());
+    assert!(resp.region_error.is_none());
+    assert_eq!(v, resp.get_value());
+}
+
+#[test]
+fn test_storage_do_not_update_txn_status_cache_on_write_error() {
+    let cache_hit_fp = "before_prewrite_txn_status_cache_hit";
+    let cache_miss_fp = "before_prewrite_txn_status_cache_miss";
+
+    let (cluster, leader, ctx) = must_new_cluster_mul(1);
+    let env = Arc::new(Environment::new(1));
+    let channel = ChannelBuilder::new(env)
+        .connect(&cluster.sim.read().unwrap().get_addr(leader.get_store_id()));
+    let client = TikvClient::new(channel);
+
+    let pk = b"pk".to_vec();
+
+    // Case 1: the write succeeds.
+
+    let mut mutation = Mutation::default();
+    mutation.set_op(Op::Put);
+    mutation.set_key(b"k1".to_vec());
+    mutation.set_value(b"v1".to_vec());
+    must_kv_prewrite_with(
+        &client,
+        ctx.clone(),
+        vec![mutation.clone()],
+        vec![SkipPessimisticCheck],
+        pk.clone(),
+        10,
+        10,
+        true,
+        false,
+    );
+    must_kv_commit(&client, ctx.clone(), vec![b"k1".to_vec()], 10, 15, 15);
+
+    // Expect a cache hit.
+    fail::cfg(cache_miss_fp, "panic").unwrap();
+    must_kv_prewrite_with(
+        &client,
+        ctx.clone(),
+        vec![mutation],
+        vec![SkipPessimisticCheck],
+        pk.clone(),
+        10,
+        10,
+        true,
+        false,
+    );
+    // Key not locked.
+    must_kv_have_locks(&client, ctx.clone(), 19, b"k1", b"k2", &[]);
+    fail::remove(cache_miss_fp);
+
+    // Case 2: the write fails.
+
+    let mut mutation = Mutation::default();
+    mutation.set_op(Op::Put);
+    mutation.set_key(b"k2".to_vec());
+    mutation.set_value(b"v2".to_vec());
+
+    try_kv_prewrite_with(
+        &client,
+        ctx.clone(),
+        vec![mutation.clone()],
+        vec![SkipPessimisticCheck],
+        pk.clone(),
+        20,
+        20,
+        true,
+        false,
+    );
+    fail::cfg("raftkv_early_error_report", "return").unwrap();
+    let mut commit_req = CommitRequest::default();
+    commit_req.set_context(ctx.clone());
+    commit_req.set_start_version(20);
+    commit_req.set_commit_version(25);
+    commit_req.set_keys(vec![b"k2".to_vec()].into());
+    let commit_resp = client.kv_commit(&commit_req).unwrap();
+    assert!(commit_resp.has_region_error());
+    fail::remove("raftkv_early_error_report");
+    must_kv_have_locks(
+        &client,
+        ctx.clone(),
+        29,
+        b"k2",
+        b"k3",
+        &[(b"k2", Op::Put, 20, 20)],
+    );
+
+    // Expect a cache miss.
+    fail::cfg(cache_hit_fp, "panic").unwrap();
+    try_kv_prewrite_with(
+        &client,
+        ctx.clone(),
+        vec![mutation],
+        vec![SkipPessimisticCheck],
+        pk,
+        20,
+        20,
+        true,
+        false,
+    );
+    must_kv_have_locks(&client, ctx, 29, b"k2", b"k3", &[(b"k2", Op::Put, 20, 20)]);
+    fail::remove(cache_hit_fp);
+}
diff --git a/tests/failpoints/cases/test_life.rs b/tests/failpoints/cases/test_life.rs
new file mode 100644
index 00000000000..2bc833075c6
--- /dev/null
+++ b/tests/failpoints/cases/test_life.rs
@@ -0,0 +1,36 @@
+// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0.
+
+use std::sync::Arc;
+
+use test_raftstore::*;
+use test_raftstore_macro::test_case;
+use tikv_util::config::ReadableDuration;
+
+#[test_case(test_raftstore_v2::new_server_cluster)]
+fn test_gc_peer_on_tombstone_store() {
+    let mut cluster = new_cluster(0, 3);
+    configure_for_merge(&mut cluster.cfg);
+    cluster.cfg.raft_store.gc_peer_check_interval = ReadableDuration::millis(500);
+    let pd_client = Arc::clone(&cluster.pd_client);
+    pd_client.disable_default_operator();
+    cluster.run();
+    cluster.must_put(b"k1", b"v1");
+
+    let region = cluster.get_region(b"k1");
+
+    let peer_on_store1 = find_peer(&region, 1).unwrap().clone();
+    let peer_on_store3 = find_peer(&region, 3).unwrap().clone();
+    cluster.must_transfer_leader(region.get_id(), peer_on_store1);
+    cluster.add_send_filter(IsolationFilterFactory::new(3));
+    pd_client.must_remove_peer(region.get_id(), peer_on_store3);
+
+    // Immediately invalidate the store address cache.
+    fail::cfg("mock_store_refresh_interval_secs", "return(0)").unwrap();
+
+    // Shut down store 3 and wait for gc peer ticks.
+ cluster.stop_node(3); + cluster.clear_send_filters(); + sleep_ms(3 * cluster.cfg.raft_store.gc_peer_check_interval.as_millis()); + + cluster.must_empty_region_removed_records(region.get_id()); +} diff --git a/tests/failpoints/cases/test_merge.rs b/tests/failpoints/cases/test_merge.rs index 3cc72d44da1..652bb479b87 100644 --- a/tests/failpoints/cases/test_merge.rs +++ b/tests/failpoints/cases/test_merge.rs @@ -3,6 +3,7 @@ use std::{ sync::{ atomic::{AtomicBool, Ordering}, + mpsc::{channel, sync_channel, Sender}, *, }, thread, @@ -18,16 +19,19 @@ use kvproto::{ }; use pd_client::PdClient; use raft::eraftpb::MessageType; -use raftstore::store::*; +use raftstore::{router::RaftStoreRouter, store::*}; +use raftstore_v2::router::{PeerMsg, PeerTick}; use test_raftstore::*; +use test_raftstore_macro::test_case; use tikv::storage::{kv::SnapshotExt, Snapshot}; -use tikv_util::{config::*, time::Instant, HandyRwLock}; +use tikv_util::{config::*, future::block_on_timeout, time::Instant, HandyRwLock}; use txn_types::{Key, LastChange, PessimisticLock}; /// Test if merge is rollback as expected. -#[test] +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] fn test_node_merge_rollback() { - let mut cluster = new_node_cluster(0, 3); + let mut cluster = new_cluster(0, 3); configure_for_merge(&mut cluster.cfg); let pd_client = Arc::clone(&cluster.pd_client); pd_client.disable_default_operator(); @@ -51,8 +55,16 @@ fn test_node_merge_rollback() { let schedule_merge_fp = "on_schedule_merge"; fail::cfg(schedule_merge_fp, "return()").unwrap(); - // The call is finished when prepare_merge is applied. - cluster.must_try_merge(region.get_id(), target_region.get_id()); + let (tx, rx) = channel(); + let tx = Mutex::new(tx); + fail::cfg_callback("on_apply_res_prepare_merge", move || { + tx.lock().unwrap().send(()).unwrap(); + }) + .unwrap(); + + cluster.merge_region(region.get_id(), target_region.get_id(), Callback::None); + // PrepareMerge is applied. + rx.recv().unwrap(); // Add a peer to trigger rollback. pd_client.must_add_peer(right.get_id(), new_peer(3, 5)); @@ -72,12 +84,7 @@ fn test_node_merge_rollback() { region.mut_region_epoch().set_version(4); for i in 1..3 { must_get_equal(&cluster.get_engine(i), b"k11", b"v11"); - let state_key = keys::region_state_key(region.get_id()); - let state: RegionLocalState = cluster - .get_engine(i) - .get_msg_cf(CF_RAFT, &state_key) - .unwrap() - .unwrap(); + let state = cluster.region_local_state(region.get_id(), i); assert_eq!(state.get_state(), PeerState::Normal); assert_eq!(*state.get_region(), region); } @@ -86,7 +93,10 @@ fn test_node_merge_rollback() { fail::cfg(schedule_merge_fp, "return()").unwrap(); let target_region = pd_client.get_region(b"k3").unwrap(); - cluster.must_try_merge(region.get_id(), target_region.get_id()); + cluster.merge_region(region.get_id(), target_region.get_id(), Callback::None); + // PrepareMerge is applied. + rx.recv().unwrap(); + let mut region = pd_client.get_region(b"k1").unwrap(); // Split to trigger rollback. 
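The hunks around this point replace the open-coded `RegionLocalState` lookup with the `cluster.region_local_state` helper, so the same assertion works for both raftstore implementations. For reference, a minimal sketch of what such a helper can look like on a v1 engine (a hedged sketch, not the actual test_raftstore implementation; it assumes the `Peekable` trait from `engine_traits` and the tikv `keys` crate, matching the removed lines above):

```rust
use engine_traits::{Peekable, CF_RAFT};
use kvproto::raft_serverpb::RegionLocalState;

// Read the persisted RegionLocalState of `region_id` from the raft column
// family, exactly as the removed open-coded lookup did.
fn region_local_state<E: Peekable>(engine: &E, region_id: u64) -> RegionLocalState {
    engine
        .get_msg_cf::<RegionLocalState>(CF_RAFT, &keys::region_state_key(region_id))
        .unwrap() // engine read failed
        .unwrap() // state missing: the peer was never initialized
}
```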
@@ -101,12 +111,7 @@ fn test_node_merge_rollback() {
     region.mut_region_epoch().set_version(6);
     for i in 1..3 {
         must_get_equal(&cluster.get_engine(i), b"k12", b"v12");
-        let state_key = keys::region_state_key(region.get_id());
-        let state: RegionLocalState = cluster
-            .get_engine(i)
-            .get_msg_cf(CF_RAFT, &state_key)
-            .unwrap()
-            .unwrap();
+        let state = cluster.region_local_state(region.get_id(), i);
         assert_eq!(state.get_state(), PeerState::Normal);
         assert_eq!(*state.get_region(), region);
     }
@@ -1532,7 +1537,7 @@ fn test_retry_pending_prepare_merge_fail() {
 
     let mut rx = cluster.async_put(b"k1", b"v11").unwrap();
     propose_rx.recv_timeout(Duration::from_secs(2)).unwrap();
-    rx.recv_timeout(Duration::from_millis(200)).unwrap_err();
+    block_on_timeout(rx.as_mut(), Duration::from_millis(200)).unwrap_err();
 
     // Then, start merging. PrepareMerge should become pending because applied_index
     // is smaller than proposed_index.
@@ -1546,7 +1551,7 @@ fn test_retry_pending_prepare_merge_fail() {
     fail::cfg("disk_already_full_peer_1", "return").unwrap();
     fail::cfg("disk_already_full_peer_2", "return").unwrap();
     fail::remove("on_handle_apply");
-    let res = rx.recv_timeout(Duration::from_secs(1)).unwrap();
+    let res = block_on_timeout(rx, Duration::from_secs(1)).unwrap();
     assert!(!res.get_header().has_error(), "{:?}", res);
 
     propose_rx.recv_timeout(Duration::from_secs(2)).unwrap();
@@ -1706,3 +1711,512 @@ fn test_destroy_source_peer_while_merging() {
         must_get_equal(&cluster.get_engine(i), b"k5", b"v5");
     }
 }
+
+struct MsgTimeoutFilter {
+    // Wrap in a Mutex to make the Sender Sync.
+    tx: Mutex<Sender<RaftMessage>>,
+}
+
+impl Filter for MsgTimeoutFilter {
+    fn before(&self, msgs: &mut Vec<RaftMessage>) -> raftstore::Result<()> {
+        let mut res = Vec::with_capacity(msgs.len());
+        for m in msgs.drain(..) {
+            if m.get_message().msg_type == MessageType::MsgTimeoutNow {
+                self.tx.lock().unwrap().send(m).unwrap();
+            } else {
+                res.push(m);
+            }
+        }
+
+        *msgs = res;
+        check_messages(msgs)
+    }
+}
+
+// Concurrent execution of transfer leader and merge could, before this fix,
+// cause rollback and commit merge to be proposed at the same time, which
+// corrupts the region. It can happen as follows.
+// Assume at the beginning the leaders of both source and target are on node-1:
+// 1. node-1 transfers leadership to node-2: it executes up to sending
+//    MsgTimeoutNow (leader_transferee has been set), but has not yet become
+//    follower.
+// 2. node-1 source region proposes and applies PrepareMerge.
+// 3. node-1 target region proposes CommitMerge but fails (due to
+//    leader_transferee being set).
+// 4. node-1 source region successfully proposes rollback merge.
+// 5. node-2 target region becomes leader and applies the first no-op entry.
+// 6. node-2 target region successfully proposes commit merge.
+// Now, rollback at the source region and commit at the target region are both
+// proposed and will both be executed, which corrupts the region.
+#[test]
+fn test_concurrent_between_transfer_leader_and_merge() {
+    use test_raftstore_v2::*;
+    let mut cluster = new_node_cluster(0, 3);
+    configure_for_merge(&mut cluster.cfg);
+    cluster.run();
+
+    cluster.must_put(b"k1", b"v1");
+    cluster.must_put(b"k3", b"v3");
+    for i in 0..3 {
+        must_get_equal(&cluster.get_engine(i + 1), b"k1", b"v1");
+        must_get_equal(&cluster.get_engine(i + 1), b"k3", b"v3");
+    }
+
+    let pd_client = Arc::clone(&cluster.pd_client);
+    let region = pd_client.get_region(b"k1").unwrap();
+    cluster.must_split(&region, b"k2");
+
+    let right = pd_client.get_region(b"k1").unwrap();
+    let left = pd_client.get_region(b"k3").unwrap();
+    cluster.must_transfer_leader(
+        left.get_id(),
+        left.get_peers()
+            .iter()
+            .find(|p| p.store_id == 1)
+            .cloned()
+            .unwrap(),
+    );
+
+    cluster.must_transfer_leader(
+        right.get_id(),
+        right
+            .get_peers()
+            .iter()
+            .find(|p| p.store_id == 1)
+            .cloned()
+            .unwrap(),
+    );
+
+    // Source region: 1, target region: 1000.
+    // Put the target region into leader_transferring status by intercepting the
+    // MsgTimeoutNow msg with a Filter, so node-1-1000 stays in that status for
+    // some time.
+    let (tx, rx_msg) = channel();
+    let filter = MsgTimeoutFilter { tx: Mutex::new(tx) };
+    cluster.add_send_filter_on_node(1, Box::new(filter));
+
+    pd_client.transfer_leader(
+        right.get_id(),
+        right
+            .get_peers()
+            .iter()
+            .find(|p| p.store_id == 2)
+            .cloned()
+            .unwrap(),
+        vec![],
+    );
+
+    let msg = rx_msg.recv().unwrap();
+
+    // Now, node-1-1000 is in leader_transferring status. After it rejects
+    // proposing commit merge, make node-1-1 block before proposing rollback
+    // merge until node-2-1000 proposes commit merge.
+
+    fail::cfg("on_reject_commit_merge_1", "pause").unwrap();
+
+    let router = cluster.get_router(2).unwrap();
+    let (tx, rx) = channel();
+    let tx = Mutex::new(tx);
+    let _ = fail::cfg_callback("propose_commit_merge_1", move || {
+        tx.lock().unwrap().send(()).unwrap();
+    });
+
+    let (tx2, rx2) = channel();
+    let tx2 = Mutex::new(tx2);
+    let _ = fail::cfg_callback("on_propose_commit_merge_success", move || {
+        tx2.lock().unwrap().send(()).unwrap();
+    });
+
+    cluster.merge_region(left.get_id(), right.get_id(), Callback::None);
+
+    // Actually, store 1 should not reach the line of propose_commit_merge_1.
+    let _ = rx.recv_timeout(Duration::from_secs(2));
+    router
+        .force_send(
+            msg.get_region_id(),
+            PeerMsg::RaftMessage(Box::new(msg), None),
+        )
+        .unwrap();
+
+    // Wait for region 1 of node 2 to become leader.
+    rx2.recv().unwrap();
+    fail::remove("on_reject_commit_merge_1");
+
+    wait_region_epoch_change(&cluster, &right, Duration::from_secs(5));
+
+    let region = pd_client.get_region(b"k1").unwrap();
+    assert_eq!(region.get_id(), right.get_id());
+    assert_eq!(region.get_start_key(), right.get_start_key());
+    assert_eq!(region.get_end_key(), left.get_end_key());
+
+    cluster.must_put(b"k4", b"v4");
+}
+
+#[test]
+fn test_deterministic_commit_rollback_merge() {
+    use test_raftstore_v2::*;
+    let mut cluster = new_node_cluster(0, 3);
+    configure_for_merge(&mut cluster.cfg);
+    // Use a large election tick to make the test stable.
+    configure_for_lease_read(&mut cluster.cfg, None, Some(1000));
+    // Use 2 threads for polling peers, so that they can run concurrently.
+    cluster.cfg.raft_store.store_batch_system.pool_size = 2;
+    cluster.cfg.raft_store.store_batch_system.max_batch_size = Some(1);
+    cluster.run();
+
+    let pd_client = Arc::clone(&cluster.pd_client);
+    let region = pd_client.get_region(b"k1").unwrap();
+    cluster.must_split(&region, b"k2");
+
+    let left = pd_client.get_region(b"k1").unwrap();
+    let right = pd_client.get_region(b"k3").unwrap();
+    let right_1 = find_peer(&right, 1).unwrap().clone();
+    cluster.must_transfer_leader(right.get_id(), right_1);
+    let left_2 = find_peer(&left, 2).unwrap().clone();
+    cluster.must_transfer_leader(left.get_id(), left_2);
+
+    cluster.must_put(b"k1", b"v1");
+    cluster.must_put(b"k3", b"v3");
+    for i in 0..3 {
+        must_get_equal(&cluster.get_engine(i + 1), b"k1", b"v1");
+        must_get_equal(&cluster.get_engine(i + 1), b"k3", b"v3");
+    }
+
+    // Delay 1003 apply by dropping append responses, so that proposals will fail
+    // due to applied_term != current_term.
+    let target_region_id = left.get_id();
+    cluster.add_recv_filter_on_node(
+        1,
+        Box::new(DropMessageFilter::new(Arc::new(move |m| {
+            if m.get_region_id() == target_region_id {
+                return m.get_message().get_msg_type() != MessageType::MsgAppendResponse;
+            }
+            true
+        }))),
+    );
+
+    let left_1 = find_peer(&left, 1).unwrap().clone();
+    cluster.must_transfer_leader(left.get_id(), left_1);
+
+    // left(1000) <- right(1).
+    let (tx1, rx1) = channel();
+    let (tx2, rx2) = channel();
+    let tx1 = Mutex::new(tx1);
+    let rx2 = Mutex::new(rx2);
+    fail::cfg_callback("on_propose_commit_merge_fail_store_1", move || {
+        tx1.lock().unwrap().send(()).unwrap();
+        rx2.lock().unwrap().recv().unwrap();
+    })
+    .unwrap();
+    cluster.merge_region(right.get_id(), left.get_id(), Callback::None);
+
+    // Wait until the target fails to propose commit merge.
+    rx1.recv_timeout(Duration::from_secs(5)).unwrap();
+    // Let target apply continue, and new AskCommitMerge messages will propose
+    // commit merge successfully.
+    cluster.clear_recv_filter_on_node(1);
+
+    // Trigger a CheckMerge tick, so the source will send an AskCommitMerge again.
+    fail::cfg("ask_target_peer_to_commit_merge_store_1", "pause").unwrap();
+    let router = cluster.get_router(1).unwrap();
+    router
+        .check_send(1, PeerMsg::Tick(PeerTick::CheckMerge))
+        .unwrap();
+
+    // Send RejectCommitMerge to the source.
+    tx2.send(()).unwrap();
+    fail::remove("on_propose_commit_merge_fail_store_1");
+
+    // Wait for the target to apply to the current term.
+    cluster.must_put(b"k1", b"v11");
+
+    // By removing the failpoint, the CheckMerge tick sends an AskCommitMerge
+    // again.
+    fail::remove("ask_target_peer_to_commit_merge_store_1");
+    // At this point, the source region will propose rollback merge if commit
+    // merge is not deterministic.
+
+    // Wait for the source to handle commit or rollback merge.
+    wait_region_epoch_change(&cluster, &left, Duration::from_secs(5));
+
+    // No matter commit merge or rollback merge, the cluster must be available
+    // to process requests.
+    cluster.must_put(b"k0", b"v0");
+    cluster.must_put(b"k4", b"v4");
+}
+
+struct MsgVoteFilter {}
+
+impl Filter for MsgVoteFilter {
+    fn before(&self, msgs: &mut Vec<RaftMessage>) -> raftstore::Result<()> {
+        msgs.retain(|m| {
+            let msg_type = m.get_message().msg_type;
+            msg_type != MessageType::MsgRequestPreVote && msg_type != MessageType::MsgRequestVote
+        });
+        check_messages(msgs)
+    }
+}
+
+// Before the fix of this PR (#15649), after prepare merge, a raft cmd could
+// still be proposed if a restart was involved.
+// If the proposed raft cmd is CompactLog, a panic can occur while fetching
+// entries: see issue https://github.com/tikv/tikv/issues/15633.
+// Consider the case:
+// 1. node-1 applies PrepareMerge (assume log index 30), so it's in is_merging
+//    status, which rejects all proposals except RollbackMerge
+// 2. node-1 advances persisted_apply to 30
+// 3. node-1 restarts and becomes leader. Now, it's not in is_merging status, so
+//    proposals can be proposed
+// 4. node-1 proposes CompactLog, replicates it to other nodes, and commits it
+// 5. node-0 applies PrepareMerge
+// 6. node-0 applies CompactLog
+// 7. node-0 fetches the raft log entries which are required by
+//    AdminCmdType::CommitMerge and panics (due to compaction)
+#[test]
+fn test_restart_may_lose_merging_state() {
+    use test_raftstore_v2::*;
+    let mut cluster = new_node_cluster(0, 2);
+    configure_for_merge(&mut cluster.cfg);
+    cluster.cfg.raft_store.raft_log_gc_count_limit = Some(12);
+    cluster.cfg.raft_store.raft_log_gc_tick_interval = ReadableDuration::millis(10);
+    cluster.cfg.raft_store.raft_base_tick_interval = ReadableDuration::millis(10);
+    cluster.cfg.raft_store.merge_check_tick_interval = ReadableDuration::millis(10);
+
+    cluster.run();
+    fail::cfg("maybe_propose_compact_log", "return").unwrap();
+    fail::cfg("on_ask_commit_merge", "return").unwrap();
+    fail::cfg("flush_before_close_threshold", "return(0)").unwrap();
+
+    let (tx, rx) = channel();
+    let tx = Mutex::new(tx);
+    fail::cfg_callback("on_apply_res_prepare_merge", move || {
+        tx.lock().unwrap().send(()).unwrap();
+    })
+    .unwrap();
+
+    let region = cluster.get_region(b"");
+    cluster.must_split(&region, b"k20");
+
+    let source = cluster.get_region(b"k05");
+    let target = cluster.get_region(b"k25");
+
+    cluster.add_send_filter_on_node(2, Box::new(MsgVoteFilter {}));
+
+    cluster.must_transfer_leader(
+        source.id,
+        source
+            .get_peers()
+            .iter()
+            .find(|p| p.store_id == 1)
+            .cloned()
+            .unwrap(),
+    );
+    cluster.must_transfer_leader(
+        target.id,
+        target
+            .get_peers()
+            .iter()
+            .find(|p| p.store_id == 1)
+            .cloned()
+            .unwrap(),
+    );
+
+    for i in 0..20 {
+        let k = format!("k{:02}", i);
+        cluster.must_put(k.as_bytes(), b"val");
+    }
+
+    cluster.merge_region(source.id, target.id, Callback::None);
+
+    rx.recv().unwrap();
+    let router = cluster.get_router(1).unwrap();
+    let (tx, rx) = sync_channel(1);
+    let msg = PeerMsg::FlushBeforeClose { tx };
+    router.force_send(source.id, msg).unwrap();
+    rx.recv().unwrap();
+
+    let (tx, rx) = channel();
+    let tx = Mutex::new(tx);
+    fail::cfg_callback("on_apply_res_commit_merge_2", move || {
+        tx.lock().unwrap().send(()).unwrap();
+    })
+    .unwrap();
+
+    cluster.stop_node(1);
+    // Need to avoid proposing commit merge before node 1 becomes leader.
+    // Otherwise, the commit merge will be rejected.
+    let (tx2, rx2) = channel();
+    let tx2 = Mutex::new(tx2);
+    fail::cfg_callback("on_applied_current_term", move || {
+        tx2.lock().unwrap().send(()).unwrap();
+    })
+    .unwrap();
+
+    fail::remove("maybe_propose_compact_log");
+    cluster.run_node(1).unwrap();
+
+    // We have two regions, so wait for two notifications.
+    rx2.recv().unwrap();
+    rx2.recv().unwrap();
+    fail::remove("on_ask_commit_merge");
+    // Wait for node 2 to apply commit merge.
+    rx.recv_timeout(Duration::from_secs(10)).unwrap();
+
+    wait_region_epoch_change(&cluster, &target, Duration::from_secs(5));
+
+    let region = cluster.get_region(b"k1");
+    assert_eq!(region.get_id(), target.get_id());
+    assert_eq!(region.get_start_key(), source.get_start_key());
+    assert_eq!(region.get_end_key(), target.get_end_key());
+
+    cluster.must_put(b"k400", b"v400");
+}
+
+// If a node is isolated during merge, and the target peer is replaced by a peer
+// with a larger ID, then the snapshot of the target peer covers the source
+// regions as well.
+// In such cases, the snapshot becomes an "atomic_snapshot" which needs to
+// destroy the source peer too.
+// This test case checks the race between destroying the source peer by atomic
+// snapshot and the gc message. The source peer must be successfully destroyed
+// in this case.
+#[test_case(test_raftstore::new_node_cluster)]
+fn test_destroy_race_during_atomic_snapshot_after_merge() {
+    let mut cluster = new_cluster(0, 3);
+    configure_for_merge(&mut cluster.cfg);
+    cluster.run();
+    let pd_client = Arc::clone(&cluster.pd_client);
+    pd_client.disable_default_operator();
+
+    cluster.must_transfer_leader(1, new_peer(1, 1));
+
+    cluster.must_put(b"k1", b"v1");
+    cluster.must_put(b"k3", b"v3");
+
+    let region = cluster.get_region(b"k1");
+    cluster.must_split(&region, b"k2");
+    let left = cluster.get_region(b"k1");
+    let right = cluster.get_region(b"k3");
+
+    // Allow raft messages to the source peer on store 3 before PrepareMerge.
+    let left_filter_block = Arc::new(atomic::AtomicBool::new(false));
+    let left_filter_block_ = left_filter_block.clone();
+    let left_blocked_messages = Arc::new(Mutex::new(vec![]));
+    let left_filter = RegionPacketFilter::new(left.get_id(), 3)
+        .direction(Direction::Recv)
+        .when(left_filter_block.clone())
+        .reserve_dropped(left_blocked_messages.clone())
+        .set_msg_callback(Arc::new(move |msg: &RaftMessage| {
+            debug!("dbg left msg_callback"; "msg" => ?msg);
+            if left_filter_block.load(atomic::Ordering::SeqCst) {
+                return;
+            }
+            for e in msg.get_message().get_entries() {
+                let ctx = raftstore::store::ProposalContext::from_bytes(&e.context);
+                if ctx.contains(raftstore::store::ProposalContext::PREPARE_MERGE) {
+                    // Block further messages.
+                    left_filter_block.store(true, atomic::Ordering::SeqCst);
+                }
+            }
+        }));
+    cluster.sim.wl().add_recv_filter(3, Box::new(left_filter));
+    // Block messages to the target peer on store 3.
+    let right_filter_block = Arc::new(atomic::AtomicBool::new(true));
+    let new_peer_id = 1004;
+    let (new_peer_id_tx, new_peer_id_rx) = std::sync::mpsc::channel();
+    let new_peer_id_tx = Mutex::new(Some(new_peer_id_tx));
+    let (new_peer_snap_tx, new_peer_snap_rx) = std::sync::mpsc::channel();
+    let new_peer_snap_tx = Mutex::new(new_peer_snap_tx);
+    let right_filter = RegionPacketFilter::new(right.get_id(), 3)
+        .direction(Direction::Recv)
+        .when(right_filter_block.clone())
+        .set_msg_callback(Arc::new(move |msg: &RaftMessage| {
+            debug!("dbg right msg_callback"; "msg" => ?msg);
+            if msg.get_to_peer().get_id() == new_peer_id {
+                let _ = new_peer_id_tx.lock().unwrap().take().map(|tx| tx.send(()));
+                if msg.get_message().get_msg_type() == MessageType::MsgSnapshot {
+                    let _ = new_peer_snap_tx.lock().unwrap().send(());
+                }
+            }
+        }));
+    cluster.sim.wl().add_recv_filter(3, Box::new(right_filter));
+    pd_client.must_merge(left.get_id(), right.get_id());
+
+    // Make the target peer on store 3 a stale peer.
+    pd_client.must_remove_peer(right.get_id(), find_peer(&right, 3).unwrap().to_owned());
+    pd_client.must_add_peer(right.get_id(), new_peer(3, new_peer_id));
+    // Unblock messages to the target peer on store 3.
+    right_filter_block.store(false, atomic::Ordering::SeqCst);
+    // Wait for receiving the new peer id message that destroys the stale target
+    // peer.
+    new_peer_id_rx.recv_timeout(Duration::from_secs(5)).unwrap();
+    cluster.must_region_not_exist(right.get_id(), 3);
+    // Let the source peer continue prepare merge. It will fail to schedule
+    // merge, because the target peer is destroyed.
+    left_filter_block_.store(false, atomic::Ordering::SeqCst);
+    // Before sending the blocked messages, make sure the source peer is paused
+    // at destroying the apply delegate, so that the new right peer's snapshot
+    // will try to destroy the source peer before applying the snapshot.
+    fail::cfg("on_apply_handle_destroy", "pause").unwrap();
+    // Send the blocked messages to the source peer. Prepare merge must fail to
+    // schedule CommitMerge because the stale target peer is now destroyed.
+    let router = cluster.sim.wl().get_router(3).unwrap();
+    for raft_msg in std::mem::take(&mut *left_blocked_messages.lock().unwrap()) {
+        router.send_raft_message(raft_msg).unwrap();
+    }
+    // Wait for the new right peer's snapshot.
+    new_peer_snap_rx
+        .recv_timeout(Duration::from_secs(5))
+        .unwrap();
+    // Give it some time to step the snapshot message.
+    sleep_ms(500);
+    // Let the source peer destroy continue, so it races with the atomic
+    // snapshot destroy.
+    fail::remove("on_apply_handle_destroy");
+
+    // The new peer applies the snapshot eventually.
+    cluster.must_transfer_leader(right.get_id(), new_peer(3, new_peer_id));
+    cluster.must_put(b"k4", b"v4");
+}
+
+// `test_raft_log_gc_after_merge` tests that when a region is destroyed, e.g.
+// due to region merge, PeerFsm can still handle pending raft messages
+// correctly.
+#[test]
+fn test_raft_log_gc_after_merge() {
+    let mut cluster = new_node_cluster(0, 1);
+    configure_for_merge(&mut cluster.cfg);
+    cluster.cfg.raft_store.store_batch_system.pool_size = 2;
+    cluster.run();
+    let pd_client = Arc::clone(&cluster.pd_client);
+    pd_client.disable_default_operator();
+
+    cluster.must_put(b"k1", b"v1");
+    cluster.must_put(b"k3", b"v3");
+
+    let region = cluster.get_region(b"k1");
+    cluster.must_split(&region, b"k2");
+    let left = cluster.get_region(b"k1");
+    let right = cluster.get_region(b"k3");
+
+    fail::cfg_callback("destroy_region_before_gc_flush", move || {
+        fail::cfg("pause_on_peer_collect_message", "pause").unwrap();
+    })
+    .unwrap();
+
+    let (tx, rx) = channel();
+    let tx = Arc::new(Mutex::new(tx));
+    fail::cfg_callback("destroy_region_after_gc_flush", move || {
+        tx.lock().unwrap().send(()).unwrap();
+    })
+    .unwrap();
+
+    // The right peer's id is 1.
+    pd_client.must_merge(right.get_id(), left.get_id());
+    rx.recv_timeout(Duration::from_secs(1)).unwrap();
+
+    let raft_router = cluster.get_router(1).unwrap();
+    raft_router
+        .send_casual_msg(1, CasualMessage::ForceCompactRaftLogs)
+        .unwrap();
+
+    fail::remove("pause_on_peer_collect_message");
+
+    // Wait some time for the merge to finish.
+    std::thread::sleep(Duration::from_secs(1));
+    must_get_equal(&cluster.get_engine(1), b"k3", b"v3");
+}
diff --git a/tests/failpoints/cases/test_pending_peers.rs b/tests/failpoints/cases/test_pending_peers.rs
index c41c97034b4..b584f24c83c 100644
--- a/tests/failpoints/cases/test_pending_peers.rs
+++ b/tests/failpoints/cases/test_pending_peers.rs
@@ -1,7 +1,10 @@
 // Copyright 2017 TiKV Project Authors. Licensed under Apache-2.0.
 
-use std::sync::Arc;
+use std::{sync::Arc, time::Duration};
 
+use crossbeam::channel;
+use kvproto::raft_serverpb::RaftMessage;
+use raft::eraftpb::MessageType;
 use test_raftstore::*;
 use tikv_util::{config::*, time::Instant};
 
@@ -109,3 +112,158 @@ fn test_pending_snapshot() {
         state2
     );
 }
+
+// Tests whether the store is marked as busy when some of its peers are still
+// busy applying raft logs.
+#[test]
+fn test_on_check_busy_on_apply_peers() {
+    let mut cluster = new_node_cluster(0, 3);
+    cluster.cfg.raft_store.raft_base_tick_interval = ReadableDuration::millis(5);
+    cluster.cfg.raft_store.raft_store_max_leader_lease = ReadableDuration::millis(100);
+    cluster.cfg.raft_store.leader_transfer_max_log_lag = 10;
+    // Short check interval for recovery.
+    cluster.cfg.raft_store.check_long_uncommitted_interval = ReadableDuration::millis(10);
+    cluster.cfg.raft_store.pd_heartbeat_tick_interval = ReadableDuration::millis(50);
+
+    let pd_client = Arc::clone(&cluster.pd_client);
+    // Disable the default max peer count check.
+    pd_client.disable_default_operator();
+
+    let r1 = cluster.run_conf_change();
+    pd_client.must_add_peer(r1, new_peer(2, 1002));
+    pd_client.must_add_peer(r1, new_peer(3, 1003));
+
+    cluster.must_put(b"k1", b"v1");
+    must_get_equal(&cluster.get_engine(2), b"k1", b"v1");
+    must_get_equal(&cluster.get_engine(3), b"k1", b"v1");
+
+    // Check the start status for peer 1003.
+    cluster.must_send_store_heartbeat(3);
+    sleep_ms(100);
+    let stats = cluster.pd_client.get_store_stats(3).unwrap();
+    assert!(!stats.is_busy);
+
+    // Pause peer 1003 on applying logs to make it pending.
+    let before_apply_stat = cluster.apply_state(r1, 3);
+    cluster.stop_node(3);
+    for i in 0..=cluster.cfg.raft_store.leader_transfer_max_log_lag {
+        let bytes = format!("k{:03}", i).into_bytes();
+        cluster.must_put(&bytes, &bytes);
+    }
+    cluster.must_put(b"k2", b"v2");
+    must_get_equal(&cluster.get_engine(1), b"k2", b"v2");
+    must_get_equal(&cluster.get_engine(2), b"k2", b"v2");
+
+    // Restart peer 1003 and make it busy applying pending logs.
+    fail::cfg("on_handle_apply_1003", "pause").unwrap();
+    // Case 1: check that the leader committed index carried by MsgAppend and
+    // MsgReadIndexResp is valid.
+    let (read_tx, read_rx) = channel::unbounded::<RaftMessage>();
+    let (append_tx, append_rx) = channel::unbounded::<RaftMessage>();
+    cluster.add_send_filter_on_node(
+        1,
+        Box::new(
+            RegionPacketFilter::new(r1, 1)
+                .direction(Direction::Send)
+                .msg_type(MessageType::MsgReadIndexResp)
+                .set_msg_callback(Arc::new(move |msg: &RaftMessage| {
+                    read_tx.send(msg.clone()).unwrap();
+                })),
+        ),
+    );
+    cluster.add_send_filter_on_node(
+        1,
+        Box::new(
+            RegionPacketFilter::new(r1, 1)
+                .direction(Direction::Send)
+                .msg_type(MessageType::MsgAppend)
+                .set_msg_callback(Arc::new(move |msg: &RaftMessage| {
+                    append_tx.send(msg.clone()).unwrap();
+                })),
+        ),
+    );
+    let leader_apply_state = cluster.apply_state(r1, 1);
+    cluster.run_node(3).unwrap();
+    let append_msg = append_rx.recv_timeout(Duration::from_secs(2)).unwrap();
+    assert_eq!(
+        append_msg.get_message().get_commit(),
+        leader_apply_state.applied_index
+    );
+    let read_msg = read_rx.recv_timeout(Duration::from_secs(2)).unwrap();
+    assert_eq!(
+        read_msg.get_message().get_index(),
+        leader_apply_state.applied_index
+    );
+    cluster.clear_send_filter_on_node(1);
+
+    // Case 2: completed regions < target count.
+    let after_apply_stat = cluster.apply_state(r1, 3);
+    assert!(after_apply_stat.applied_index == before_apply_stat.applied_index);
+    sleep_ms(100);
+    cluster.must_send_store_heartbeat(3);
+    sleep_ms(100);
+    let stats = cluster.pd_client.get_store_stats(3).unwrap();
+    assert!(stats.is_busy);
+    sleep_ms(100);
+
+    // Case 3: completed_apply_peers_count > completed_target_count, but there
+    // still exist busy peers.
+    fail::cfg("on_mock_store_completed_target_count", "return").unwrap();
+    cluster.must_send_store_heartbeat(3);
+    sleep_ms(100);
+    let stats = cluster.pd_client.get_store_stats(3).unwrap();
+    assert!(stats.is_busy);
+    fail::remove("on_mock_store_completed_target_count");
+    // After peer 1003 is recovered, the store should not be marked as busy
+    // either.
+    fail::remove("on_handle_apply_1003");
+    sleep_ms(100);
+    must_get_equal(&cluster.get_engine(3), b"k2", b"v2");
+    sleep_ms(100);
+    let after_apply_stat = cluster.apply_state(r1, 3);
+    assert!(after_apply_stat.applied_index > before_apply_stat.applied_index);
+    cluster.must_send_store_heartbeat(3);
+    sleep_ms(100);
+    let stats = cluster.pd_client.get_store_stats(3).unwrap();
+    assert!(!stats.is_busy);
+}
+
+#[test]
+fn test_on_apply_snap_failed() {
+    let mut cluster = new_node_cluster(0, 3);
+    cluster.cfg.raft_store.raft_base_tick_interval = ReadableDuration::millis(5);
+    cluster.cfg.raft_store.raft_store_max_leader_lease = ReadableDuration::millis(100);
+    cluster.cfg.raft_store.pd_heartbeat_tick_interval = ReadableDuration::millis(100);
+    cluster.cfg.raft_store.pd_store_heartbeat_tick_interval = ReadableDuration::millis(100);
+
+    let pd_client = Arc::clone(&cluster.pd_client);
+    // Disable the default max peer count check.
+    pd_client.disable_default_operator();
+
+    let region_id = cluster.run_conf_change();
+    pd_client.must_add_peer(region_id, new_peer(2, 2));
+
+    // To ensure peer 2 is not pending.
+    cluster.must_put(b"k1", b"v1");
+    must_get_equal(&cluster.get_engine(2), b"k1", b"v1");
+
+    // Mock a snapshot apply failure on peer 3.
+    fail::cfg("region_apply_snap_io_err", "return").unwrap();
+    pd_client.must_add_peer(region_id, new_peer(3, 3));
+    // The region worker fails to apply the snapshot.
+    test_util::eventually(Duration::from_millis(100), Duration::from_secs(1), || {
+        let pending_peers = pd_client.get_pending_peers();
+        pending_peers[&3] == new_peer(3, 3)
+    });
+    must_get_none(&cluster.get_engine(3), b"k1");
+    cluster.must_send_store_heartbeat(3);
+    // Check that the region is marked as damaged.
+    test_util::eventually(Duration::from_millis(100), Duration::from_secs(1), || {
+        if let Some(stats) = pd_client.get_store_stats(3) {
+            !stats.damaged_regions_id.is_empty()
+        } else {
+            false
+        }
+    });
+    let stats = pd_client.get_store_stats(3).unwrap();
+    assert!(stats.damaged_regions_id.contains(&region_id));
+    fail::remove("region_apply_snap_io_err");
+}
diff --git a/tests/failpoints/cases/test_rawkv.rs b/tests/failpoints/cases/test_rawkv.rs
index a795422c120..1716068321e 100644
--- a/tests/failpoints/cases/test_rawkv.rs
+++ b/tests/failpoints/cases/test_rawkv.rs
@@ -289,7 +289,8 @@ fn test_raw_put_key_guard() {
     let region_id = region.get_id();
     let client = suite.get_client(region_id);
     let ctx = suite.get_context(region_id);
-    let node_id = region.get_peers()[0].get_id();
+    let leader = suite.cluster.leader_of_region(region_id).unwrap();
+    let node_id = leader.get_id();
     let leader_cm = suite.cluster.sim.rl().get_concurrency_manager(node_id);
     let ts_provider = suite.get_causal_ts_provider(node_id).unwrap();
     let ts = block_on(ts_provider.async_get_ts()).unwrap();
@@ -304,9 +305,10 @@ fn test_raw_put_key_guard() {
     // Wait for global_min_lock_ts.
     sleep_ms(500);
 
     let start = Instant::now();
-    while leader_cm.global_min_lock_ts().is_none()
-        && start.saturating_elapsed() < Duration::from_secs(5)
-    {
+    while leader_cm.global_min_lock_ts().is_none() {
+        if start.saturating_elapsed() > Duration::from_secs(5) {
+            panic!("wait for global_min_lock_ts timeout");
+        }
         sleep_ms(200);
     }
 
diff --git a/tests/failpoints/cases/test_read_execution_tracker.rs b/tests/failpoints/cases/test_read_execution_tracker.rs
index c5ff93a70c1..7351044b297 100644
--- a/tests/failpoints/cases/test_read_execution_tracker.rs
+++ b/tests/failpoints/cases/test_read_execution_tracker.rs
@@ -2,13 +2,13 @@
 
 use kvproto::kvrpcpb::*;
 use test_coprocessor::{init_with_data, DagSelect, ProductTable};
-use test_raftstore::{
-    kv_batch_read, kv_read, must_kv_commit, must_kv_prewrite, must_new_cluster_and_kv_client,
-};
+use test_raftstore::{kv_batch_read, kv_read, must_kv_commit, must_kv_prewrite};
+use test_raftstore_macro::test_case;
 
-#[test]
+#[test_case(test_raftstore::must_new_cluster_and_kv_client)]
+#[test_case(test_raftstore_v2::must_new_cluster_and_kv_client)]
 fn test_read_execution_tracking() {
-    let (_cluster, client, ctx) = must_new_cluster_and_kv_client();
+    let (_cluster, client, ctx) = new_cluster();
     let (k1, v1) = (b"k1".to_vec(), b"v1".to_vec());
     let (k2, v2) = (b"k2".to_vec(), b"v2".to_vec());
 
@@ -104,18 +104,21 @@ fn test_read_execution_tracking() {
         );
     };
 
-    fail::cfg("perform_read_index", "return()").unwrap();
+    // Return from read_index twice: once for the local reader and once for
+    // raftstore.
+    fail::cfg("perform_read_index", "2*return()").unwrap();
 
     // should perform read index
     let resp = kv_read(&client, ctx.clone(), k1.clone(), 100);
 
     read_index_checker(resp.get_exec_details_v2().get_scan_detail_v2());
 
+    fail::cfg("perform_read_index", "2*return()").unwrap();
+
     // should perform read index
     let resp = kv_batch_read(&client, ctx, vec![k1, k2], 100);
 
     read_index_checker(resp.get_exec_details_v2().get_scan_detail_v2());
 
+    fail::cfg("perform_read_index", "2*return()").unwrap();
+
     // should perform read index
     let resp = client.coprocessor(&coprocessor_request).unwrap();
 
diff --git a/tests/failpoints/cases/test_replica_stale_read.rs b/tests/failpoints/cases/test_replica_stale_read.rs
index b7d436d92d7..cb986250d82 100644
--- a/tests/failpoints/cases/test_replica_stale_read.rs
+++ b/tests/failpoints/cases/test_replica_stale_read.rs
@@ -288,9 +288,11 @@ fn test_update_resoved_ts_before_apply_index() {
     sleep_ms(100);
 
     // The leader can't handle stale read with `commit_ts2` because its `safe_ts`
-    // can't update due to its `apply_index` not update
+    // can't be updated due to its `apply_index` not being updated.
+    // The request would be handled as a snapshot read on the valid leader peer
+    // after fallback.
     let resp = leader_client.kv_read(b"key1".to_vec(), commit_ts2);
-    assert!(resp.get_region_error().has_data_is_not_ready(),);
+    assert_eq!(resp.get_value(), b"value2");
     // The follower can't handle stale read with `commit_ts2` because it don't
     // have enough data
     let resp = follower_client2.kv_read(b"key1".to_vec(), commit_ts2);
@@ -667,10 +669,10 @@ fn test_stale_read_future_ts_not_update_max_ts() {
         b"key1".to_vec(),
    );
 
-    // Perform stale read with a future ts should return error
+    // A stale read with a future ts can be processed by falling back to a
+    // snapshot read on the leader peer.
     let read_ts = get_tso(&pd_client) + 10000000;
-    let resp = leader_client.kv_read(b"key1".to_vec(), read_ts);
-    assert!(resp.get_region_error().has_data_is_not_ready());
+    leader_client.must_kv_read_equal(b"key1".to_vec(), b"value1".to_vec(), read_ts);
 
     // The `max_ts` should not updated by the stale read request, so we can prewrite
     // and commit `async_commit` transaction with a ts that smaller than the
@@ -687,10 +689,10 @@ fn test_stale_read_future_ts_not_update_max_ts() {
     leader_client.must_kv_commit(vec![b"key2".to_vec()], prewrite_ts, commit_ts);
     leader_client.must_kv_read_equal(b"key2".to_vec(), b"value1".to_vec(), get_tso(&pd_client));
 
-    // Perform stale read with a future ts should return error
+    // A stale read with a future ts can be processed by falling back to a
+    // snapshot read on the leader peer.
     let read_ts = get_tso(&pd_client) + 10000000;
-    let resp = leader_client.kv_read(b"key1".to_vec(), read_ts);
-    assert!(resp.get_region_error().has_data_is_not_ready());
+    leader_client.must_kv_read_equal(b"key2".to_vec(), b"value1".to_vec(), read_ts);
 
     // The `max_ts` should not updated by the stale read request, so 1pc transaction
     // with a ts that smaller than the `read_ts` should not be fallbacked to 2pc
diff --git a/tests/failpoints/cases/test_snap.rs b/tests/failpoints/cases/test_snap.rs
index 7748b1d2985..8f2ae2f61cc 100644
--- a/tests/failpoints/cases/test_snap.rs
+++ b/tests/failpoints/cases/test_snap.rs
@@ -992,3 +992,58 @@ fn test_snapshot_send_failed() {
     sleep_ms(100);
     assert!(mgr.list_snapshot().unwrap().is_empty());
 }
+
+#[test]
+/// Test that a corrupted snapshot can be detected and a new one generated on
+/// retry.
+fn test_retry_corrupted_snapshot() {
+    let mut cluster = new_node_cluster(0, 3);
+    let pd_client = cluster.pd_client.clone();
+    pd_client.disable_default_operator();
+
+    let r = cluster.run_conf_change();
+    cluster.must_put(b"k1", b"v1");
+    must_get_none(&cluster.get_engine(3), b"k1");
+    pd_client.must_add_peer(r, new_peer(2, 2));
+    fail::cfg("inject_sst_file_corruption", "return").unwrap();
+    pd_client.must_add_peer(r, new_peer(3, 3));
+
+    must_get_equal(&cluster.get_engine(3), b"k1", b"v1");
+}
+
+#[test]
+fn test_send_snapshot_timeout() {
+    let mut cluster = new_server_cluster(1, 5);
+    cluster.cfg.raft_store.raft_log_gc_tick_interval = ReadableDuration::millis(20);
+    cluster.cfg.raft_store.raft_log_gc_count_limit = Some(8);
+    cluster.cfg.raft_store.merge_max_log_gap = 3;
+    let pd_client = Arc::clone(&cluster.pd_client);
+    pd_client.disable_default_operator();
+
+    cluster.run();
+    cluster.must_transfer_leader(1, new_peer(1, 1));
+    cluster.stop_node(4);
+    cluster.stop_node(5);
+    (0..10).for_each(|_| cluster.must_put(b"k2", b"v2"));
+    // Sleep for a while to ensure all logs are compacted.
+    thread::sleep(Duration::from_millis(100));
+
+    fail::cfg("snap_send_duration_timeout", "return(100)").unwrap();
+
+    // Let store 4 inform the leader to generate a snapshot.
+    cluster.run_node(4).unwrap();
+    must_get_equal(&cluster.get_engine(4), b"k2", b"v2");
+
+    // Add a delay to make the snapshot send fail due to timeout.
+    fail::cfg("snap_send_timer_delay", "return(1000)").unwrap();
+    cluster.run_node(5).unwrap();
+    thread::sleep(Duration::from_millis(150));
+    must_get_none(&cluster.get_engine(5), b"k2");
+
+    // Only delay once; the snapshot should succeed after a retry.
+ fail::cfg("snap_send_timer_delay", "1*return(1000)").unwrap(); + thread::sleep(Duration::from_millis(500)); + must_get_equal(&cluster.get_engine(5), b"k2", b"v2"); + + fail::remove("snap_send_timer_delay"); + fail::remove("snap_send_duration_timeout"); +} diff --git a/tests/failpoints/cases/test_split_region.rs b/tests/failpoints/cases/test_split_region.rs index 3520de4e3ad..35148f20db2 100644 --- a/tests/failpoints/cases/test_split_region.rs +++ b/tests/failpoints/cases/test_split_region.rs @@ -1,5 +1,4 @@ // Copyright 2018 TiKV Project Authors. Licensed under Apache-2.0. - use std::{ sync::{ atomic::{AtomicBool, Ordering}, @@ -18,6 +17,7 @@ use kvproto::{ Mutation, Op, PessimisticLockRequest, PrewriteRequest, PrewriteRequestPessimisticAction::*, }, metapb::Region, + pdpb::CheckPolicy, raft_serverpb::{PeerState, RaftMessage}, tikvpb::TikvClient, }; @@ -32,6 +32,7 @@ use raftstore::{ Result, }; use test_raftstore::*; +use test_raftstore_macro::test_case; use tikv::storage::{kv::SnapshotExt, Snapshot}; use tikv_util::{ config::{ReadableDuration, ReadableSize}, @@ -41,6 +42,85 @@ use tikv_util::{ }; use txn_types::{Key, LastChange, PessimisticLock, TimeStamp}; +#[test] +fn test_meta_inconsistency() { + let mut cluster = new_server_cluster(0, 3); + cluster.cfg.raft_store.store_batch_system.pool_size = 2; + cluster.cfg.raft_store.store_batch_system.max_batch_size = Some(1); + cluster.cfg.raft_store.apply_batch_system.pool_size = 2; + cluster.cfg.raft_store.apply_batch_system.max_batch_size = Some(1); + cluster.cfg.raft_store.hibernate_regions = false; + cluster.cfg.raft_store.raft_log_gc_threshold = 1000; + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + let region_id = cluster.run_conf_change(); + pd_client.must_add_peer(region_id, new_peer(2, 2)); + cluster.must_transfer_leader(region_id, new_peer(1, 1)); + cluster.must_put(b"k1", b"v1"); + + // Add new peer on node 3, its snapshot apply is paused. + fail::cfg("before_set_region_on_peer_3", "pause").unwrap(); + pd_client.must_add_peer(region_id, new_peer(3, 3)); + + // Let only heartbeat msg to pass so a replicate peer could be created on node 3 + // for peer 1003. + let region_packet_filter_region_1000_peer_1003 = + RegionPacketFilter::new(1000, 3).skip(MessageType::MsgHeartbeat); + cluster + .sim + .wl() + .add_recv_filter(3, Box::new(region_packet_filter_region_1000_peer_1003)); + + // Trigger a region split to create region 1000 with peer 1001, 1002 and 1003. + let region = cluster.get_region(b""); + cluster.must_split(®ion, b"k5"); + + // Scheduler a larger peed id heartbeat msg to trigger peer destroy for peer + // 1003, pause it before the meta.lock operation so new region insertions by + // region split could go first. + // Thus a inconsistency could happen because the destroy is handled + // by a uninitialized peer but the new initialized region info is inserted into + // the meta by region split. 
+ fail::cfg("before_destroy_peer_on_peer_1003", "pause").unwrap(); + let new_region = cluster.get_region(b"k4"); + let mut larger_id_msg = Box::::default(); + larger_id_msg.set_region_id(1000); + larger_id_msg.set_to_peer(new_peer(3, 1113)); + larger_id_msg.set_region_epoch(new_region.get_region_epoch().clone()); + larger_id_msg + .mut_region_epoch() + .set_conf_ver(new_region.get_region_epoch().get_conf_ver() + 1); + larger_id_msg.set_from_peer(new_peer(1, 1001)); + let raft_message = larger_id_msg.mut_message(); + raft_message.set_msg_type(MessageType::MsgHeartbeat); + raft_message.set_from(1001); + raft_message.set_to(1113); + raft_message.set_term(6); + cluster.sim.wl().send_raft_msg(*larger_id_msg).unwrap(); + thread::sleep(Duration::from_millis(500)); + + // Let snapshot apply continue on peer 3 from region 0, then region split would + // be applied too. + fail::remove("before_set_region_on_peer_3"); + thread::sleep(Duration::from_millis(2000)); + + // Let self destroy continue after the region split is finished. + fail::remove("before_destroy_peer_on_peer_1003"); + sleep_ms(1000); + + // Clear the network partition nemesis, trigger a new region split, panic would + // be encountered The thread 'raftstore-3-1::test_message_order_3' panicked + // at 'meta corrupted: no region for 1000 7A6B35 when creating 1004 + // region_id: 1004 from_peer { id: 1005 store_id: 1 } to_peer { id: 1007 + // store_id: 3 } message { msg_type: MsgRequestPreVote to: 1007 from: 1005 + // term: 6 log_term: 5 index: 5 commit: 5 commit_term: 5 } region_epoch { + // conf_ver: 3 version: 3 } end_key: 6B32'. + cluster.sim.wl().clear_recv_filters(3); + let region = cluster.get_region(b"k1"); + cluster.must_split(®ion, b"k2"); + cluster.must_put(b"k1", b"v1"); +} + #[test] fn test_follower_slow_split() { let mut cluster = new_node_cluster(0, 3); @@ -268,6 +348,68 @@ impl Filter for PrevoteRangeFilter { } } +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] +fn test_region_size_after_split() { + let mut cluster = new_cluster(0, 1); + cluster.cfg.raft_store.right_derive_when_split = true; + cluster.cfg.raft_store.split_region_check_tick_interval = ReadableDuration::millis(100); + cluster.cfg.raft_store.pd_heartbeat_tick_interval = ReadableDuration::millis(100); + cluster.cfg.raft_store.region_split_check_diff = Some(ReadableSize(10)); + let region_max_size = 1440; + let region_split_size = 960; + cluster.cfg.coprocessor.region_max_size = Some(ReadableSize(region_max_size)); + cluster.cfg.coprocessor.region_split_size = Some(ReadableSize(region_split_size)); + let pd_client = cluster.pd_client.clone(); + pd_client.disable_default_operator(); + let _r = cluster.run_conf_change(); + + // insert 20 key value pairs into the cluster. + // from 000000001 to 000000020 + let mut range = 1..; + put_till_size(&mut cluster, region_max_size - 100, &mut range); + sleep_ms(100); + // disable check split. + fail::cfg("on_split_region_check_tick", "return").unwrap(); + let max_key = put_till_size(&mut cluster, region_max_size, &mut range); + // split by use key, split region 1 to region 1 and region 2. 
+    // region 1: ["000000010", "")
+    // region 2: ["", "000000010")
+    let region = pd_client.get_region(&max_key).unwrap();
+    cluster.must_split(&region, b"000000010");
+    let size = cluster
+        .pd_client
+        .get_region_approximate_size(region.get_id())
+        .unwrap_or_default();
+    assert!(size >= region_max_size - 100, "{}", size);
+
+    let region = pd_client.get_region(b"000000009").unwrap();
+    let size1 = cluster
+        .pd_client
+        .get_region_approximate_size(region.get_id())
+        .unwrap_or_default();
+    assert_eq!(0, size1, "{}", size1);
+
+    // Split the region by the size check; region 1 will be split into region 1
+    // and region 3, and region 3 will contain about half of the region's data.
+    let region = pd_client.get_region(&max_key).unwrap();
+    pd_client.split_region(region.clone(), CheckPolicy::Scan, vec![]);
+    sleep_ms(200);
+    let size2 = cluster
+        .pd_client
+        .get_region_approximate_size(region.get_id())
+        .unwrap_or_default();
+    assert!(size > size2, "{}:{}", size, size2);
+    fail::remove("on_split_region_check_tick");
+
+    let region = pd_client.get_region(b"000000010").unwrap();
+    let size3 = cluster
+        .pd_client
+        .get_region_approximate_size(region.get_id())
+        .unwrap_or_default();
+    assert!(size3 > 0, "{}", size3);
+}
+
 // Test if a peer is created from splitting when another initialized peer with
 // the same region id has already existed. In previous implementation, it can be
 // created and panic will happen because there are two initialized peer with the
@@ -410,6 +552,115 @@ fn test_split_not_to_split_existing_tombstone_region() {
     must_get_equal(&cluster.get_engine(2), b"k1", b"v1");
 }
 
+#[test]
+fn test_stale_peer_handle_snap() {
+    test_stale_peer_handle_raft_msg("on_snap_msg_1000_2");
+}
+
+#[test]
+fn test_stale_peer_handle_vote() {
+    test_stale_peer_handle_raft_msg("on_vote_msg_1000_2");
+}
+
+#[test]
+fn test_stale_peer_handle_append() {
+    test_stale_peer_handle_raft_msg("on_append_msg_1000_2");
+}
+
+#[test]
+fn test_stale_peer_handle_heartbeat() {
+    test_stale_peer_handle_raft_msg("on_heartbeat_msg_1000_2");
+}
+
+fn test_stale_peer_handle_raft_msg(on_handle_raft_msg_1000_2_fp: &str) {
+    // The following diagram represents the final state of the test:
+    //
+    //              ┌───────────┐  ┌───────────┐  ┌───────────┐
+    //              │           │  │           │  │           │
+    // Region 1     │  Peer 1   │  │  Peer 2   │  │  Peer 3   │
+    // [k2, +∞)     │           │  │           │  │           │
+    // ─────────────┼───────────┼──┼───────────┼──┼───────────┼──
+    //              │           │  │           │  │           │
+    // Region 1000  │ Peer 1001 │  │ Peer 1003 │  │ Peer 1002 │
+    // (-∞, k2)     │           │  │           │  │           │
+    //              └───────────┘  └───────────┘  └───────────┘
+    //                 Store 1        Store 2        Store 3
+    //
+    // In this test, there is a split operation and Peer 1003 will be created
+    // twice (by raft message and by split). The new Peer 1003 will replace the
+    // old Peer 1003, but it will be immediately removed. This test verifies
+    // that TiKV would not panic if the old Peer 1003 continues to process a
+    // remaining raft message (which may be a snapshot/vote/heartbeat/append
+    // message).
+
+    let mut cluster = new_node_cluster(0, 3);
+    configure_for_merge(&mut cluster.cfg);
+    cluster.cfg.raft_store.right_derive_when_split = true;
+    cluster.cfg.raft_store.store_batch_system.max_batch_size = Some(1);
+    cluster.cfg.raft_store.store_batch_system.pool_size = 2;
+    cluster.cfg.raft_store.apply_batch_system.max_batch_size = Some(1);
+    cluster.cfg.raft_store.apply_batch_system.pool_size = 2;
+    let pd_client = Arc::clone(&cluster.pd_client);
+    pd_client.disable_default_operator();
+
+    fail::cfg("on_raft_gc_log_tick", "return()").unwrap();
+    let r1 = cluster.run_conf_change();
+    // Add Peer 3.
+    pd_client.must_add_peer(r1, new_peer(3, 3));
+    assert_eq!(r1, 1);
+
+    // Pause the snapshot apply of Peer 2.
+    let before_check_snapshot_1_2_fp = "before_check_snapshot_1_2";
+    fail::cfg(before_check_snapshot_1_2_fp, "pause").unwrap();
+
+    // Add Peer 2. The peer will be created but stuck at applying the snapshot
+    // due to the failpoint above.
+    pd_client.must_add_peer(r1, new_peer(2, 2));
+    cluster.must_put(b"k1", b"v1");
+
+    // Before the split, pause Peer 1003 when processing a certain raft message.
+    // The message type depends on the failpoint name input.
+    fail::cfg(on_handle_raft_msg_1000_2_fp, "pause").unwrap();
+
+    // Split the region into Region 1 and Region 1000. Peer 1003 will be created
+    // for the first time when it receives a raft message from Peer 1001, but it
+    // will remain uninitialized because it's paused due to the failpoint above.
+    let region = pd_client.get_region(b"k1").unwrap();
+
+    cluster.must_split(&region, b"k2");
+    cluster.must_put(b"k22", b"v22");
+
+    // Check that Store 2 doesn't have any data yet.
+    must_get_none(&cluster.get_engine(2), b"k1");
+    must_get_none(&cluster.get_engine(2), b"k22");
+
+    // Unblock Peer 2. It will proceed to apply the split operation, which
+    // creates Peer 1003 for the second time and replaces the old Peer 1003.
+    fail::remove(before_check_snapshot_1_2_fp);
+
+    // Verify that data can be accessed from Peer 2 and the new Peer 1003.
+    must_get_equal(&cluster.get_engine(2), b"k1", b"v1");
+    must_get_equal(&cluster.get_engine(2), b"k22", b"v22");
+
+    // Immediately remove the new Peer 1003. This removes the region metadata.
+    let left = pd_client.get_region(b"k1").unwrap();
+    let left_peer_2 = find_peer(&left, 2).cloned().unwrap();
+    pd_client.must_remove_peer(left.get_id(), left_peer_2);
+    must_get_none(&cluster.get_engine(2), b"k1");
+    must_get_equal(&cluster.get_engine(2), b"k22", b"v22");
+
+    // Unblock the old Peer 1003 so that it can continue to process its raft
+    // message. It would lead to a panic when it processes a snapshot message if
+    // #17469 is not fixed.
+    fail::remove(on_handle_raft_msg_1000_2_fp);
+
+    // Wait for the stale peer to handle its raft message.
+    sleep_ms(300);
+
+    must_get_none(&cluster.get_engine(2), b"k1");
+    must_get_equal(&cluster.get_engine(2), b"k22", b"v22");
+}
+
 // TiKV uses memory lock to control the order between spliting and creating
 // new peer. This case test if tikv continues split if the peer is destroyed
 // after memory lock check.
@@ -674,7 +925,7 @@ impl Filter for CollectSnapshotFilter {
 #[test]
 fn test_split_duplicated_batch() {
     let mut cluster = new_node_cluster(0, 3);
-    configure_for_request_snapshot(&mut cluster);
+    configure_for_request_snapshot(&mut cluster.cfg);
     // Disable raft log gc in this test case.
     cluster.cfg.raft_store.raft_log_gc_tick_interval = ReadableDuration::secs(60);
     // Use one thread to make it more possible to be fetched into one batch.
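Most of the cases in this patch follow the same shape: configure a failpoint at the top, run the scenario, and `fail::remove` it at the end. If an assertion in the middle panics, the failpoint leaks into later tests in the same process. A small RAII guard makes the cleanup unconditional; this is a hedged sketch using only the public `fail::cfg`/`fail::remove` API, and `FailGuard` is a hypothetical name, not a helper these tests use:

```rust
// Hedged sketch: remove a failpoint when dropped, so a panicking assertion
// cannot leak `pause`/`return` actions into later test cases.
struct FailGuard(&'static str);

impl FailGuard {
    fn new(name: &'static str, actions: &str) -> FailGuard {
        fail::cfg(name, actions).unwrap();
        FailGuard(name)
    }
}

impl Drop for FailGuard {
    fn drop(&mut self) {
        fail::remove(self.0);
    }
}

// Usage mirroring the tests above:
// let _g = FailGuard::new("on_raft_gc_log_tick", "return()");
// ... test body; the failpoint is removed even if an assert panics ...
```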
@@ -1407,3 +1658,65 @@ fn test_split_region_with_no_valid_split_keys() {
     rx.recv_timeout(Duration::from_secs(5)).unwrap();
     rx.try_recv().unwrap_err();
 }
+
+/// This test case tests that if a split fails for some reason,
+/// the split check can keep running and the split will eventually finish.
+#[test_case(test_raftstore::new_node_cluster)]
+fn test_split_by_split_check_on_size() {
+    let mut cluster = new_cluster(0, 1);
+    cluster.cfg.raft_store.right_derive_when_split = true;
+    cluster.cfg.raft_store.split_region_check_tick_interval = ReadableDuration::millis(50);
+    cluster.cfg.raft_store.pd_heartbeat_tick_interval = ReadableDuration::millis(100);
+    cluster.cfg.raft_store.region_split_check_diff = Some(ReadableSize(10));
+    let region_max_size = 1440;
+    let region_split_size = 960;
+    cluster.cfg.coprocessor.region_max_size = Some(ReadableSize(region_max_size));
+    cluster.cfg.coprocessor.region_split_size = Some(ReadableSize(region_split_size));
+    let pd_client = cluster.pd_client.clone();
+    pd_client.disable_default_operator();
+    let _r = cluster.run_conf_change();
+
+    // Make the first split fail.
+    // `1*return` means the "return" action runs exactly once.
+    fail::cfg("fail_pre_propose_split", "1*return").unwrap();
+
+    // Insert region_max_size bytes of data into the cluster.
+    // It should trigger the split.
+    let mut range = 1..;
+    let key = put_till_size(&mut cluster, region_max_size / 2, &mut range);
+    let region = pd_client.get_region(&key).unwrap();
+    put_till_size(&mut cluster, region_max_size / 2 + 100, &mut range);
+    // Wait for the split.
+    cluster.wait_region_split(&region);
+}
+
+/// This test case tests that if a split fails for some reason,
+/// the split check can keep running and the split will eventually finish.
+#[test_case(test_raftstore::new_node_cluster)]
+fn test_split_by_split_check_on_keys() {
+    let mut cluster = new_cluster(0, 1);
+    cluster.cfg.raft_store.right_derive_when_split = true;
+    cluster.cfg.raft_store.split_region_check_tick_interval = ReadableDuration::millis(50);
+    cluster.cfg.raft_store.pd_heartbeat_tick_interval = ReadableDuration::millis(100);
+    cluster.cfg.raft_store.region_split_check_diff = Some(ReadableSize(10));
+    let region_max_keys = 15;
+    let region_split_keys = 10;
+    cluster.cfg.coprocessor.region_max_keys = Some(region_max_keys);
+    cluster.cfg.coprocessor.region_split_keys = Some(region_split_keys);
+    let pd_client = cluster.pd_client.clone();
+    pd_client.disable_default_operator();
+    let _r = cluster.run_conf_change();
+
+    // Make the first split fail.
+    // `1*return` means the "return" action runs exactly once.
+    fail::cfg("fail_pre_propose_split", "1*return").unwrap();
+
+    // Insert region_max_keys keys into the cluster.
+    // It should trigger the split.
+    let mut range = 1..;
+    let key = put_till_count(&mut cluster, region_max_keys / 2, &mut range);
+    let region = pd_client.get_region(&key).unwrap();
+    put_till_count(&mut cluster, region_max_keys / 2 + 3, &mut range);
+    // Wait for the split.
+    cluster.wait_region_split(&region);
+}
diff --git a/tests/failpoints/cases/test_sst_recovery.rs b/tests/failpoints/cases/test_sst_recovery.rs
index a4c1f10b5ae..da5a3da1a32 100644
--- a/tests/failpoints/cases/test_sst_recovery.rs
+++ b/tests/failpoints/cases/test_sst_recovery.rs
@@ -105,7 +105,7 @@ fn test_sst_recovery_overlap_range_sst_exist() {
     must_get_equal(&engine1, b"7", b"val_1");
 
     // Validate the damaged sst has been deleted.
-    compact_files_to_target_level(&engine1, true, 3).unwrap();
+    compact_files_to_target_level(&engine1, true, 6).unwrap();
     let files = engine1.as_inner().get_live_files();
     assert_eq!(files.get_files_count(), 1);
 
@@ -252,7 +252,7 @@ fn create_tikv_cluster_with_one_node_damaged()
     disturb_sst_file(&sst_path);
 
     // The sst file is damaged, so this action will fail.
-    assert_corruption(compact_files_to_target_level(&engine1, true, 3));
+    assert_corruption(compact_files_to_target_level(&engine1, true, 6));
 
     (cluster, pd_client, engine1)
 }
diff --git a/tests/failpoints/cases/test_stale_peer.rs b/tests/failpoints/cases/test_stale_peer.rs
index 39fa09ef014..df714ff7b09 100644
--- a/tests/failpoints/cases/test_stale_peer.rs
+++ b/tests/failpoints/cases/test_stale_peer.rs
@@ -12,6 +12,7 @@ use kvproto::raft_serverpb::{PeerState, RaftLocalState, RaftMessage};
 use pd_client::PdClient;
 use raft::eraftpb::MessageType;
 use test_raftstore::*;
+use test_raftstore_macro::test_case;
 use tikv_util::{config::ReadableDuration, time::Instant, HandyRwLock};
 
 #[test]
@@ -44,7 +45,8 @@ fn test_one_node_leader_missing() {
     fail::remove(check_stale_state);
 }
 
-#[test]
+#[test_case(test_raftstore::new_node_cluster)]
+#[test_case(test_raftstore_v2::new_node_cluster)]
 fn test_node_update_localreader_after_removed() {
     let mut cluster = new_node_cluster(0, 6);
     let pd_client = cluster.pd_client.clone();
@@ -90,7 +92,8 @@ fn test_node_update_localreader_after_removed() {
     cluster.must_region_not_exist(r1, 2);
 }
 
-#[test]
+#[test_case(test_raftstore::new_node_cluster)]
+#[test_case(test_raftstore_v2::new_node_cluster)]
 fn test_stale_learner_restart() {
     let mut cluster = new_node_cluster(0, 2);
     cluster.pd_client.disable_default_operator();
@@ -133,9 +136,11 @@ fn test_stale_learner_restart() {
     must_get_equal(&cluster.get_engine(2), b"k2", b"v2");
 }
 
+/// pass
 /// Test if a peer can be destroyed through tombstone msg when applying
 /// snapshot.
-#[test]
+#[test_case(test_raftstore::new_node_cluster)]
+#[test_case(test_raftstore_v2::new_node_cluster)]
 fn test_stale_peer_destroy_when_apply_snapshot() {
     let mut cluster = new_node_cluster(0, 3);
     configure_for_snapshot(&mut cluster.cfg);
@@ -205,14 +210,17 @@ fn test_stale_peer_destroy_when_apply_snapshot() {
     fail::remove(region_apply_snap_fp);
     // Wait for peer 3 changing `SnapState`
     sleep_ms(100);
-    cluster.sim.wl().send_raft_msg(tombstone_msg).unwrap();
+    // We expect the peer to be destroyed after applying the snapshot, without
+    // another message to trigger it.
 
     must_get_none(&cluster.get_engine(3), b"k1");
 }
 
+/// pass
 /// Test if destroy a uninitialized peer through tombstone msg would allow a
 /// staled peer be created again.
-#[test]
+#[test_case(test_raftstore::new_node_cluster)]
+#[test_case(test_raftstore_v2::new_node_cluster)]
 fn test_destroy_uninitialized_peer_when_there_exists_old_peer() {
     // 4 stores cluster.
     let mut cluster = new_node_cluster(0, 4);
@@ -291,7 +299,8 @@ fn test_destroy_uninitialized_peer_when_there_exists_old_peer() {
 /// Logs scan are now moved to raftlog gc threads. The case is to test if logs
 /// are still cleaned up when there is stale logs before first index during
 /// destroy.
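// The `-#[test]` → `+#[test_case(...)]` rewrites that dominate this patch come
// from `test_raftstore_macro`: one annotated function yields a generated test
// per listed cluster factory, and the cluster-construction call in the body is
// redirected to the factory named in the attribute. Roughly like the sketch
// below (the expansion and the generated test names are illustrative, not the
// macro's literal output):
//
//     #[test_case(test_raftstore::new_node_cluster)]
//     #[test_case(test_raftstore_v2::new_node_cluster)]
//     fn test_stale_learner_restart() {
//         let mut cluster = new_cluster(0, 2);
//         // ...shared assertions...
//     }
//
//     // ...behaves like two hand-written tests:
//     #[test]
//     fn test_stale_learner_restart_1() {
//         let mut cluster = test_raftstore::new_node_cluster(0, 2);
//         // ...shared assertions...
//     }
//     #[test]
//     fn test_stale_learner_restart_2() {
//         let mut cluster = test_raftstore_v2::new_node_cluster(0, 2);
//         // ...shared assertions...
//     }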
-#[test]
+#[test_case(test_raftstore::new_node_cluster)]
+#[test_case(test_raftstore_v2::new_node_cluster)]
 fn test_destroy_clean_up_logs_with_unfinished_log_gc() {
     let mut cluster = new_node_cluster(0, 3);
     cluster.cfg.raft_store.raft_log_gc_count_limit = Some(15);
diff --git a/tests/failpoints/cases/test_stale_read.rs b/tests/failpoints/cases/test_stale_read.rs
index 523bb54f7cb..a9c6fa5d6e6 100644
--- a/tests/failpoints/cases/test_stale_read.rs
+++ b/tests/failpoints/cases/test_stale_read.rs
@@ -325,7 +325,7 @@ fn test_read_index_when_transfer_leader_2() {
     // Increase the election tick to make this test case running reliably.
     configure_for_lease_read(&mut cluster.cfg, Some(50), Some(10_000));
     // Stop log compaction to transfer leader with filter easier.
-    configure_for_request_snapshot(&mut cluster);
+    configure_for_request_snapshot(&mut cluster.cfg);
     let max_lease = Duration::from_secs(2);
     cluster.cfg.raft_store.raft_store_max_leader_lease = ReadableDuration(max_lease);
 
diff --git a/tests/failpoints/cases/test_storage.rs b/tests/failpoints/cases/test_storage.rs
index 57047bef9d4..fec1ccc931d 100644
--- a/tests/failpoints/cases/test_storage.rs
+++ b/tests/failpoints/cases/test_storage.rs
@@ -24,6 +24,7 @@ use kvproto::{
 };
 use resource_control::ResourceGroupManager;
 use test_raftstore::*;
+use test_raftstore_macro::test_case;
 use tikv::{
     config::{ConfigController, Module},
     storage::{
@@ -44,10 +45,11 @@ use tikv::{
 use tikv_util::{future::paired_future_callback, worker::dummy_scheduler, HandyRwLock};
 use txn_types::{Key, Mutation, TimeStamp};
 
-#[test]
+#[test_case(test_raftstore::new_server_cluster)]
+#[test_case(test_raftstore_v2::new_server_cluster)]
 fn test_scheduler_leader_change_twice() {
     let snapshot_fp = "scheduler_async_snapshot_finish";
-    let mut cluster = new_server_cluster(0, 2);
+    let mut cluster = new_cluster(0, 2);
     cluster.run();
     let region0 = cluster.get_region(b"");
     let peers = region0.get_peers();
@@ -108,10 +110,11 @@ fn test_scheduler_leader_change_twice() {
     }
 }
 
-#[test]
+#[test_case(test_raftstore::new_server_cluster)]
+#[test_case(test_raftstore_v2::new_server_cluster)]
 fn test_server_catching_api_error() {
     let raftkv_fp = "raftkv_early_error_report";
-    let mut cluster = new_server_cluster(0, 1);
+    let mut cluster = new_cluster(0, 1);
     cluster.run();
     let region = cluster.get_region(b"");
     let leader = region.get_peers()[0].clone();
@@ -168,10 +171,11 @@ fn test_server_catching_api_error() {
     must_get_equal(&cluster.get_engine(1), b"k3", b"v3");
 }
 
-#[test]
+#[test_case(test_raftstore::new_server_cluster)]
+#[test_case(test_raftstore_v2::new_server_cluster)]
 fn test_raftkv_early_error_report() {
     let raftkv_fp = "raftkv_early_error_report";
-    let mut cluster = new_server_cluster(0, 1);
+    let mut cluster = new_cluster(0, 1);
     cluster.run();
     cluster.must_split(&cluster.get_region(b"k0"), b"k1");
 
@@ -233,10 +237,12 @@ fn test_raftkv_early_error_report() {
     fail::remove(raftkv_fp);
 }
 
-#[test]
+// FIXME: #[test_case(test_raftstore_v2::new_server_cluster)]
+// Raftstore-v2 does not support getting the storage engine yet; it currently returns `None`.
+#[test_case(test_raftstore::new_server_cluster)] fn test_scale_scheduler_pool() { let snapshot_fp = "scheduler_start_execute"; - let mut cluster = new_server_cluster(0, 1); + let mut cluster = new_cluster(0, 1); cluster.run(); let origin_pool_size = cluster.cfg.storage.scheduler_worker_pool_size; @@ -332,9 +338,10 @@ fn test_scale_scheduler_pool() { fail::remove(snapshot_fp); } -#[test] +#[test_case(test_raftstore::new_server_cluster)] +#[test_case(test_raftstore_v2::new_server_cluster)] fn test_scheduler_pool_auto_switch_for_resource_ctl() { - let mut cluster = new_server_cluster(0, 1); + let mut cluster = new_cluster(0, 1); cluster.run(); let engine = cluster @@ -345,12 +352,12 @@ fn test_scheduler_pool_auto_switch_for_resource_ctl() { .get(&1) .unwrap() .clone(); - let resource_manager = ResourceGroupManager::default(); + let resource_manager = Arc::new(ResourceGroupManager::default()); let resource_ctl = resource_manager.derive_controller("test".to_string(), true); let storage = TestStorageBuilderApiV1::from_engine_and_lock_mgr(engine, MockLockManager::new()) .config(cluster.cfg.tikv.storage.clone()) - .build_for_resource_controller(resource_ctl) + .build_for_resource_controller(resource_manager.clone(), resource_ctl) .unwrap(); let region = cluster.get_region(b"k1"); @@ -1090,9 +1097,10 @@ fn test_async_apply_prewrite_impl( } } -#[test] +#[test_case(test_raftstore::new_server_cluster)] +#[test_case(test_raftstore_v2::new_server_cluster)] fn test_async_apply_prewrite() { - let mut cluster = new_server_cluster(0, 1); + let mut cluster = new_cluster(0, 1); cluster.run(); let engine = cluster @@ -1149,7 +1157,6 @@ fn test_async_apply_prewrite() { true, true, ); - test_async_apply_prewrite_impl( &storage, ctx.clone(), @@ -1188,9 +1195,10 @@ fn test_async_apply_prewrite() { ); } -#[test] +#[test_case(test_raftstore::new_server_cluster)] +#[test_case(test_raftstore_v2::new_server_cluster)] fn test_async_apply_prewrite_fallback() { - let mut cluster = new_server_cluster(0, 1); + let mut cluster = new_cluster(0, 1); cluster.run(); let engine = cluster @@ -1378,9 +1386,10 @@ fn test_async_apply_prewrite_1pc_impl( } } -#[test] +#[test_case(test_raftstore::new_server_cluster)] +#[test_case(test_raftstore_v2::new_server_cluster)] fn test_async_apply_prewrite_1pc() { - let mut cluster = new_server_cluster(0, 1); + let mut cluster = new_cluster(0, 1); cluster.run(); let engine = cluster @@ -1405,9 +1414,10 @@ fn test_async_apply_prewrite_1pc() { test_async_apply_prewrite_1pc_impl(&storage, ctx, b"key", b"value2", 20, true); } -#[test] +#[test_case(test_raftstore::new_server_cluster)] +#[test_case(test_raftstore_v2::new_server_cluster)] fn test_atomic_cas_lock_by_latch() { - let mut cluster = new_server_cluster(0, 1); + let mut cluster = new_cluster(0, 1); cluster.run(); let engine = cluster @@ -1493,9 +1503,10 @@ fn test_atomic_cas_lock_by_latch() { assert_eq!(b"v2".to_vec(), ret); } -#[test] +#[test_case(test_raftstore::new_server_cluster)] +#[test_case(test_raftstore_v2::new_server_cluster)] fn test_before_async_write_deadline() { - let mut cluster = new_server_cluster(0, 1); + let mut cluster = new_cluster(0, 1); cluster.run(); let engine = cluster @@ -1532,12 +1543,13 @@ fn test_before_async_write_deadline() { )); } -#[test] +#[test_case(test_raftstore::new_server_cluster)] +#[test_case(test_raftstore_v2::new_server_cluster)] fn test_deadline_exceeded_on_get_and_batch_get() { use tikv_util::time::Instant; use tracker::INVALID_TRACKER_TOKEN; - let mut cluster = new_server_cluster(0, 1); + let 
mut cluster = new_cluster(0, 1);
     cluster.run();
 
     let engine = cluster
@@ -1591,9 +1603,10 @@ fn test_deadline_exceeded_on_get_and_batch_get() {
     fail::remove("after-snapshot");
 }
 
-#[test]
+#[test_case(test_raftstore::new_server_cluster)]
+#[test_case(test_raftstore_v2::new_server_cluster)]
 fn test_before_propose_deadline() {
-    let mut cluster = new_server_cluster(0, 1);
+    let mut cluster = new_cluster(0, 1);
     cluster.run();
 
     let engine = cluster.sim.read().unwrap().storages[&1].clone();
@@ -1629,9 +1642,10 @@ fn test_before_propose_deadline() {
     );
 }
 
-#[test]
+#[test_case(test_raftstore::new_server_cluster)]
+#[test_case(test_raftstore_v2::new_server_cluster)]
 fn test_resolve_lock_deadline() {
-    let mut cluster = new_server_cluster(0, 1);
+    let mut cluster = new_cluster(0, 1);
     cluster.run();
 
     let engine = cluster.sim.read().unwrap().storages[&1].clone();
@@ -1789,10 +1803,11 @@ fn test_mvcc_concurrent_commit_and_rollback_at_shutdown() {
     assert_eq!(get_resp.value, v);
 }
 
-#[test]
+#[test_case(test_raftstore::new_server_cluster)]
+#[test_case(test_raftstore_v2::new_server_cluster)]
 fn test_raw_put_deadline() {
     let deadline_fp = "deadline_check_fail";
-    let mut cluster = new_server_cluster(0, 1);
+    let mut cluster = new_cluster(0, 1);
     cluster.run();
     let region = cluster.get_region(b"");
     let leader = region.get_peers()[0].clone();
diff --git a/tests/failpoints/cases/test_transaction.rs b/tests/failpoints/cases/test_transaction.rs
index 14f4161c7ae..0b6e6269e95 100644
--- a/tests/failpoints/cases/test_transaction.rs
+++ b/tests/failpoints/cases/test_transaction.rs
@@ -2,6 +2,7 @@
 
 use std::{
     sync::{
+        atomic::{AtomicBool, Ordering},
         mpsc::{channel, sync_channel},
         Arc, Mutex,
     },
@@ -9,13 +10,15 @@ use std::{
     time::Duration,
 };
 
-use futures::executor::block_on;
+use engine_traits::CF_DEFAULT;
+use futures::{executor::block_on, StreamExt};
 use grpcio::{ChannelBuilder, Environment};
 use kvproto::{
     kvrpcpb::{
         self as pb, AssertionLevel, Context, GetRequest, Op, PessimisticLockRequest,
         PrewriteRequest, PrewriteRequestPessimisticAction::*,
     },
+    raft_serverpb::RaftMessage,
     tikvpb::TikvClient,
 };
 use raft::prelude::{ConfChangeType, MessageType};
@@ -45,7 +48,9 @@ use tikv::{
         Snapshot, TestEngineBuilder, TestStorageBuilderApiV1,
     },
 };
+use tikv_kv::{Engine, Modify, WriteData, WriteEvent};
 use tikv_util::{
+    config::ReadableDuration,
     store::{new_peer, peer::new_incoming_voter},
     HandyRwLock,
 };
@@ -803,3 +808,97 @@ fn test_next_last_change_info_called_when_gc() {
 
     assert_eq!(h.join().unwrap().unwrap().as_slice(), b"v");
 }
+
+fn must_put<E: Engine>(ctx: &Context, engine: &E, key: &[u8], value: &[u8]) {
+    engine.put(ctx, Key::from_raw(key), value.to_vec()).unwrap();
+}
+
+fn must_delete<E: Engine>(ctx: &Context, engine: &E, key: &[u8]) {
+    engine.delete(ctx, Key::from_raw(key)).unwrap();
+}
+
+// Before the fix, a proposal could be proposed twice because write proposal
+// validation and proposing are not atomic. A raft message with a higher term
+// arriving between the two steps can push the proposal into the message
+// forwarding logic. However, raft proposal forwarding is not compatible with
+// the raft store: the failed proposal makes the client retry, and the retried
+// proposal coupled with the forwarded proposal gets applied twice.
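// The bug described above is a check-then-act race. A self-contained
// caricature of its shape (illustrative only; this is not raftstore code, and
// `Peer`, `propose`, and the term bookkeeping are invented for the sketch):
//
//     struct Peer {
//         term: u64,
//         is_leader: bool,
//     }
//
//     impl Peer {
//         fn propose(&mut self, cmd: &str) {
//             // Step 1: validate under the current term.
//             assert!(self.is_leader);
//             let checked_term = self.term;
//             // <-- a higher-term message may be handled here, in the same batch -->
//             // Step 2: propose. If the term moved, the proposal is forwarded
//             // to the new leader while the client also retries, so `cmd` can
//             // be applied twice unless forwarding is forbidden.
//             if self.term != checked_term {
//                 eprintln!("would forward stale proposal: {cmd}");
//             }
//         }
//     }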
+#[test]
+fn test_forbid_forward_propose() {
+    use test_raftstore_v2::*;
+    let count = 3;
+    let mut cluster = new_server_cluster(0, count);
+    cluster.cfg.raft_store.raft_base_tick_interval = ReadableDuration::millis(10);
+    cluster.cfg.raft_store.store_batch_system.pool_size = 2;
+    cluster.run();
+
+    let region = cluster.get_region(b"");
+    let peer1 = new_peer(1, 1);
+    let peer2 = new_peer(2, 2);
+    cluster.must_transfer_leader(region.id, peer2.clone());
+    let storage = cluster.sim.rl().storages[&1].clone();
+    let storage2 = cluster.sim.rl().storages[&2].clone();
+
+    let p = Arc::new(AtomicBool::new(false));
+    let p2 = p.clone();
+    let (tx, rx) = channel();
+    let tx = Mutex::new(tx);
+    cluster.add_recv_filter_on_node(
+        2,
+        Box::new(DropMessageFilter::new(Arc::new(move |_| {
+            if p2.load(Ordering::Relaxed) {
+                tx.lock().unwrap().send(()).unwrap();
+                // One msg is enough
+                p2.store(false, Ordering::Relaxed);
+                true
+            } else {
+                false
+            }
+        }))),
+    );
+
+    let k = Key::from_raw(b"k");
+    let mut ctx = Context::default();
+    ctx.set_region_id(region.get_id());
+    ctx.set_region_epoch(region.get_region_epoch().clone());
+    ctx.set_peer(peer2);
+
+    // Block the node when collecting messages so that the async write proposal
+    // and a raft message with a higher term end up in a single batch.
+    fail::cfg("on_peer_collect_message_2", "pause").unwrap();
+    let mut res = storage2.async_write(
+        &ctx,
+        WriteData::from_modifies(vec![Modify::Put(CF_DEFAULT, k.clone(), b"val".to_vec())]),
+        WriteEvent::EVENT_PROPOSED,
+        None,
+    );
+
+    // Make node 1 become leader
+    let router = cluster.get_router(1).unwrap();
+    let mut raft_msg = RaftMessage::default();
+    raft_msg.set_region_id(1);
+    raft_msg.set_to_peer(peer1.clone());
+    raft_msg.set_region_epoch(region.get_region_epoch().clone());
+    raft_msg
+        .mut_message()
+        .set_msg_type(MessageType::MsgTimeoutNow);
+    router.send_raft_message(Box::new(raft_msg)).unwrap();
+
+    std::thread::sleep(Duration::from_secs(1));
+
+    ctx.set_peer(peer1);
+    must_put(&ctx, &storage, b"k", b"val");
+    must_delete(&ctx, &storage, b"k");
+
+    p.store(true, Ordering::Release);
+    rx.recv().unwrap();
+    // Ensure the msg is sent by router.
+    std::thread::sleep(Duration::from_millis(100));
+    fail::remove("on_peer_collect_message_2");
+
+    let r = block_on(async { res.next().await }).unwrap();
+    assert!(matches!(r, WriteEvent::Finished(Err { .. })));
+
+    std::thread::sleep(Duration::from_secs(1));
+    assert_eq!(cluster.get(k.as_encoded()), None);
+}
diff --git a/tests/failpoints/cases/test_unsafe_recovery.rs b/tests/failpoints/cases/test_unsafe_recovery.rs
index cc33a01ff03..9e5a5dffcd9 100644
--- a/tests/failpoints/cases/test_unsafe_recovery.rs
+++ b/tests/failpoints/cases/test_unsafe_recovery.rs
@@ -440,3 +440,155 @@ fn test_unsafe_recovery_demotion_reentrancy() {
     assert_eq!(demoted, true);
     fail::remove("on_handle_apply_store_1");
 }
+
+#[test_case(test_raftstore::new_node_cluster)]
+fn test_unsafe_recovery_rollback_merge() {
+    let mut cluster = new_cluster(0, 3);
+    cluster.cfg.raft_store.raft_store_max_leader_lease = ReadableDuration::millis(40);
+    cluster.cfg.raft_store.merge_check_tick_interval = ReadableDuration::millis(20);
+    cluster.run();
+    let nodes = Vec::from_iter(cluster.get_node_ids());
+    assert_eq!(nodes.len(), 3);
+
+    let pd_client = Arc::clone(&cluster.pd_client);
+    pd_client.disable_default_operator();
+
+    for i in 0..10 {
+        cluster.must_put(format!("k{}", i).as_bytes(), b"v");
+    }
+
+    // Block the merge commit, but let the merge prepare go through.
+    fail::cfg("on_schedule_merge", "return()").unwrap();
+
+    let region = pd_client.get_region(b"k1").unwrap();
+    cluster.must_split(&region, b"k2");
+
+    let left = pd_client.get_region(b"k1").unwrap();
+    let right = pd_client.get_region(b"k2").unwrap();
+
+    // Makes the leadership definite.
+    let left_peer_2 = find_peer(&left, nodes[2]).unwrap().to_owned();
+    let right_peer_2 = find_peer(&right, nodes[2]).unwrap().to_owned();
+    cluster.must_transfer_leader(left.get_id(), left_peer_2);
+    cluster.must_transfer_leader(right.get_id(), right_peer_2);
+    cluster.must_try_merge(left.get_id(), right.get_id());
+
+    // Makes the group lose its quorum.
+    cluster.stop_node(nodes[1]);
+    cluster.stop_node(nodes[2]);
+    {
+        let put = new_put_cmd(b"k2", b"v2");
+        let req = new_request(
+            region.get_id(),
+            region.get_region_epoch().clone(),
+            vec![put],
+            true,
+        );
+        // The majority is lost, so commands can't be proposed successfully.
+        cluster
+            .call_command_on_leader(req, Duration::from_millis(10))
+            .unwrap_err();
+    }
+
+    cluster.must_enter_force_leader(left.get_id(), nodes[0], vec![nodes[1], nodes[2]]);
+    cluster.must_enter_force_leader(right.get_id(), nodes[0], vec![nodes[1], nodes[2]]);
+
+    // Construct recovery plan.
+    let mut plan = pdpb::RecoveryPlan::default();
+
+    let left_demote_peers: Vec<metapb::Peer> = left
+        .get_peers()
+        .iter()
+        .filter(|&peer| peer.get_store_id() != nodes[0])
+        .cloned()
+        .collect();
+    let mut left_demote = pdpb::DemoteFailedVoters::default();
+    left_demote.set_region_id(left.get_id());
+    left_demote.set_failed_voters(left_demote_peers.into());
+    let right_demote_peers: Vec<metapb::Peer> = right
+        .get_peers()
+        .iter()
+        .filter(|&peer| peer.get_store_id() != nodes[0])
+        .cloned()
+        .collect();
+    let mut right_demote = pdpb::DemoteFailedVoters::default();
+    right_demote.set_region_id(right.get_id());
+    right_demote.set_failed_voters(right_demote_peers.into());
+    plan.mut_demotes().push(left_demote);
+    plan.mut_demotes().push(right_demote);
+
+    // Triggers the unsafe recovery plan execution.
+    pd_client.must_set_unsafe_recovery_plan(nodes[0], plan.clone());
+    cluster.must_send_store_heartbeat(nodes[0]);
+
+    // Can't propose demotion as it's in merging mode
+    let mut store_report = None;
+    for _ in 0..20 {
+        store_report = pd_client.must_get_store_report(nodes[0]);
+        if store_report.is_some() {
+            break;
+        }
+        sleep_ms(100);
+    }
+    assert_ne!(store_report, None);
+    let has_force_leader = store_report
+        .unwrap()
+        .get_peer_reports()
+        .iter()
+        .any(|p| p.get_is_force_leader());
+    // Force leader is not exited due to demotion failure
+    assert!(has_force_leader);
+
+    fail::remove("on_schedule_merge");
+    fail::cfg("on_schedule_merge_ret_err", "return()").unwrap();
+
+    // Make sure merge check is scheduled, and rollback merge is triggered
+    sleep_ms(50);
+
+    // Re-triggers the unsafe recovery plan execution.
+    pd_client.must_set_unsafe_recovery_plan(nodes[0], plan);
+    cluster.must_send_store_heartbeat(nodes[0]);
+    let mut store_report = None;
+    for _ in 0..20 {
+        store_report = pd_client.must_get_store_report(nodes[0]);
+        if store_report.is_some() {
+            break;
+        }
+        sleep_ms(100);
+    }
+    assert_ne!(store_report, None);
+    // No force leader
+    for peer_report in store_report.unwrap().get_peer_reports() {
+        assert!(!peer_report.get_is_force_leader());
+    }
+
+    // Demotion is done
+    let mut demoted = false;
+    for _ in 0..10 {
+        let new_left = block_on(pd_client.get_region_by_id(left.get_id()))
+            .unwrap()
+            .unwrap();
+        let new_right = block_on(pd_client.get_region_by_id(right.get_id()))
+            .unwrap()
+            .unwrap();
+        assert_eq!(new_left.get_peers().len(), 3);
+        assert_eq!(new_right.get_peers().len(), 3);
+        demoted = new_left
+            .get_peers()
+            .iter()
+            .filter(|peer| peer.get_store_id() != nodes[0])
+            .all(|peer| peer.get_role() == metapb::PeerRole::Learner)
+            && new_right
+                .get_peers()
+                .iter()
+                .filter(|peer| peer.get_store_id() != nodes[0])
+                .all(|peer| peer.get_role() == metapb::PeerRole::Learner);
+        if demoted {
+            break;
+        }
+        sleep_ms(100);
+    }
+    assert_eq!(demoted, true);
+
+    fail::remove("on_schedule_merge_ret_err");
+}
diff --git a/tests/failpoints/cases/test_witness.rs b/tests/failpoints/cases/test_witness.rs
index 02411ba1b76..33a62f0532b 100644
--- a/tests/failpoints/cases/test_witness.rs
+++ b/tests/failpoints/cases/test_witness.rs
@@ -16,6 +16,7 @@ fn test_witness_update_region_in_local_reader() {
     cluster.run();
     let nodes = Vec::from_iter(cluster.get_node_ids());
     assert_eq!(nodes.len(), 3);
+    assert_eq!(nodes[2], 3);
 
     let pd_client = Arc::clone(&cluster.pd_client);
     pd_client.disable_default_operator();
@@ -64,6 +65,52 @@ fn test_witness_update_region_in_local_reader() {
     fail::remove("change_peer_after_update_region_store_3");
 }
 
+// This case is almost the same as `test_witness_update_region_in_local_reader`,
+// but it omits changing the peer to a witness, to ensure that `peer_is_witness`
+// is not returned as an error in a cluster without witnesses.
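// The new test below reduces to one property: a replica read served by a
// removed-but-not-yet-destroyed peer must fail, yet the failure must not carry
// the witness-specific error when the cluster has no witnesses. The assertion
// pattern, extracted as a sketch (the helper name is invented; the types and
// calls come from kvproto exactly as used in the test body):
//
//     fn assert_error_is_not_witness(resp: &kvproto::raft_cmdpb::RaftCmdResponse) {
//         // Fail, but not with `is_witness`: that error is reserved for
//         // genuine witness peers.
//         assert!(resp.get_header().has_error());
//         assert!(!resp.get_header().get_error().has_is_witness());
//     }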
+#[test]
+fn test_witness_not_reported_while_disabled() {
+    let mut cluster = new_server_cluster(0, 3);
+    cluster.run();
+    let nodes = Vec::from_iter(cluster.get_node_ids());
+    assert_eq!(nodes.len(), 3);
+    assert_eq!(nodes[2], 3);
+
+    let pd_client = Arc::clone(&cluster.pd_client);
+    pd_client.disable_default_operator();
+
+    let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap();
+    let peer_on_store1 = find_peer(&region, nodes[0]).unwrap().clone();
+    cluster.must_transfer_leader(region.get_id(), peer_on_store1);
+    let peer_on_store3 = find_peer(&region, nodes[2]).unwrap().clone();
+
+    cluster.must_put(b"k0", b"v0");
+
+    // update region but the peer is not destroyed yet
+    fail::cfg("change_peer_after_update_region_store_3", "pause").unwrap();
+
+    cluster
+        .pd_client
+        .must_remove_peer(region.get_id(), peer_on_store3.clone());
+
+    let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap();
+    let mut request = new_request(
+        region.get_id(),
+        region.get_region_epoch().clone(),
+        vec![new_get_cmd(b"k0")],
+        false,
+    );
+    request.mut_header().set_peer(peer_on_store3);
+    request.mut_header().set_replica_read(true);
+
+    let resp = cluster
+        .read(None, request.clone(), Duration::from_millis(100))
+        .unwrap();
+    assert!(resp.get_header().has_error());
+    assert!(!resp.get_header().get_error().has_is_witness());
+    fail::remove("change_peer_after_update_region_store_3");
+}
+
 // Test the case witness pull voter_replicated_index when has pending compact
 // cmd.
 #[test]
diff --git a/tests/integrations/backup/disk_snap.rs b/tests/integrations/backup/disk_snap.rs
new file mode 100644
index 00000000000..23a61a937e9
--- /dev/null
+++ b/tests/integrations/backup/disk_snap.rs
@@ -0,0 +1,206 @@
+// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0.
+
+use std::{collections::HashSet, time::Duration};
+
+use futures::executor::block_on;
+use kvproto::raft_cmdpb::{CmdType, PutRequest, RaftCmdRequest, Request};
+use raft::prelude::MessageType;
+use raftstore::store::Callback;
+use test_backup::disk_snap::{
+    assert_failure, assert_failure_because, assert_success, must_wait_apply_success, Suite,
+};
+use test_raftstore::{must_contains_error, Direction, RegionPacketFilter, Simulator};
+use test_util::eventually;
+use tikv_util::HandyRwLock;
+
+#[test]
+fn test_basic() {
+    let mut suite = Suite::new(1);
+    let mut call = suite.prepare_backup(1);
+    call.prepare(60);
+    let resp = suite.try_split(b"k");
+    debug!("Failed to split"; "err" => ?resp.response.get_header().get_error());
+    must_contains_error(&resp.response, "[Suspended] Preparing disk snapshot backup");
+}
+
+#[test]
+fn test_conf_change() {
+    let mut suite = Suite::new(4);
+    let the_region = suite.cluster.get_region(b"");
+    let last_peer = the_region.peers.last().unwrap();
+    let res = block_on(
+        suite
+            .cluster
+            .async_remove_peer(the_region.get_id(), last_peer.clone())
+            .unwrap(),
+    );
+    assert_success(&res);
+    eventually(Duration::from_millis(100), Duration::from_secs(2), || {
+        let r = suite.cluster.get_region(b"");
+        !r.peers.iter().any(|p| p.id == last_peer.id)
+    });
+    let mut calls = vec![];
+    for i in 1..=4 {
+        let mut call = suite.prepare_backup(i);
+        call.prepare(60);
+        calls.push(call);
+    }
+
+    // Make sure the change has been synchronized to all stores.
+ std::thread::sleep(Duration::from_millis(500)); + let the_region = suite.cluster.get_region(b""); + let res2 = block_on( + suite + .cluster + .async_remove_peer(the_region.get_id(), last_peer.clone()) + .unwrap(), + ); + assert_failure_because(&res2, "rejected by coprocessor"); + let last_peer = the_region.peers.last().unwrap(); + calls.into_iter().for_each(|c| assert!(c.send_finalize())); + let res3 = block_on( + suite + .cluster + .async_remove_peer(the_region.get_id(), last_peer.clone()) + .unwrap(), + ); + assert_success(&res3); + eventually(Duration::from_millis(100), Duration::from_secs(2), || { + let r = suite.cluster.get_region(b""); + !r.peers.iter().any(|p| p.id == last_peer.id) + }); +} + +#[test] +fn test_transfer_leader() { + let mut suite = Suite::new(3); + let mut calls = vec![]; + for i in 1..=3 { + let mut call = suite.prepare_backup(i); + call.prepare(60); + calls.push(call); + } + let region = suite.cluster.get_region(b""); + let leader = suite.cluster.leader_of_region(region.get_id()).unwrap(); + let new_leader = region.peers.iter().find(|r| r.id != leader.id).unwrap(); + let res = suite + .cluster + .try_transfer_leader(region.id, new_leader.clone()); + assert_failure_because(&res, "[Suspended] Preparing disk snapshot backup"); + calls.into_iter().for_each(|c| assert!(c.send_finalize())); + let res = suite + .cluster + .try_transfer_leader(region.id, new_leader.clone()); + assert_success(&res); +} + +#[test] +fn test_prepare_merge() { + let mut suite = Suite::new(1); + suite.split(b"k"); + let source = suite.cluster.get_region(b"a"); + let target = suite.cluster.get_region(b"z"); + assert_ne!(source.id, target.id); + let mut call = suite.prepare_backup(1); + call.prepare(60); + let resp = suite.cluster.try_merge(source.id, target.id); + assert_failure(&resp); +} + +#[test] +fn test_abort_last_one() { + let suite = Suite::new(1); + let mut call = suite.prepare_backup(1); + call.prepare(10); + let mut call2 = suite.prepare_backup(1); + call2.prepare(10); + let should_err = call.try_next(); + assert!(should_err.is_err(), "{:?}", should_err); + assert!(call2.send_finalize()); +} + +#[test] +fn test_wait_apply() { + let mut suite = Suite::new(3); + for key in 'a'..'k' { + suite.split(&[key as u8]); + } + let rc = suite.cluster.get_region(b"ca"); + suite.cluster.add_send_filter(|i| { + RegionPacketFilter::new(rc.id, i) + .msg_type(MessageType::MsgAppend) + .direction(Direction::Send) + }); + let (tx, rx) = std::sync::mpsc::channel::<()>(); + let mut ld_sid = None; + // Propose a simple write command to each region. 
+    for c in 'a'..'k' {
+        let region = suite.cluster.get_region(&[c as u8]);
+        let mut cmd = RaftCmdRequest::new();
+        let mut put = PutRequest::new();
+        put.set_key(vec![c as u8, b'a']);
+        put.set_value(b"meow?".to_vec());
+        let mut req = Request::new();
+        req.set_put(put);
+        req.set_cmd_type(CmdType::Put);
+        cmd.mut_requests().push(req);
+        cmd.mut_header().set_region_id(region.id);
+        cmd.mut_header()
+            .set_region_epoch(region.get_region_epoch().clone());
+        let ld = suite.cluster.leader_of_region(region.id).unwrap();
+        if let Some(lid) = ld_sid {
+            assert_eq!(
+                lid, ld.store_id,
+                "not all leaders are in the same store, this case cannot run"
+            );
+        }
+        ld_sid = Some(ld.store_id);
+        cmd.mut_header().set_peer(ld);
+        let r = suite.cluster.sim.rl();
+        r.async_command_on_node(
+            ld_sid.unwrap(),
+            cmd,
+            Callback::write_ext(
+                Box::new(|resp| assert_success(&resp.response)),
+                Some(Box::new({
+                    let tx = tx.clone();
+                    move || drop(tx)
+                })),
+                None,
+            ),
+        )
+        .unwrap();
+    }
+    let mut call = suite.prepare_backup(ld_sid.unwrap());
+    call.prepare(60);
+
+    drop(tx);
+    rx.recv_timeout(Duration::from_secs(5)).unwrap_err();
+
+    let v = ('a'..'k')
+        .map(|c| suite.cluster.get_region(&[c as u8]))
+        .collect::<Vec<_>>();
+    let mut regions_ok = v
+        .iter()
+        .map(|r| r.id)
+        .filter(|id| *id != rc.id)
+        .collect::<HashSet<_>>();
+    call.send_wait_apply(v);
+
+    // The regions without network isolation must succeed in waiting apply.
+    while !regions_ok.is_empty() {
+        let res = call.next();
+        let removed = regions_ok.remove(&must_wait_apply_success(&res));
+        let mut k = res.get_region().start_key.clone();
+        k.push(b'a');
+        let v = suite.cluster.must_get(&k);
+        // Since we have waited for it to be applied, this write must be observable.
+        assert_eq!(v.as_deref(), Some(b"meow?".as_slice()), "{res:?}");
+        assert!(removed, "{regions_ok:?} {res:?}");
+    }
+
+    suite.cluster.clear_send_filters();
+    // After the network partition is lifted, the isolated region must also report success.
+ let res = call.next(); + assert_eq!(must_wait_apply_success(&res), rc.id); +} diff --git a/tests/integrations/backup/mod.rs b/tests/integrations/backup/mod.rs index 4cfd4be07be..56074811772 100644 --- a/tests/integrations/backup/mod.rs +++ b/tests/integrations/backup/mod.rs @@ -17,6 +17,8 @@ use tikv::coprocessor::checksum_crc64_xor; use tikv_util::HandyRwLock; use txn_types::TimeStamp; +mod disk_snap; + fn assert_same_file_name(s1: String, s2: String) { let tokens1: Vec<&str> = s1.split('_').collect(); let tokens2: Vec<&str> = s2.split('_').collect(); diff --git a/tests/integrations/config/dynamic/pessimistic_txn.rs b/tests/integrations/config/dynamic/pessimistic_txn.rs index 7af5455a199..dc88bbd93a3 100644 --- a/tests/integrations/config/dynamic/pessimistic_txn.rs +++ b/tests/integrations/config/dynamic/pessimistic_txn.rs @@ -9,11 +9,7 @@ use security::SecurityManager; use test_pd_client::TestPdClient; use tikv::{ config::*, - server::{ - lock_manager::*, - resolve::{Callback, StoreAddrResolver}, - Error, Result, - }, + server::{lock_manager::*, resolve}, }; use tikv_util::config::ReadableDuration; @@ -27,14 +23,6 @@ fn test_config_validate() { invalid_cfg.validate().unwrap_err(); } -#[derive(Clone)] -struct MockResolver; -impl StoreAddrResolver for MockResolver { - fn resolve(&self, _store_id: u64, _cb: Callback) -> Result<()> { - Err(Error::Other(box_err!("unimplemented"))) - } -} - fn setup( cfg: TikvConfig, ) -> ( @@ -50,7 +38,7 @@ fn setup( .start( 1, pd_client, - MockResolver, + resolve::MockStoreAddrResolver::default(), security_mgr, &cfg.pessimistic_txn, ) diff --git a/tests/integrations/config/dynamic/raftstore.rs b/tests/integrations/config/dynamic/raftstore.rs index 4d6551ea27c..0489103018f 100644 --- a/tests/integrations/config/dynamic/raftstore.rs +++ b/tests/integrations/config/dynamic/raftstore.rs @@ -15,7 +15,7 @@ use raftstore::{ store::{ config::{Config, RaftstoreConfigManager}, fsm::{StoreMeta, *}, - AutoSplitController, SnapManager, StoreMsg, Transport, + AutoSplitController, DiskCheckRunner, SnapManager, StoreMsg, Transport, }, Result, }; @@ -113,6 +113,7 @@ fn start_raftstore( CollectorRegHandle::new_for_test(), None, None, + DiskCheckRunner::dummy(), GrpcServiceManager::dummy(), Arc::new(AtomicU64::new(0)), ) diff --git a/tests/integrations/config/dynamic/snap.rs b/tests/integrations/config/dynamic/snap.rs index bb91d0d62eb..3cb7f6e43a3 100644 --- a/tests/integrations/config/dynamic/snap.rs +++ b/tests/integrations/config/dynamic/snap.rs @@ -7,6 +7,7 @@ use std::{ use engine_rocks::RocksEngine; use grpcio::{EnvBuilder, ResourceQuota}; +use online_config::ConfigManager; use raft_log_engine::RaftLogEngine; use raftstore::store::{fsm::create_raft_batch_system, SnapManager}; use security::SecurityManager; @@ -24,6 +25,14 @@ use tikv_util::{ worker::{LazyWorker, Scheduler, Worker}, }; +struct MockCfgManager; + +impl ConfigManager for MockCfgManager { + fn dispatch(&mut self, _: online_config::ConfigChange) -> online_config::Result<()> { + Ok(()) + } +} + fn start_server( cfg: TikvConfig, dir: &TempDir, @@ -57,6 +66,7 @@ fn start_server( snap_worker_scheduler, server_config.clone(), ResourceQuota::new(None), + Box::new(MockCfgManager), )), ); let snap_runner = SnapHandler::new( diff --git a/tests/integrations/config/mod.rs b/tests/integrations/config/mod.rs index a65d4cfb46c..f2a47252589 100644 --- a/tests/integrations/config/mod.rs +++ b/tests/integrations/config/mod.rs @@ -65,7 +65,6 @@ fn read_file_in_project_dir(path: &str) -> String { #[test] fn 
test_serde_custom_tikv_config() { let mut value = TikvConfig::default(); - value.log_rotation_timespan = ReadableDuration::days(1); value.log.level = Level::Critical.into(); value.log.file.filename = "foo".to_owned(); value.log.format = LogFormat::Json; @@ -77,6 +76,8 @@ fn test_serde_custom_tikv_config() { value.abort_on_panic = true; value.memory_usage_limit = Some(ReadableSize::gb(10)); value.memory_usage_high_water = 0.65; + value.memory.enable_heap_profiling = false; + value.memory.profiling_sample_per_bytes = ReadableSize::mb(1); value.server = ServerConfig { cluster_id: 0, // KEEP IT ZERO, it is skipped by serde. addr: "example.com:443".to_owned(), @@ -103,9 +104,6 @@ fn test_serde_custom_tikv_config() { grpc_stream_initial_window_size: ReadableSize(12_345), grpc_keepalive_time: ReadableDuration::secs(3), grpc_keepalive_timeout: ReadableDuration::secs(60), - end_point_concurrency: None, - end_point_max_tasks: None, - end_point_stack_size: None, end_point_recursion_limit: 100, end_point_stream_channel_size: 16, end_point_batch_row_limit: 64, @@ -125,6 +123,7 @@ fn test_serde_custom_tikv_config() { forward_max_connections_per_address: 5, reject_messages_on_memory_ratio: 0.8, simplify_metrics: false, + ..Default::default() }; value.readpool = ReadPoolConfig { unified: UnifiedReadPoolConfig { @@ -191,16 +190,14 @@ fn test_serde_custom_tikv_config() { raft_engine_purge_interval: ReadableDuration::minutes(20), max_manual_flush_rate: 5.0, raft_entry_cache_life_time: ReadableDuration::secs(12), - raft_reject_transfer_leader_duration: ReadableDuration::secs(3), split_region_check_tick_interval: ReadableDuration::secs(12), region_split_check_diff: Some(ReadableSize::mb(20)), region_compact_check_interval: ReadableDuration::secs(12), - clean_stale_peer_delay: ReadableDuration::secs(0), region_compact_check_step: Some(1_234), region_compact_min_tombstones: 999, region_compact_tombstones_percent: 33, region_compact_min_redundant_rows: 999, - region_compact_redundant_rows_percent: 33, + region_compact_redundant_rows_percent: Some(33), pd_heartbeat_tick_interval: ReadableDuration::minutes(12), pd_store_heartbeat_tick_interval: ReadableDuration::secs(12), notify_capacity: 12_345, @@ -212,6 +209,7 @@ fn test_serde_custom_tikv_config() { max_leader_missing_duration: ReadableDuration::hours(12), abnormal_leader_missing_duration: ReadableDuration::hours(6), peer_stale_state_check_interval: ReadableDuration::hours(2), + gc_peer_check_interval: ReadableDuration::days(1), leader_transfer_max_log_lag: 123, snap_apply_batch_size: ReadableSize::mb(12), snap_apply_copy_symlink: true, @@ -230,8 +228,6 @@ fn test_serde_custom_tikv_config() { use_delete_range: true, snap_generator_pool_size: 2, cleanup_import_sst_interval: ReadableDuration::minutes(12), - region_max_size: ReadableSize(0), - region_split_size: ReadableSize(0), local_read_batch_size: 33, apply_batch_system, store_batch_system, @@ -251,8 +247,8 @@ fn test_serde_custom_tikv_config() { io_reschedule_concurrent_max_count: 1234, io_reschedule_hotpot_duration: ReadableDuration::secs(4321), inspect_interval: ReadableDuration::millis(444), + inspect_cpu_util_thd: 0.666, report_min_resolved_ts_interval: ReadableDuration::millis(233), - raft_msg_flush_interval: ReadableDuration::micros(250), check_leader_lease_interval: ReadableDuration::millis(123), renew_leader_lease_advance_duration: ReadableDuration::millis(456), reactive_memory_lock_tick_interval: ReadableDuration::millis(566), @@ -269,6 +265,7 @@ fn test_serde_custom_tikv_config() { 
slow_trend_unsensitive_result: 0.5, enable_v2_compatible_learner: false, unsafe_disable_check_quorum: false, + ..Default::default() }; value.pd = PdConfig::new(vec!["example.com:443".to_owned()]); let titan_cf_config = TitanCfConfig { @@ -315,7 +312,6 @@ fn test_serde_custom_tikv_config() { rate_bytes_per_sec: ReadableSize::kb(1), rate_limiter_refill_period: ReadableDuration::millis(10), rate_limiter_mode: DBRateLimiterMode::AllIo, - auto_tuned: None, rate_limiter_auto_tuned: false, bytes_per_sync: ReadableSize::mb(1), wal_bytes_per_sync: ReadableSize::kb(32), @@ -388,6 +384,7 @@ fn test_serde_custom_tikv_config() { max_compactions: Some(3), ttl: Some(ReadableDuration::days(10)), periodic_compaction_seconds: Some(ReadableDuration::days(10)), + write_buffer_limit: None, }, writecf: WriteCfConfig { block_size: ReadableSize::kb(12), @@ -461,6 +458,7 @@ fn test_serde_custom_tikv_config() { max_compactions: Some(3), ttl: Some(ReadableDuration::days(10)), periodic_compaction_seconds: Some(ReadableDuration::days(10)), + write_buffer_limit: None, }, lockcf: LockCfConfig { block_size: ReadableSize::kb(12), @@ -534,6 +532,7 @@ fn test_serde_custom_tikv_config() { max_compactions: Some(3), ttl: Some(ReadableDuration::days(10)), periodic_compaction_seconds: Some(ReadableDuration::days(10)), + write_buffer_limit: Some(ReadableSize::mb(16)), }, raftcf: RaftCfConfig { block_size: ReadableSize::kb(12), @@ -607,8 +606,10 @@ fn test_serde_custom_tikv_config() { max_compactions: Some(3), ttl: Some(ReadableDuration::days(10)), periodic_compaction_seconds: Some(ReadableDuration::days(10)), + write_buffer_limit: None, }, titan: titan_db_config.clone(), + ..Default::default() }; value.raftdb = RaftDbConfig { info_log_level: LogLevel::Info, @@ -695,6 +696,7 @@ fn test_serde_custom_tikv_config() { max_compactions: Some(3), ttl: None, periodic_compaction_seconds: None, + write_buffer_limit: None, }, titan: titan_db_config, }; @@ -755,6 +757,7 @@ fn test_serde_custom_tikv_config() { other_priority: IoPriority::Low, }, background_error_recovery_window: ReadableDuration::hours(1), + txn_status_cache_capacity: 1000, }; value.coprocessor = CopConfig { split_region_on_table: false, @@ -828,6 +831,7 @@ fn test_serde_custom_tikv_config() { max_write_bytes_per_sec: ReadableSize::mb(10), enable_compaction_filter: false, compaction_filter_skip_version_check: true, + num_threads: 2, }; value.pessimistic_txn = PessimisticTxnConfig { wait_for_lock_timeout: ReadableDuration::millis(10), @@ -837,20 +841,23 @@ fn test_serde_custom_tikv_config() { }; value.cdc = CdcConfig { min_ts_interval: ReadableDuration::secs(4), - old_value_cache_size: 0, hibernate_regions_compatible: false, incremental_scan_threads: 3, incremental_scan_concurrency: 4, incremental_scan_speed_limit: ReadableSize(7), + incremental_fetch_speed_limit: ReadableSize(8), incremental_scan_ts_filter_ratio: 0.7, tso_worker_threads: 2, old_value_cache_memory_quota: ReadableSize::mb(14), sink_memory_quota: ReadableSize::mb(7), + ..Default::default() }; value.resolved_ts = ResolvedTsConfig { enable: true, advance_ts_interval: ReadableDuration::secs(5), scan_lock_pool_size: 1, + memory_quota: ReadableSize::mb(1), + incremental_scan_concurrency: 7, }; value.causal_ts = CausalTsConfig { renew_interval: ReadableDuration::millis(100), @@ -858,10 +865,14 @@ fn test_serde_custom_tikv_config() { renew_batch_max_size: 8192, alloc_ahead_buffer: ReadableDuration::millis(3000), }; + value + .split + .optimize_for(value.coprocessor.region_max_size()); value.resource_control = 
ResourceControlConfig { enabled: false }; let custom = read_file_in_project_dir("integrations/config/test-custom.toml"); - let load = toml::from_str(&custom).unwrap(); + let mut load: TikvConfig = toml::from_str(&custom).unwrap(); + load.split.optimize_for(load.coprocessor.region_max_size()); assert_eq_debug(&value, &load); let dump = toml::to_string_pretty(&load).unwrap(); diff --git a/tests/integrations/config/test-cache-compatible.toml b/tests/integrations/config/test-cache-compatible.toml index 9fce88833ed..f91b5cdafc3 100644 --- a/tests/integrations/config/test-cache-compatible.toml +++ b/tests/integrations/config/test-cache-compatible.toml @@ -2,6 +2,8 @@ [log.file] +[memory] + [readpool.coprocessor] [readpool.storage] diff --git a/tests/integrations/config/test-custom.toml b/tests/integrations/config/test-custom.toml index 053e7c45939..54ed8216ec9 100644 --- a/tests/integrations/config/test-custom.toml +++ b/tests/integrations/config/test-custom.toml @@ -1,13 +1,9 @@ -log-level = "info" -log-file = "" -log-format = "text" slow-log-file = "slow_foo" slow-log-threshold = "1s" -log-rotation-timespan = "1d" panic-when-unexpected-key-or-data = true abort-on-panic = true memory-usage-limit = "10GB" -memory-usage-high-water= 0.65 +memory-usage-high-water = 0.65 [log] level = "fatal" @@ -19,6 +15,10 @@ max-size = 1 max-backups = 2 max-days = 3 +[memory] +enable-heap-profiling = false +profiling-sample-per-bytes = "1MB" + [readpool.unified] min-thread-count = 5 max-thread-count = 10 @@ -101,6 +101,7 @@ reserve-space = "10GB" reserve-raft-space = "2GB" enable-ttl = true ttl-check-poll-interval = "0s" +txn-status-cache-capacity = 1000 [storage.block-cache] capacity = "40GB" @@ -133,9 +134,7 @@ export-priority = "high" other-priority = "low" [pd] -endpoints = [ - "example.com:443", -] +endpoints = ["example.com:443"] [metric] job = "tikv_1" @@ -183,6 +182,7 @@ max-peer-down-duration = "12m" max-leader-missing-duration = "12h" abnormal-leader-missing-duration = "6h" peer-stale-state-check-interval = "2h" +gc-peer-check-interval = "1d" leader-transfer-max-log-lag = 123 snap-apply-batch-size = "12MB" snap-apply-copy-symlink = true @@ -220,6 +220,7 @@ waterfall-metrics = true io-reschedule-concurrent-max-count = 1234 io-reschedule-hotpot-duration = "4321s" inspect-interval = "444ms" +inspect-cpu-util-thd = 0.666 check-leader-lease-interval = "123ms" renew-leader-lease-advance-duration = "456ms" reactive-memory-lock-tick-interval = "566ms" @@ -300,15 +301,7 @@ bloom-filter-bits-per-key = 123 block-based-bloom-filter = true ribbon-filter-above-level = 1 read-amp-bytes-per-bit = 0 -compression-per-level = [ - "no", - "no", - "zstd", - "zstd", - "no", - "zstd", - "lz4", -] +compression-per-level = ["no", "no", "zstd", "zstd", "no", "zstd", "lz4"] bottommost-level-compression = "disable" bottommost-zstd-compression-dict-size = 1024 bottommost-zstd-compression-sample-size = 1024 @@ -371,15 +364,7 @@ bloom-filter-bits-per-key = 123 block-based-bloom-filter = true ribbon-filter-above-level = 1 read-amp-bytes-per-bit = 0 -compression-per-level = [ - "no", - "no", - "zstd", - "zstd", - "no", - "zstd", - "lz4", -] +compression-per-level = ["no", "no", "zstd", "zstd", "no", "zstd", "lz4"] write-buffer-size = "1MB" max-write-buffer-number = 12 min-write-buffer-number-to-merge = 12 @@ -425,16 +410,9 @@ bloom-filter-bits-per-key = 123 block-based-bloom-filter = true ribbon-filter-above-level = 1 read-amp-bytes-per-bit = 0 -compression-per-level = [ - "no", - "no", - "zstd", - "zstd", - "no", - "zstd", - "lz4", -] 
+compression-per-level = ["no", "no", "zstd", "zstd", "no", "zstd", "lz4"] write-buffer-size = "1MB" +write-buffer-limit = "16MB" max-write-buffer-number = 12 min-write-buffer-number-to-merge = 12 max-bytes-for-level-base = "12KB" @@ -479,15 +457,7 @@ bloom-filter-bits-per-key = 123 block-based-bloom-filter = true ribbon-filter-above-level = 1 read-amp-bytes-per-bit = 0 -compression-per-level = [ - "no", - "no", - "zstd", - "zstd", - "no", - "zstd", - "lz4", -] +compression-per-level = ["no", "no", "zstd", "zstd", "no", "zstd", "lz4"] write-buffer-size = "1MB" max-write-buffer-number = 12 min-write-buffer-number-to-merge = 12 @@ -565,15 +535,7 @@ bloom-filter-bits-per-key = 123 block-based-bloom-filter = true ribbon-filter-above-level = 1 read-amp-bytes-per-bit = 0 -compression-per-level = [ - "no", - "no", - "zstd", - "zstd", - "no", - "zstd", - "lz4", -] +compression-per-level = ["no", "no", "zstd", "zstd", "no", "zstd", "lz4"] write-buffer-size = "1MB" max-write-buffer-number = 12 min-write-buffer-number-to-merge = 12 @@ -633,9 +595,7 @@ ca-path = "invalid path" cert-path = "invalid path" key-path = "invalid path" redact-info-log = true -cert-allowed-cn = [ - "example.tikv.com", -] +cert-allowed-cn = ["example.tikv.com"] [security.encryption] data-encryption-method = "aes128-ctr" @@ -681,11 +641,12 @@ batch-keys = 256 max-write-bytes-per-sec = "10MB" enable-compaction-filter = false compaction-filter-skip-version-check = true +num-threads = 2 [pessimistic-txn] -enabled = false # test backward compatibility +enabled = false # test backward compatibility wait-for-lock-timeout = "10ms" -wake-up-delay-duration = 100 # test backward compatibility +wake-up-delay-duration = 100 # test backward compatibility pipelined = false in-memory = false @@ -696,6 +657,7 @@ hibernate-regions-compatible = false incremental-scan-threads = 3 incremental-scan-concurrency = 4 incremental-scan-speed-limit = 7 +incremental-fetch-speed-limit = 8 incremental-scan-ts-filter-ratio = 0.7 tso-worker-threads = 2 old-value-cache-memory-quota = "14MB" @@ -705,6 +667,8 @@ sink-memory-quota = "7MB" enable = true advance-ts-interval = "5s" scan-lock-pool-size = 1 +memory-quota = "1MB" +incremental-scan-concurrency = 7 [split] detect-times = 10 diff --git a/tests/integrations/config/test-default.toml b/tests/integrations/config/test-default.toml index 23e53b9daf3..ca1abc0081b 100644 --- a/tests/integrations/config/test-default.toml +++ b/tests/integrations/config/test-default.toml @@ -2,6 +2,8 @@ [log.file] +[memory] + [readpool.unified] [readpool.storage] diff --git a/tests/integrations/coprocessor/test_select.rs b/tests/integrations/coprocessor/test_select.rs index 9af28b6e3d6..43ef627a2ee 100644 --- a/tests/integrations/coprocessor/test_select.rs +++ b/tests/integrations/coprocessor/test_select.rs @@ -2086,11 +2086,16 @@ fn test_select_v2_format_with_checksum() { for extra_checksum in [None, Some(132423)] { // The row value encoded with checksum bytes should have no impact on cop task // processing and related result chunk filling. 
- let (_, endpoint) = + let (mut store, endpoint) = init_data_with_commit_v2_checksum(&product, &data, true, extra_checksum); + store.insert_all_null_row(&product, Context::default(), true, extra_checksum); let req = DagSelect::from(&product).build(); let mut resp = handle_select(&endpoint, req); - let spliter = DagChunkSpliter::new(resp.take_chunks().into(), 3); + let mut spliter = DagChunkSpliter::new(resp.take_chunks().into(), 3); + let first_row = spliter.next().unwrap(); + assert_eq!(first_row[0], Datum::I64(0)); + assert_eq!(first_row[1], Datum::Null); + assert_eq!(first_row[2], Datum::Null); for (row, (id, name, cnt)) in spliter.zip(data.clone()) { let name_datum = name.map(|s| s.as_bytes()).into(); let expected_encoded = datum::encode_value( diff --git a/tests/integrations/import/mod.rs b/tests/integrations/import/mod.rs index 96e2c655e18..4de0fa26472 100644 --- a/tests/integrations/import/mod.rs +++ b/tests/integrations/import/mod.rs @@ -1,4 +1,5 @@ // Copyright 2018 TiKV Project Authors. Licensed under Apache-2.0. +mod test_apply_log; mod test_sst_service; mod util; diff --git a/tests/integrations/import/test_apply_log.rs b/tests/integrations/import/test_apply_log.rs new file mode 100644 index 00000000000..90d43c4dd71 --- /dev/null +++ b/tests/integrations/import/test_apply_log.rs @@ -0,0 +1,101 @@ +use engine_traits::CF_DEFAULT; +use external_storage_export::LocalStorage; +use kvproto::import_sstpb::ApplyRequest; +use tempfile::TempDir; +use tikv_util::sys::disk::{self, DiskUsage}; + +use crate::import::util; + +#[test] +fn test_basic_apply() { + let (_cluster, ctx, tikv, import) = util::new_cluster_and_tikv_import_client(); + let tmp = TempDir::new().unwrap(); + let storage = LocalStorage::new(tmp.path()).unwrap(); + let default = [ + (b"k1", b"v1", 1), + (b"k2", b"v2", 2), + (b"k3", b"v3", 3), + (b"k4", b"v4", 4), + ]; + let default_rewritten = [(b"r1", b"v1", 1), (b"r2", b"v2", 2), (b"r3", b"v3", 3)]; + let mut sst_meta = util::make_plain_file(&storage, "file1.log", default.into_iter()); + util::register_range_for(&mut sst_meta, b"k1", b"k3a"); + let mut req = ApplyRequest::new(); + req.set_context(ctx.clone()); + req.set_rewrite_rules(vec![util::rewrite_for(&mut sst_meta, b"k", b"r")].into()); + req.set_metas(vec![sst_meta].into()); + req.set_storage_backend(util::local_storage(&tmp)); + import.apply(&req).unwrap(); + util::check_applied_kvs_cf(&tikv, &ctx, CF_DEFAULT, default_rewritten.into_iter()); +} + +#[test] +fn test_apply_full_disk() { + let (_cluster, ctx, _tikv, import) = util::new_cluster_and_tikv_import_client(); + let tmp = TempDir::new().unwrap(); + let storage = LocalStorage::new(tmp.path()).unwrap(); + let default = [ + (b"k1", b"v1", 1), + (b"k2", b"v2", 2), + (b"k3", b"v3", 3), + (b"k4", b"v4", 4), + ]; + let mut sst_meta = util::make_plain_file(&storage, "file1.log", default.into_iter()); + util::register_range_for(&mut sst_meta, b"k1", b"k3a"); + let mut req = ApplyRequest::new(); + req.set_context(ctx); + req.set_rewrite_rules(vec![util::rewrite_for(&mut sst_meta, b"k", b"r")].into()); + req.set_metas(vec![sst_meta].into()); + req.set_storage_backend(util::local_storage(&tmp)); + disk::set_disk_status(DiskUsage::AlmostFull); + let result = import.apply(&req).unwrap(); + assert!(result.has_error()); + assert_eq!( + result.get_error().get_message(), + "TiKV disk space is not enough." 
+ ); + disk::set_disk_status(DiskUsage::Normal); +} + +#[test] +fn test_apply_twice() { + let (_cluster, ctx, tikv, import) = util::new_cluster_and_tikv_import_client(); + let tmp = TempDir::new().unwrap(); + let storage = LocalStorage::new(tmp.path()).unwrap(); + let default = [( + b"k1", + b"In this case, we are going to test write twice, but with different rewrite rule.", + 1, + )]; + let default_fst = [( + b"r1", + b"In this case, we are going to test write twice, but with different rewrite rule.", + 1, + )]; + let default_snd = [( + b"z1", + b"In this case, we are going to test write twice, but with different rewrite rule.", + 1, + )]; + + let mut sst_meta = util::make_plain_file(&storage, "file2.log", default.into_iter()); + util::register_range_for(&mut sst_meta, b"k1", b"k1a"); + let mut req = ApplyRequest::new(); + req.set_context(ctx.clone()); + req.set_rewrite_rules(vec![util::rewrite_for(&mut sst_meta, b"k", b"r")].into()); + req.set_metas(vec![sst_meta.clone()].into()); + req.set_storage_backend(util::local_storage(&tmp)); + import.apply(&req).unwrap(); + util::check_applied_kvs_cf(&tikv, &ctx, CF_DEFAULT, default_fst.into_iter()); + + util::register_range_for(&mut sst_meta, b"k1", b"k1a"); + req.set_rewrite_rules(vec![util::rewrite_for(&mut sst_meta, b"k", b"z")].into()); + req.set_metas(vec![sst_meta].into()); + import.apply(&req).unwrap(); + util::check_applied_kvs_cf( + &tikv, + &ctx, + CF_DEFAULT, + default_fst.into_iter().chain(default_snd.into_iter()), + ); +} diff --git a/tests/integrations/import/test_sst_service.rs b/tests/integrations/import/test_sst_service.rs index 22ab9c7d7fe..1ed4b116937 100644 --- a/tests/integrations/import/test_sst_service.rs +++ b/tests/integrations/import/test_sst_service.rs @@ -555,3 +555,109 @@ fn test_duplicate_and_close() { req.set_mode(SwitchMode::Normal); import.switch_mode(&req).unwrap(); } + +#[test] +fn test_suspend_import() { + let (_cluster, ctx, tikv, import) = new_cluster_and_tikv_import_client(); + let sst_range = (0, 10); + let write = |sst_range: (u8, u8)| { + let mut meta = new_sst_meta(0, 0); + meta.set_region_id(ctx.get_region_id()); + meta.set_region_epoch(ctx.get_region_epoch().clone()); + + let mut keys = vec![]; + let mut values = vec![]; + for i in sst_range.0..sst_range.1 { + keys.push(vec![i]); + values.push(vec![i]); + } + send_write_sst(&import, &meta, keys, values, 1) + }; + let ingest = |sst_meta: &SstMeta| { + let mut ingest = IngestRequest::default(); + ingest.set_context(ctx.clone()); + ingest.set_sst(sst_meta.clone()); + import.ingest(&ingest) + }; + let multi_ingest = |sst_metas: &[SstMeta]| { + let mut multi_ingest = MultiIngestRequest::default(); + multi_ingest.set_context(ctx.clone()); + multi_ingest.set_ssts(sst_metas.to_vec().into()); + import.multi_ingest(&multi_ingest) + }; + let suspendctl = |for_time| { + let mut req = SuspendImportRpcRequest::default(); + req.set_caller("test_suspend_import".to_owned()); + if for_time == 0 { + req.set_should_suspend_imports(false); + } else { + req.set_should_suspend_imports(true); + req.set_duration_in_secs(for_time); + } + req + }; + + let write_res = write(sst_range).unwrap(); + assert_eq!(write_res.metas.len(), 1); + let sst = write_res.metas[0].clone(); + + assert!( + !import + .suspend_import_rpc(&suspendctl(6000)) + .unwrap() + .already_suspended + ); + let write_res = write(sst_range); + write_res.unwrap(); + let ingest_res = ingest(&sst).unwrap(); + assert!( + ingest_res.get_error().has_server_is_busy(), + "{:?}", + ingest_res + ); + let multi_ingest_res 
= multi_ingest(&[sst.clone()]).unwrap();
+    assert!(
+        multi_ingest_res.get_error().has_server_is_busy(),
+        "{:?}",
+        multi_ingest_res
+    );
+
+    assert!(
+        import
+            .suspend_import_rpc(&suspendctl(0))
+            .unwrap()
+            .already_suspended
+    );
+
+    let ingest_res = ingest(&sst);
+    assert!(ingest_res.is_ok(), "{:?} => {:?}", sst, ingest_res);
+
+    check_ingested_txn_kvs(&tikv, &ctx, sst_range, 2);
+
+    // test timeout.
+    assert!(
+        !import
+            .suspend_import_rpc(&suspendctl(1))
+            .unwrap()
+            .already_suspended
+    );
+    let sst_range = (10, 20);
+    let write_res = write(sst_range);
+    let sst = write_res.unwrap().metas;
+    let res = multi_ingest(&sst);
+    assert!(
+        res.as_ref().unwrap().get_error().has_server_is_busy(),
+        "{:?}",
+        res
+    );
+    std::thread::sleep(Duration::from_secs(1));
+    multi_ingest(&sst).unwrap();
+
+    // Check that an insanely large duration is rejected.
+    import
+        .suspend_import_rpc(&suspendctl(u64::MAX - 42))
+        .unwrap_err();
+    let sst_range = (20, 30);
+    let ssts = write(sst_range).unwrap();
+    multi_ingest(ssts.get_metas()).unwrap();
+}
diff --git a/tests/integrations/import/util.rs b/tests/integrations/import/util.rs
index cc5d22d517d..4f747feeb19 100644
--- a/tests/integrations/import/util.rs
+++ b/tests/integrations/import/util.rs
@@ -1,16 +1,31 @@
 // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0.
 
-use std::{sync::Arc, thread, time::Duration};
-
+use std::{
+    io::{Cursor, Write},
+    sync::Arc,
+    thread,
+    time::Duration,
+};
+
+use collections::HashMap;
 use engine_rocks::RocksEngine;
-use futures::{executor::block_on, stream, SinkExt};
+use engine_traits::CF_DEFAULT;
+use external_storage_export::{ExternalStorage, UnpinReader};
+use futures::{executor::block_on, io::Cursor as AsyncCursor, stream, SinkExt};
 use grpcio::{ChannelBuilder, Environment, Result, WriteFlags};
-use kvproto::{import_sstpb::*, kvrpcpb::*, tikvpb::*};
+use kvproto::{
+    brpb::{Local, StorageBackend},
+    import_sstpb::{KvMeta, *},
+    kvrpcpb::*,
+    tikvpb::*,
+};
 use security::SecurityConfig;
+use tempfile::TempDir;
 use test_raftstore::*;
 use test_raftstore_v2::{Cluster as ClusterV2, ServerCluster as ServerClusterV2};
 use tikv::config::TikvConfig;
-use tikv_util::HandyRwLock;
+use tikv_util::{codec::stream_event::EventEncoder, stream::block_on_external_io, HandyRwLock};
+use txn_types::Key;
 use uuid::Uuid;
 
 const CLEANUP_SST_MILLIS: u64 = 10;
@@ -18,10 +33,7 @@ const CLEANUP_SST_MILLIS: u64 = 10;
 pub fn new_cluster(cfg: TikvConfig) -> (Cluster, Context) {
     let count = 1;
     let mut cluster = new_server_cluster(0, count);
-    cluster.cfg = Config {
-        tikv: cfg,
-        prefer_mem: true,
-    };
+    cluster.set_cfg(cfg);
     cluster.run();
 
     let region_id = 1;
@@ -43,10 +55,7 @@ pub fn new_cluster_v2(
 ) {
     let count = 1;
     let mut cluster = test_raftstore_v2::new_server_cluster(0, count);
-    cluster.cfg = Config {
-        tikv: cfg,
-        prefer_mem: true,
-    };
+    cluster.set_cfg(cfg);
     cluster.run();
 
     let region_id = 1;
@@ -246,6 +255,40 @@ pub fn check_ingested_kvs_cf(tikv: &TikvClient, ctx: &Context, cf: &str, sst_ran
     }
 }
 
+#[track_caller]
+pub fn check_applied_kvs_cf<K: AsRef<[u8]>, V: AsRef<[u8]> + std::fmt::Debug>(
+    tikv: &TikvClient,
+    ctx: &Context,
+    cf: &str,
+    entries: impl Iterator<Item = (K, V, u64)>,
+) {
+    let mut get = RawBatchGetRequest::default();
+    get.set_cf(cf.to_owned());
+    get.set_context(ctx.clone());
+    let mut keymap = HashMap::default();
+    for (key, value, ts) in entries {
+        let the_key = Key::from_raw(key.as_ref())
+            .append_ts(ts.into())
+            .into_encoded();
+        keymap.insert(the_key.clone(), value);
+        get.mut_keys().push(the_key);
+    }
+    for pair in tikv.raw_batch_get(&get).unwrap().get_pairs() {
tikv.raw_batch_get(&get).unwrap().get_pairs() { + let entry = keymap.remove(pair.get_key()).expect("unexpected key"); + assert_eq!( + entry.as_ref(), + pair.get_value(), + "key is {:?}", + pair.get_key() + ); + } + assert!( + keymap.is_empty(), + "not all keys consumed, remained {:?}", + keymap + ); +} + pub fn check_ingested_txn_kvs( tikv: &TikvClient, ctx: &Context, @@ -273,3 +316,67 @@ pub fn check_sst_deleted(client: &ImportSstClient, meta: &SstMeta, data: &[u8]) } send_upload_sst(client, meta, data).unwrap(); } + +pub fn make_plain_file<I, K, V>(storage: &dyn ExternalStorage, name: &str, kvs: I) -> KvMeta +where + I: Iterator<Item = (K, V, u64)>, + K: AsRef<[u8]>, + V: AsRef<[u8]>, +{ + let mut buf = vec![]; + let mut file = Cursor::new(&mut buf); + let mut start_ts: Option<u64> = None; + for (key, value, ts) in kvs { + let the_key = Key::from_raw(key.as_ref()) + .append_ts(ts.into()) + .into_encoded(); + start_ts = Some(start_ts.map_or(ts, |ts0| ts0.min(ts))); + for segment in EventEncoder::encode_event(&the_key, value.as_ref()) { + file.write_all(segment.as_ref()).unwrap(); + } + } + file.flush().unwrap(); + let len = buf.len() as u64; + block_on_external_io(storage.write(name, UnpinReader(Box::new(AsyncCursor::new(buf))), len)) .unwrap(); + let mut meta = KvMeta::new(); + meta.set_start_ts(start_ts.unwrap_or_default()); + meta.set_length(len); + meta.set_restore_ts(u64::MAX); + meta.set_compression_type(kvproto::brpb::CompressionType::Unknown); + meta.set_name(name.to_owned()); + meta.set_cf(CF_DEFAULT.to_owned()); + meta +} + +pub fn rewrite_for(meta: &mut KvMeta, old_prefix: &[u8], new_prefix: &[u8]) -> RewriteRule { + assert_eq!(old_prefix.len(), new_prefix.len()); + fn rewrite(key: &mut Vec<u8>, old_prefix: &[u8], new_prefix: &[u8]) { + assert!(key.starts_with(old_prefix)); + let len = old_prefix.len(); + key.splice(..len, new_prefix.iter().cloned()); + } + rewrite(meta.mut_start_key(), old_prefix, new_prefix); + rewrite(meta.mut_end_key(), old_prefix, new_prefix); + let mut rule = RewriteRule::default(); + rule.set_old_key_prefix(old_prefix.to_vec()); + rule.set_new_key_prefix(new_prefix.to_vec()); + rule +} + +pub fn register_range_for(meta: &mut KvMeta, start: &[u8], end: &[u8]) { + let start = Key::from_raw(start); + let end = Key::from_raw(end); + meta.set_start_key(start.into_encoded()); + meta.set_end_key(end.into_encoded()); +} + +pub fn local_storage(tmp: &TempDir) -> StorageBackend { + let mut backend = StorageBackend::default(); + backend.set_local({ + let mut local = Local::default(); + local.set_path(tmp.path().to_str().unwrap().to_owned()); + local + }); + backend +} diff --git a/tests/integrations/raftstore/test_bootstrap.rs b/tests/integrations/raftstore/test_bootstrap.rs index b43a3d00d16..69fdb446b75 100644 --- a/tests/integrations/raftstore/test_bootstrap.rs +++ b/tests/integrations/raftstore/test_bootstrap.rs @@ -13,7 +13,10 @@ use engine_traits::{ use kvproto::{kvrpcpb::ApiVersion, metapb, raft_serverpb::RegionLocalState}; use raftstore::{ coprocessor::CoprocessorHost, - store::{bootstrap_store, fsm, fsm::store::StoreMeta, AutoSplitController, SnapManager}, + store::{ + bootstrap_store, fsm, fsm::store::StoreMeta, AutoSplitController, DiskCheckRunner, + SnapManager, + }, }; use raftstore_v2::router::PeerMsg; use resource_metering::CollectorRegHandle; @@ -121,6 +124,7 @@ fn test_node_bootstrap_with_prepared_data() { ConcurrencyManager::new(1.into()), CollectorRegHandle::new_for_test(), None, + DiskCheckRunner::dummy(), GrpcServiceManager::dummy(), Arc::new(AtomicU64::new(0)), ) @@ -216,7
+220,7 @@ fn test_flush_before_stop() { let region = cluster.get_region(b"k60"); cluster.must_split(&region, b"k070"); - fail::cfg("flush_before_cluse_threshold", "return(10)").unwrap(); + fail::cfg("flush_before_close_threshold", "return(10)").unwrap(); for i in 0..100 { let key = format!("k{:03}", i); @@ -252,6 +256,36 @@ fn test_flush_before_stop() { .unwrap(); } +// test flush_before_close will not flush forever +#[test] +fn test_flush_before_stop2() { + use test_raftstore_v2::*; + + let mut cluster = new_server_cluster(0, 3); + cluster.run(); + + fail::cfg("flush_before_close_threshold", "return(10)").unwrap(); + fail::cfg("on_flush_completed", "return").unwrap(); + + for i in 0..20 { + let key = format!("k{:03}", i); + cluster.must_put_cf(CF_WRITE, key.as_bytes(), b"val"); + cluster.must_put_cf(CF_LOCK, key.as_bytes(), b"val"); + } + + let router = cluster.get_router(1).unwrap(); + let raft_engine = cluster.get_raft_engine(1); + + let (tx, rx) = sync_channel(1); + let msg = PeerMsg::FlushBeforeClose { tx }; + router.force_send(1, msg).unwrap(); + + rx.recv().unwrap(); + + let admin_flush = raft_engine.get_flushed_index(1, CF_RAFT).unwrap().unwrap(); + assert!(admin_flush < 10); +} + // We cannot use a flushed index to call `maybe_advance_admin_flushed` // consider a case: // 1. lock `k` with index 6 @@ -301,7 +335,7 @@ fn test_flush_index_exceed_last_modified() { ) .unwrap(); - fail::cfg("flush_before_cluse_threshold", "return(1)").unwrap(); + fail::cfg("flush_before_close_threshold", "return(1)").unwrap(); let router = cluster.get_router(1).unwrap(); let (tx, rx) = sync_channel(1); let msg = PeerMsg::FlushBeforeClose { tx }; diff --git a/tests/integrations/raftstore/test_compact_after_delete.rs b/tests/integrations/raftstore/test_compact_after_delete.rs index 6ba405bb918..24034c83192 100644 --- a/tests/integrations/raftstore/test_compact_after_delete.rs +++ b/tests/integrations/raftstore/test_compact_after_delete.rs @@ -36,6 +36,7 @@ fn test_compact_after_delete(cluster: &mut Cluster) { cluster.cfg.raft_store.region_compact_check_interval = ReadableDuration::millis(100); cluster.cfg.raft_store.region_compact_min_tombstones = 500; cluster.cfg.raft_store.region_compact_tombstones_percent = 50; + cluster.cfg.raft_store.region_compact_redundant_rows_percent = Some(1); cluster.cfg.raft_store.region_compact_check_step = Some(1); cluster.cfg.rocksdb.titan.enabled = true; cluster.run(); @@ -97,8 +98,10 @@ fn test_node_compact_after_delete_v2() { cluster.cfg.raft_store.region_compact_tombstones_percent = 50; // disable it cluster.cfg.raft_store.region_compact_min_redundant_rows = 10000000; + cluster.cfg.raft_store.region_compact_redundant_rows_percent = Some(100); cluster.cfg.raft_store.region_compact_check_step = Some(2); - cluster.cfg.rocksdb.titan.enabled = true; + // TODO: v2 doesn't support titan. + // cluster.cfg.rocksdb.titan.enabled = true; cluster.run(); let region = cluster.get_region(b""); @@ -166,10 +169,11 @@ fn test_node_compact_after_update_v2() { cluster.cfg.raft_store.region_compact_check_interval = ReadableDuration::millis(100); // disable it cluster.cfg.raft_store.region_compact_min_tombstones = 1000000; - cluster.cfg.raft_store.region_compact_redundant_rows_percent = 40; + cluster.cfg.raft_store.region_compact_redundant_rows_percent = Some(40); cluster.cfg.raft_store.region_compact_min_redundant_rows = 50; cluster.cfg.raft_store.region_compact_check_step = Some(2); - cluster.cfg.rocksdb.titan.enabled = true; + // TODO: titan is not supported in v2.
+ // cluster.cfg.rocksdb.titan.enabled = true; cluster.run(); let region = cluster.get_region(b""); diff --git a/tests/integrations/raftstore/test_conf_change.rs b/tests/integrations/raftstore/test_conf_change.rs index 79b3488d868..08a2ff48d17 100644 --- a/tests/integrations/raftstore/test_conf_change.rs +++ b/tests/integrations/raftstore/test_conf_change.rs @@ -863,3 +863,40 @@ fn test_conf_change_fast() { must_get_equal(&cluster.get_engine(2), b"k1", b"v1"); assert!(timer.saturating_elapsed() < Duration::from_secs(5)); } + +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] +fn test_remove_node_on_partition() { + let count = 3; + let mut cluster = new_cluster(0, count); + let pd_client = Arc::clone(&cluster.pd_client); + // Disable default max peer number check. + pd_client.disable_default_operator(); + cluster.cfg.raft_store.raft_heartbeat_ticks = 1; + cluster.cfg.raft_store.raft_base_tick_interval = ReadableDuration::millis(10); + cluster.cfg.raft_store.raft_election_timeout_ticks = 3; + cluster.cfg.raft_store.raft_store_max_leader_lease = ReadableDuration::millis(20); + let r1 = cluster.run_conf_change(); + + cluster.must_put(b"k0", b"v0"); + pd_client.must_add_peer(r1, new_peer(2, 2)); + must_get_equal(&cluster.get_engine(2), b"k0", b"v0"); + pd_client.must_add_peer(r1, new_peer(3, 3)); + must_get_equal(&cluster.get_engine(3), b"k0", b"v0"); + + // peer 3 isolation + cluster.add_send_filter(IsolationFilterFactory::new(3)); + // sleep for 13 heartbeat interval (>12 should be ok) + let sleep_time = cluster.cfg.raft_store.raft_base_tick_interval.0 + * (4 * cluster.cfg.raft_store.raft_election_timeout_ticks as u32 + 1); + thread::sleep(sleep_time); + pd_client.remove_peer(r1, new_peer(2, 2)); + cluster.must_put(b"k1", b"v1"); + thread::sleep(Duration::from_millis(500)); + // remove peer 2 should not work + pd_client.must_have_peer(r1, new_peer(2, 2)); + + // remove peer 3 should work + pd_client.must_remove_peer(r1, new_peer(3, 3)); + cluster.must_put(b"k3", b"v3"); +} diff --git a/tests/integrations/raftstore/test_early_apply.rs b/tests/integrations/raftstore/test_early_apply.rs index b30a861e2fe..44537e8b409 100644 --- a/tests/integrations/raftstore/test_early_apply.rs +++ b/tests/integrations/raftstore/test_early_apply.rs @@ -109,7 +109,7 @@ fn test_early_apply(mode: DataLost) { let mut cluster = new_node_cluster(0, 3); cluster.pd_client.disable_default_operator(); // So compact log will not be triggered automatically. - configure_for_request_snapshot(&mut cluster); + configure_for_request_snapshot(&mut cluster.cfg); cluster.run(); if mode == DataLost::LeaderCommit || mode == DataLost::AllLost { cluster.must_transfer_leader(1, new_peer(1, 1)); @@ -175,7 +175,7 @@ fn test_update_internal_apply_index() { let mut cluster = new_node_cluster(0, 4); cluster.pd_client.disable_default_operator(); // So compact log will not be triggered automatically. 
- configure_for_request_snapshot(&mut cluster); + configure_for_request_snapshot(&mut cluster.cfg); cluster.run(); cluster.must_transfer_leader(1, new_peer(3, 3)); cluster.must_put(b"k1", b"v1"); diff --git a/tests/integrations/raftstore/test_joint_consensus.rs b/tests/integrations/raftstore/test_joint_consensus.rs index 282d0d0525c..55def7a099b 100644 --- a/tests/integrations/raftstore/test_joint_consensus.rs +++ b/tests/integrations/raftstore/test_joint_consensus.rs @@ -10,7 +10,7 @@ use pd_client::PdClient; use raft::eraftpb::ConfChangeType; use raftstore::Result; use test_raftstore::*; -use tikv_util::{mpsc::future, store::find_peer}; +use tikv_util::{future::block_on_timeout, store::find_peer}; /// Tests multiple confchange commands can be done by one request #[test] @@ -164,24 +164,18 @@ fn test_request_in_joint_state() { // Isolated peer 2, so the old configuration can't reach quorum cluster.add_send_filter(IsolationFilterFactory::new(2)); - let mut rx = cluster + let rx = cluster .async_request(put_request(&region, 1, b"k3", b"v3")) .unwrap(); - assert_eq!( - rx.recv_timeout(Duration::from_millis(100)), - Err(future::RecvTimeoutError::Timeout) - ); + block_on_timeout(rx, Duration::from_millis(100)).unwrap_err(); cluster.clear_send_filters(); // Isolated peer 3, so the new configuration can't reach quorum cluster.add_send_filter(IsolationFilterFactory::new(3)); - let mut rx = cluster + let rx = cluster .async_request(put_request(&region, 1, b"k4", b"v4")) .unwrap(); - assert_eq!( - rx.recv_timeout(Duration::from_millis(100)), - Err(future::RecvTimeoutError::Timeout) - ); + block_on_timeout(rx, Duration::from_millis(100)).unwrap_err(); cluster.clear_send_filters(); // Leave joint diff --git a/tests/integrations/raftstore/test_lease_read.rs b/tests/integrations/raftstore/test_lease_read.rs index 60c87fd4e00..abf17e01e9d 100644 --- a/tests/integrations/raftstore/test_lease_read.rs +++ b/tests/integrations/raftstore/test_lease_read.rs @@ -481,7 +481,7 @@ fn test_read_index_stale_in_suspect_lease() { configure_for_lease_read(&mut cluster.cfg, Some(50), Some(10_000)); let max_lease = Duration::from_secs(2); // Stop log compaction to transfer leader with filter easier.
- configure_for_request_snapshot(&mut cluster); + configure_for_request_snapshot(&mut cluster.cfg); cluster.cfg.raft_store.raft_store_max_leader_lease = ReadableDuration(max_lease); cluster.pd_client.disable_default_operator(); diff --git a/tests/integrations/raftstore/test_life.rs b/tests/integrations/raftstore/test_life.rs index e940ca30a7c..0d01799f534 100644 --- a/tests/integrations/raftstore/test_life.rs +++ b/tests/integrations/raftstore/test_life.rs @@ -5,13 +5,18 @@ use std::{ time::Duration, }; -use kvproto::raft_serverpb::{ExtraMessageType, PeerState, RaftMessage}; +use kvproto::{ + metapb::PeerRole::Learner, + raft_serverpb::{ExtraMessageType, PeerState, RaftMessage}, +}; +use raft::{eraftpb::ConfChangeType, prelude::MessageType}; use raftstore::errors::Result; use test_raftstore::{ - new_learner_peer, new_peer, sleep_ms, Filter, FilterFactory, Simulator as S1, + new_admin_request, new_change_peer_request, new_learner_peer, new_peer, Direction, Filter, + FilterFactory, RegionPacketFilter, Simulator as S1, }; use test_raftstore_v2::Simulator as S2; -use tikv_util::{time::Instant, HandyRwLock}; +use tikv_util::{config::ReadableDuration, time::Instant, HandyRwLock}; struct ForwardFactory { node_id: u64, @@ -64,6 +69,7 @@ fn test_gc_peer_tiflash_engine() { let mut cluster_v1 = test_raftstore::new_node_cluster(1, 2); let mut cluster_v2 = test_raftstore_v2::new_node_cluster(1, 2); cluster_v1.cfg.raft_store.enable_v2_compatible_learner = true; + cluster_v2.cfg.raft_store.gc_peer_check_interval = ReadableDuration::millis(500); cluster_v1.pd_client.disable_default_operator(); cluster_v2.pd_client.disable_default_operator(); let r11 = cluster_v1.run_conf_change(); @@ -124,26 +130,14 @@ fn test_gc_peer_tiflash_engine() { .must_remove_peer(r21, new_learner_peer(2, 10)); // Make sure leader cleans up removed_records. - let start = Instant::now(); - loop { - sleep_ms(500); - if cluster_v2 - .region_local_state(r21, 1) - .get_removed_records() - .is_empty() - { - break; - } - if start.saturating_elapsed() > Duration::from_secs(5) { - panic!("timeout"); - } - } + cluster_v2.must_empty_region_removed_records(r21); } #[test] fn test_gc_removed_peer() { let mut cluster = test_raftstore::new_node_cluster(1, 2); cluster.cfg.raft_store.enable_v2_compatible_learner = true; + cluster.cfg.raft_store.gc_peer_check_interval = ReadableDuration::millis(500); cluster.pd_client.disable_default_operator(); let region_id = cluster.run_conf_change(); @@ -218,3 +212,76 @@ fn test_gc_removed_peer() { Duration::from_millis(200) )); } + +#[test] +fn test_gc_peer_with_conf_change() { + let mut cluster = test_raftstore::new_node_cluster(0, 5); + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + + let region_id = cluster.run_conf_change(); + pd_client.must_add_peer(region_id, new_peer(2, 2)); + pd_client.must_add_peer(region_id, new_peer(3, 3)); + cluster.must_transfer_leader(region_id, new_peer(1, 1)); + cluster.must_put(b"k1", b"v1"); + let mut region_epoch = cluster.get_region_epoch(region_id); + + // Create a learner peer 4 on store 4. 
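+ // (A learner only replicates data and cannot vote, so adding it does not change the voting quorum.)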
+ let extra_store_id = 4; + let extra_peer_id = 4; + let cc = new_change_peer_request( + ConfChangeType::AddLearnerNode, + new_learner_peer(extra_store_id, extra_peer_id), + ); + let req = new_admin_request(region_id, &region_epoch, cc); + let res = cluster + .call_command_on_leader(req, Duration::from_secs(3)) + .unwrap(); + assert!(!res.get_header().has_error(), "{:?}", res); + region_epoch.conf_ver += 1; + cluster.wait_peer_state(region_id, 4, PeerState::Normal); + + // Isolate peer 4 from other region peers. + let left_filter = RegionPacketFilter::new(region_id, extra_store_id) + .direction(Direction::Recv) + .skip(MessageType::MsgHup); + cluster + .sim + .wl() + .add_recv_filter(extra_store_id, Box::new(left_filter)); + + // Change peer 4 to voter. + let cc = new_change_peer_request( + ConfChangeType::AddNode, + new_peer(extra_store_id, extra_peer_id), + ); + let req = new_admin_request(region_id, &region_epoch, cc); + let res = cluster + .call_command_on_leader(req, Duration::from_secs(3)) + .unwrap(); + assert!(!res.get_header().has_error(), "{:?}", res); + region_epoch.conf_ver += 1; + + // Remove peer 4 from region 1. + let cc = new_change_peer_request( + ConfChangeType::RemoveNode, + new_peer(extra_store_id, extra_peer_id), + ); + let req = new_admin_request(region_id, &region_epoch, cc); + let res = cluster + .call_command_on_leader(req, Duration::from_secs(3)) + .unwrap(); + assert!(!res.get_header().has_error(), "{:?}", res); + region_epoch.conf_ver += 1; + + // GC peer 4 with a Voter peer state; peer 4 is still a learner because it's isolated. + cluster.wait_peer_role(region_id, extra_store_id, extra_peer_id, Learner); + let mut gc_msg = RaftMessage::default(); + gc_msg.set_region_id(region_id); + gc_msg.set_from_peer(new_peer(1, 1)); + gc_msg.set_to_peer(new_peer(4, 4)); + gc_msg.set_region_epoch(region_epoch); + gc_msg.set_is_tombstone(true); + cluster.send_raft_msg(gc_msg).unwrap(); + cluster.wait_peer_state(region_id, 4, PeerState::Tombstone); +} diff --git a/tests/integrations/raftstore/test_merge.rs b/tests/integrations/raftstore/test_merge.rs index afc0c9afab4..8482feb8481 100644 --- a/tests/integrations/raftstore/test_merge.rs +++ b/tests/integrations/raftstore/test_merge.rs @@ -6,7 +6,7 @@ use api_version::{test_kv_format_impl, KvFormat}; use engine_traits::{CF_LOCK, CF_WRITE}; use kvproto::{ raft_cmdpb::CmdType, - raft_serverpb::{PeerState, RaftMessage, RegionLocalState}, + raft_serverpb::{ExtraMessageType, PeerState, RaftMessage, RegionLocalState}, }; use pd_client::PdClient; use raft::eraftpb::{ConfChangeType, MessageType}; @@ -14,7 +14,7 @@ use raftstore::store::{Callback, LocksStatus}; use test_raftstore::*; use test_raftstore_macro::test_case; use tikv::storage::{kv::SnapshotExt, Snapshot}; -use tikv_util::{config::*, HandyRwLock}; +use tikv_util::{config::*, future::block_on_timeout, HandyRwLock}; use txn_types::{Key, LastChange, PessimisticLock}; /// Test if merge is working as expected in a general condition. @@ -407,6 +407,122 @@ fn test_node_check_merged_message() { must_get_none(&engine3, b"v5"); } +/// Test if an uninitialized stale peer will be handled properly after merge.
+#[test_case(test_raftstore::new_node_cluster)] +// #[test_case(test_raftstore_v2::new_node_cluster)] +fn test_node_gc_uninitialized_peer_after_merge() { + let mut cluster = new_cluster(0, 4); + configure_for_merge(&mut cluster.cfg); + ignore_merge_target_integrity(&mut cluster.cfg, &cluster.pd_client); + cluster.cfg.raft_store.raft_election_timeout_ticks = 5; + cluster.cfg.raft_store.raft_store_max_leader_lease = ReadableDuration::millis(40); + cluster.cfg.raft_store.max_leader_missing_duration = ReadableDuration::millis(150); + cluster.cfg.raft_store.abnormal_leader_missing_duration = ReadableDuration::millis(100); + cluster.cfg.raft_store.peer_stale_state_check_interval = ReadableDuration::millis(100); + + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + + cluster.run_conf_change(); + + cluster.must_put(b"k1", b"v1"); + cluster.must_put(b"k3", b"v3"); + + // test if an uninitialized stale peer before conf removal is destroyed + // automatically + let region = pd_client.get_region(b"k1").unwrap(); + pd_client.must_add_peer(region.get_id(), new_peer(2, 2)); + pd_client.must_add_peer(region.get_id(), new_peer(3, 3)); + + cluster.must_split(&region, b"k2"); + let left = pd_client.get_region(b"k1").unwrap(); + let right = pd_client.get_region(b"k2").unwrap(); + + // Block snapshot messages, so that new peers will never be initialized. + cluster.add_send_filter(CloneFilterFactory( + RegionPacketFilter::new(left.get_id(), 4) + .msg_type(MessageType::MsgSnapshot) + .direction(Direction::Recv), + )); + // Add peer (4,4), remove peer (4,4) and then merge regions. + // Peer (4,4) will be an uninitialized stale peer. + pd_client.must_add_peer(left.get_id(), new_peer(4, 4)); + cluster.must_region_exist(left.get_id(), 4); + cluster.add_send_filter(IsolationFilterFactory::new(4)); + pd_client.must_remove_peer(left.get_id(), new_peer(4, 4)); + pd_client.must_merge(left.get_id(), right.get_id()); + cluster.clear_send_filters(); + + // Wait for the peer (4,4) to be destroyed. + sleep_ms( + 2 * cluster + .cfg + .raft_store + .max_leader_missing_duration + .as_millis(), + ); + cluster.must_region_not_exist(left.get_id(), 4); +} + +/// Test that leader missing should issue check stale peer requests. +#[test_case(test_raftstore::new_node_cluster)] +// #[test_case(test_raftstore_v2::new_node_cluster)] +fn test_node_gc_uninitialized_peer_after_merge_on_leader_missing() { + let mut cluster = new_cluster(0, 4); + configure_for_merge(&mut cluster.cfg); + ignore_merge_target_integrity(&mut cluster.cfg, &cluster.pd_client); + cluster.cfg.raft_store.raft_election_timeout_ticks = 5; + cluster.cfg.raft_store.raft_store_max_leader_lease = ReadableDuration::millis(40); + cluster.cfg.raft_store.peer_stale_state_check_interval = ReadableDuration::millis(100); + cluster.cfg.raft_store.abnormal_leader_missing_duration = ReadableDuration::millis(100); + // Set a large max_leader_missing_duration so that check stale peer will + // only be triggered by leader missing.
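+ // (The destroy check in this test is therefore driven by abnormal_leader_missing_duration; see the sleep below.)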
+ cluster.cfg.raft_store.max_leader_missing_duration = ReadableDuration::hours(1); + + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + + cluster.run_conf_change(); + + cluster.must_put(b"k1", b"v1"); + cluster.must_put(b"k3", b"v3"); + + // test if an uninitialized stale peer before conf removal is destroyed + // automatically + let region = pd_client.get_region(b"k1").unwrap(); + pd_client.must_add_peer(region.get_id(), new_peer(2, 2)); + pd_client.must_add_peer(region.get_id(), new_peer(3, 3)); + + cluster.must_split(&region, b"k2"); + let left = pd_client.get_region(b"k1").unwrap(); + let right = pd_client.get_region(b"k2").unwrap(); + + // Block snapshot messages, so that new peers will never be initialized. + cluster.add_send_filter(CloneFilterFactory( + RegionPacketFilter::new(left.get_id(), 4) + .msg_type(MessageType::MsgSnapshot) + .direction(Direction::Recv), + )); + // Add peer (4,4), remove peer (4,4) and then merge regions. + // Peer (4,4) will be an uninitialized stale peer. + pd_client.must_add_peer(left.get_id(), new_peer(4, 4)); + cluster.must_region_exist(left.get_id(), 4); + cluster.add_send_filter(IsolationFilterFactory::new(4)); + pd_client.must_remove_peer(left.get_id(), new_peer(4, 4)); + pd_client.must_merge(left.get_id(), right.get_id()); + cluster.clear_send_filters(); + + // Wait for the peer (4,4) to be destroyed. + sleep_ms( + 3 * cluster + .cfg + .raft_store + .abnormal_leader_missing_duration + .as_millis(), + ); + cluster.must_region_not_exist(left.get_id(), 4); +} + // Test if a merge is handled properly when there is an unfinished slow split before // merge. // No v2, it requires all peers to be available to check trim status. @@ -1444,10 +1560,10 @@ fn test_merge_pessimistic_locks_when_gap_is_too_large() { // The gap is too large, so the previous merge should fail. And this new put // request should be allowed. - let mut res = cluster.async_put(b"k1", b"new_val").unwrap(); + let res = cluster.async_put(b"k1", b"new_val").unwrap(); cluster.clear_send_filters(); - res.recv_timeout(Duration::from_secs(5)).unwrap(); + block_on_timeout(res, Duration::from_secs(5)).unwrap(); assert_eq!(cluster.must_get(b"k1").unwrap(), b"new_val"); } @@ -1731,3 +1847,243 @@ fn test_prepare_merge_with_5_nodes_snapshot() { // Now leader should replicate more logs and figure out a safe index.
pd_client.must_merge(left.get_id(), right.get_id()); } + +#[test_case(test_raftstore_v2::new_node_cluster)] +fn test_gc_source_removed_records_after_merge() { + let mut cluster = new_cluster(0, 3); + configure_for_merge(&mut cluster.cfg); + cluster.cfg.raft_store.gc_peer_check_interval = ReadableDuration::millis(500); + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + cluster.run(); + cluster.must_put(b"k1", b"v1"); + cluster.must_put(b"k3", b"v3"); + + let region = cluster.get_region(b"k1"); + cluster.must_split(&region, b"k2"); + let left = cluster.get_region(b"k1"); + let right = cluster.get_region(b"k3"); + + let left_peer_on_store1 = find_peer(&left, 1).unwrap().clone(); + cluster.must_transfer_leader(left.get_id(), left_peer_on_store1); + must_get_equal(&cluster.get_engine(3), b"k1", b"v1"); + let left_peer_on_store3 = find_peer(&left, 3).unwrap().clone(); + pd_client.must_remove_peer(left.get_id(), left_peer_on_store3); + must_get_none(&cluster.get_engine(3), b"k1"); + + let right_peer_on_store1 = find_peer(&right, 1).unwrap().clone(); + cluster.must_transfer_leader(right.get_id(), right_peer_on_store1); + let right_peer_on_store3 = find_peer(&right, 3).unwrap().clone(); + cluster.add_send_filter(IsolationFilterFactory::new(3)); + pd_client.must_remove_peer(right.get_id(), right_peer_on_store3.clone()); + + // So cluster becomes + // left region: 1(leader) 2 | + // right region: 1(leader) 2 | 3 (removed but not yet destroyed) + // | means isolation. + + // Merge right to left. + pd_client.must_merge(right.get_id(), left.get_id()); + let region_state = cluster.region_local_state(left.get_id(), 1); + assert!( + !region_state.get_merged_records()[0] + .get_source_removed_records() + .is_empty(), + "{:?}", + region_state + ); + assert!( + !region_state + .get_removed_records() + .iter() + .any(|p| p.get_id() == right_peer_on_store3.get_id()), + "{:?}", + region_state + ); + + // Clear filters and wait for gc peer ticks. + cluster.clear_send_filters(); + sleep_ms(3 * cluster.cfg.raft_store.gc_peer_check_interval.as_millis()); + + // Right region replica on store 3 must be removed. + cluster.must_region_not_exist(right.get_id(), 3); + + // Left region must clean up removed and merged records.
+ cluster.must_empty_region_merged_records(left.get_id()); + cluster.must_empty_region_removed_records(left.get_id()); +} + +#[test_case(test_raftstore_v2::new_node_cluster)] +fn test_gc_source_peers_forward_by_target_peer_after_merge() { + let mut cluster = new_cluster(0, 3); + configure_for_merge(&mut cluster.cfg); + cluster.cfg.raft_store.raft_log_gc_threshold = 40; + cluster.cfg.raft_store.raft_log_gc_count_limit = Some(40); + cluster.cfg.raft_store.merge_max_log_gap = 15; + cluster.cfg.raft_store.gc_peer_check_interval = ReadableDuration::millis(500); + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + cluster.run(); + + let region = cluster.get_region(b"k1"); + cluster.must_split(&region, b"k2"); + let left = cluster.get_region(b"k1"); + let right = cluster.get_region(b"k3"); + + let left_peer_on_store1 = find_peer(&left, 1).unwrap().clone(); + cluster.must_transfer_leader(left.get_id(), left_peer_on_store1); + let right_peer_on_store1 = find_peer(&right, 1).unwrap().clone(); + cluster.must_transfer_leader(right.get_id(), right_peer_on_store1); + cluster.must_put(b"k1", b"v1"); + cluster.must_put(b"k3", b"v3"); + must_get_equal(&cluster.get_engine(3), b"k1", b"v1"); + must_get_equal(&cluster.get_engine(3), b"k3", b"v3"); + // Use DropMessageFilter to drop messages to store 3 without reporting error. + cluster.add_recv_filter_on_node( + 3, + Box::new(DropMessageFilter::new(Arc::new(|m| { + // Do not drop MsgAvailabilityRequest and MsgAvailabilityResponse + // messages, otherwise merge is blocked. + matches!( + m.get_extra_msg().get_type(), + ExtraMessageType::MsgAvailabilityRequest + | ExtraMessageType::MsgAvailabilityResponse + ) + }))), + ); + + // So cluster becomes + // left region: 1(leader) 2 | 3 + // right region: 1(leader) 2 | 3 + // | means isolation. + + // Merge left to right and remove left peer on store 3. + pd_client.must_merge(left.get_id(), right.get_id()); + let right_peer_on_store3 = find_peer(&right, 3).unwrap().clone(); + pd_client.must_remove_peer(right.get_id(), right_peer_on_store3); + let region_state = cluster.region_local_state(right.get_id(), 1); + assert!( + !region_state.get_merged_records().is_empty(), + "{:?}", + region_state + ); + + // So cluster becomes + // left region: merged + // right region: 1(leader) 2 | 3 (removed but not yet destroyed) + // | means isolation. + + let state1 = cluster.truncated_state(right.get_id(), 1); + (0..50).for_each(|i| cluster.must_put(b"k2", format!("v{}", i).as_bytes())); + // Wait to trigger compact raft log + cluster.wait_log_truncated(right.get_id(), 1, state1.get_index() + 1); + + // Clear filters and wait for gc peer ticks. + cluster.clear_recv_filter_on_node(3); + sleep_ms(3 * cluster.cfg.raft_store.gc_peer_check_interval.as_millis()); + + // Left region replica on store 3 must be removed. + cluster.must_region_not_exist(left.get_id(), 3); + // Right region must clean up removed and merged records.
+ cluster.must_empty_region_merged_records(right.get_id()); + cluster.must_empty_region_removed_records(right.get_id()); +} + +#[test_case(test_raftstore_v2::new_node_cluster)] +fn test_gc_source_peers_forward_by_store_after_merge() { + let mut cluster = new_cluster(0, 3); + configure_for_merge(&mut cluster.cfg); + cluster.cfg.raft_store.gc_peer_check_interval = ReadableDuration::millis(500); + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + cluster.run(); + + let region = cluster.get_region(b"k1"); + cluster.must_split(&region, b"k2"); + let left = cluster.get_region(b"k1"); + let right = cluster.get_region(b"k3"); + + let left_peer_on_store1 = find_peer(&left, 1).unwrap().clone(); + cluster.must_transfer_leader(left.get_id(), left_peer_on_store1); + let right_peer_on_store1 = find_peer(&right, 1).unwrap().clone(); + cluster.must_transfer_leader(right.get_id(), right_peer_on_store1); + cluster.must_put(b"k1", b"v1"); + cluster.must_put(b"k3", b"v3"); + must_get_equal(&cluster.get_engine(3), b"k1", b"v1"); + must_get_equal(&cluster.get_engine(3), b"k3", b"v3"); + // Drop GcPeerResponse. + cluster.add_recv_filter_on_node( + 1, + Box::new(DropMessageFilter::new(Arc::new(|m| { + m.get_extra_msg().get_type() != ExtraMessageType::MsgGcPeerResponse + }))), + ); + + // So cluster becomes + // left region: 1(leader) 2 | 3 + // right region: 1(leader) 2 | 3 + // | means isolation. + + // Merge left to right and remove left peer on store 3. + pd_client.must_merge(left.get_id(), right.get_id()); + let right_peer_on_store3 = find_peer(&right, 3).unwrap().clone(); + pd_client.must_remove_peer(right.get_id(), right_peer_on_store3); + // Right region replica on store 3 must be removed. + cluster.must_region_not_exist(right.get_id(), 3); + let region_state = cluster.region_local_state(right.get_id(), 1); + assert!( + !region_state.get_merged_records().is_empty(), + "{:?}", + region_state + ); + assert!( + !region_state.get_removed_records().is_empty(), + "{:?}", + region_state + ); + + // So cluster becomes + // left region: merged + // right region: 1(leader) 2 | 3 (destroyed but not yet cleaned in removed + // records) + // | means isolation. + + // Clear filters and wait for gc peer ticks. + cluster.clear_recv_filter_on_node(1); + sleep_ms(3 * cluster.cfg.raft_store.gc_peer_check_interval.as_millis()); + + // Right region must clean up removed and merged records. + cluster.must_empty_region_merged_records(right.get_id()); + cluster.must_empty_region_removed_records(right.get_id()); +} + +#[test_case(test_raftstore_v2::new_node_cluster)] +fn test_gc_merged_record_in_time() { + let mut cluster = new_cluster(0, 3); + configure_for_merge(&mut cluster.cfg); + cluster.cfg.raft_store.gc_peer_check_interval = ReadableDuration::millis(100); + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + cluster.run(); + + let region = cluster.get_region(b"k1"); + cluster.must_split(&region, b"k2"); + let left = cluster.get_region(b"k1"); + let right = cluster.get_region(b"k3"); + + let left_peer_on_store1 = find_peer(&left, 1).unwrap().clone(); + cluster.must_transfer_leader(left.get_id(), left_peer_on_store1); + let right_peer_on_store1 = find_peer(&right, 1).unwrap().clone(); + cluster.must_transfer_leader(right.get_id(), right_peer_on_store1); + + // Wait enough time to trigger gc peer, and if there is nothing to gc, + // leader skips registering gc peer tick.
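+ // (If the merge below did not re-register the tick, the merged record would linger indefinitely.)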
+ sleep_ms(3 * cluster.cfg.raft_store.gc_peer_check_interval.as_millis()); + + // Merge left to right. + pd_client.must_merge(left.get_id(), right.get_id()); + + // Once the merge completes, the gc peer tick should be registered and the merged + // record will be cleaned up in time. + cluster.must_empty_region_merged_records(right.get_id()); +} diff --git a/tests/integrations/raftstore/test_replication_mode.rs b/tests/integrations/raftstore/test_replication_mode.rs index 367ac63aabb..76059fa8f87 100644 --- a/tests/integrations/raftstore/test_replication_mode.rs +++ b/tests/integrations/raftstore/test_replication_mode.rs @@ -1,6 +1,6 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. -use std::{sync::Arc, thread, time::Duration}; +use std::{iter::FromIterator, sync::Arc, thread, time::Duration}; use kvproto::replication_modepb::*; use pd_client::PdClient; @@ -34,6 +34,18 @@ fn run_cluster(cluster: &mut Cluster) { cluster.must_put(b"k1", b"v0"); } +fn prepare_labels(cluster: &mut Cluster) { + cluster.add_label(1, "dc", "dc1"); + cluster.add_label(2, "dc", "dc1"); + cluster.add_label(3, "dc", "dc2"); + cluster.add_label(1, "zone", "z1"); + cluster.add_label(2, "zone", "z2"); + cluster.add_label(3, "zone", "z3"); + cluster.add_label(1, "host", "h1"); + cluster.add_label(2, "host", "h2"); + cluster.add_label(3, "host", "h3"); +} + /// When using DrAutoSync replication mode, data should be replicated to /// different labels before committed. #[test] @@ -87,6 +99,67 @@ fn test_dr_auto_sync() { assert_eq!(state.state, RegionReplicationState::IntegrityOverLabel); } +// When in sync recover state and the region is in a joint state, the leave-joint +// request should be committed successfully. +#[test] +fn test_sync_recover_joint_state() { + let mut cluster = new_server_cluster(0, 5); + cluster.pd_client.disable_default_operator(); + cluster.pd_client.configure_dr_auto_sync("zone"); + cluster.cfg.raft_store.pd_store_heartbeat_tick_interval = ReadableDuration::millis(50); + cluster.cfg.raft_store.raft_log_gc_threshold = 1; + cluster.add_label(1, "zone", "ES"); + cluster.add_label(2, "zone", "ES"); + cluster.add_label(3, "zone", "ES"); + cluster.add_label(4, "zone", "WS"); // old dr + cluster.add_label(5, "zone", "WS"); // new dr + + let pd_client = Arc::clone(&cluster.pd_client); + let region_id = cluster.run_conf_change(); + let nodes = Vec::from_iter(cluster.get_node_ids()); + assert_eq!(nodes.len(), 5); + cluster.must_put(b"k1", b"v1"); + + cluster + .pd_client + .switch_replication_mode(Some(DrAutoSyncState::Async), vec![]); + + pd_client.must_add_peer(region_id, new_peer(2, 2)); + pd_client.must_add_peer(region_id, new_peer(3, 3)); + pd_client.must_add_peer(region_id, new_peer(4, 4)); + pd_client.must_add_peer(region_id, new_learner_peer(5, 5)); + + // Bring one node down + cluster.stop_node(4); + + // Switch to sync recover + cluster + .pd_client + .switch_replication_mode(Some(DrAutoSyncState::SyncRecover), vec![]); + + cluster.must_put(b"k2", b"v2"); + assert_eq!(cluster.must_get(b"k2").unwrap(), b"v2"); + + // Enter joint, now we have C_old(1, 2, 3, 4) and C_new(1, 2, 3, 5) + pd_client.must_joint_confchange( + region_id, + vec![ + (ConfChangeType::AddLearnerNode, new_learner_peer(4, 4)), + (ConfChangeType::AddNode, new_peer(5, 5)), + ], + ); + + let region = pd_client.get_region(b"k1").unwrap(); + cluster.must_split(&region, b"k2"); + let left = pd_client.get_region(b"k1").unwrap(); + let right = pd_client.get_region(b"k2").unwrap(); + assert_ne!(left.get_id(), right.get_id()); + + // Leave
joint + pd_client.must_leave_joint(left.get_id()); + pd_client.must_leave_joint(right.get_id()); +} + #[test] fn test_sync_recover_after_apply_snapshot() { let mut cluster = prepare_cluster(); @@ -119,7 +192,7 @@ fn test_sync_recover_after_apply_snapshot() { // switch to async cluster .pd_client - .switch_replication_mode(DrAutoSyncState::Async, vec![]); + .switch_replication_mode(Some(DrAutoSyncState::Async), vec![]); rx.recv_timeout(Duration::from_millis(100)).unwrap(); must_get_equal(&cluster.get_engine(1), b"k2", b"v2"); thread::sleep(Duration::from_millis(100)); @@ -136,7 +209,7 @@ fn test_sync_recover_after_apply_snapshot() { cluster .pd_client - .switch_replication_mode(DrAutoSyncState::SyncRecover, vec![]); + .switch_replication_mode(Some(DrAutoSyncState::SyncRecover), vec![]); thread::sleep(Duration::from_millis(100)); // Add node 3 back, snapshot will apply cluster.clear_send_filters(); @@ -265,7 +338,7 @@ fn test_switching_replication_mode() { cluster .pd_client - .switch_replication_mode(DrAutoSyncState::Async, vec![]); + .switch_replication_mode(Some(DrAutoSyncState::Async), vec![]); rx.recv_timeout(Duration::from_millis(100)).unwrap(); must_get_equal(&cluster.get_engine(1), b"k2", b"v2"); thread::sleep(Duration::from_millis(100)); @@ -275,7 +348,7 @@ fn test_switching_replication_mode() { cluster .pd_client - .switch_replication_mode(DrAutoSyncState::SyncRecover, vec![]); + .switch_replication_mode(Some(DrAutoSyncState::SyncRecover), vec![]); thread::sleep(Duration::from_millis(100)); let mut request = new_request( region.get_id(), @@ -331,7 +404,7 @@ fn test_replication_mode_allowlist() { run_cluster(&mut cluster); cluster .pd_client - .switch_replication_mode(DrAutoSyncState::Async, vec![1]); + .switch_replication_mode(Some(DrAutoSyncState::Async), vec![1]); thread::sleep(Duration::from_millis(100)); // 2,3 are paused, so they should not be able to write. @@ -357,7 +430,7 @@ fn test_replication_mode_allowlist() { // clear allowlist. cluster .pd_client - .switch_replication_mode(DrAutoSyncState::Async, vec![]); + .switch_replication_mode(Some(DrAutoSyncState::Async), vec![]); rx.recv_timeout(Duration::from_millis(100)).unwrap(); must_get_equal(&cluster.get_engine(1), b"k2", b"v2"); } @@ -456,6 +529,70 @@ fn test_migrate_replication_mode() { assert_eq!(state.state, RegionReplicationState::IntegrityOverLabel); } +#[test] +fn test_migrate_majority_to_drautosync() { + // 1. start cluster, enable dr-auto-sync and set labels.
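+ // (prepare_labels assigns dc/zone/host labels to the three stores; dr-auto-sync below replicates across the "dc" label.)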
+ let mut cluster = new_server_cluster(0, 3); + cluster.pd_client.disable_default_operator(); + cluster.cfg.raft_store.pd_store_heartbeat_tick_interval = ReadableDuration::millis(50); + cluster.cfg.raft_store.raft_log_gc_threshold = 10; + prepare_labels(&mut cluster); + cluster.run(); + cluster.must_transfer_leader(1, new_peer(1, 1)); + cluster.must_put(b"k1", b"v0"); + cluster.pd_client.configure_dr_auto_sync("dc"); + thread::sleep(Duration::from_millis(100)); + let region = cluster.get_region(b"k1"); + let mut request = new_request( + region.get_id(), + region.get_region_epoch().clone(), + vec![new_put_cf_cmd("default", b"k2", b"v2")], + false, + ); + request.mut_header().set_peer(new_peer(1, 1)); + let (cb, mut rx) = make_cb(&request); + cluster + .sim + .rl() + .async_command_on_node(1, request, cb) + .unwrap(); + assert_eq!(rx.recv_timeout(Duration::from_millis(100)).is_ok(), true); + must_get_equal(&cluster.get_engine(1), b"k2", b"v2"); + let state = cluster.pd_client.region_replication_status(region.get_id()); + assert_eq!(state.state_id, 1); + assert_eq!(state.state, RegionReplicationState::IntegrityOverLabel); + + // 2. switch to majority mode. + cluster.pd_client.switch_replication_mode(None, vec![]); + thread::sleep(Duration::from_millis(150)); + + // 3. split the region to make a new region; the new region's status must be + // SimpleMajority. + cluster.must_split(&region, b"m1"); + thread::sleep(Duration::from_millis(150)); + cluster.must_put(b"n4", b"v4"); + must_get_equal(&cluster.get_engine(1), b"n4", b"v4"); + let region_m = cluster.get_region(b"n4"); + let region_k = cluster.get_region(b"k1"); + + // 4. switch to dr-auto-sync mode, the new region generated in majority mode + // becomes IntegrityOverLabel again. + cluster + .pd_client + .switch_replication_mode(Some(DrAutoSyncState::SyncRecover), vec![]); + thread::sleep(Duration::from_millis(100)); + let state_m = cluster + .pd_client + .region_replication_status(region_m.get_id()); + let state_k = cluster + .pd_client + .region_replication_status(region_k.get_id()); + assert_eq!(state_m.state_id, 3); + assert_eq!(state_m.state, RegionReplicationState::IntegrityOverLabel); + assert_eq!(state_k.state_id, 3); + assert_eq!(state_k.state, RegionReplicationState::IntegrityOverLabel); +} + /// Tests if labels are loaded correctly after rolling start. #[test] fn test_loading_label_after_rolling_start() { diff --git a/tests/integrations/raftstore/test_snap.rs b/tests/integrations/raftstore/test_snap.rs index 9eda281e9e4..0b71978f63b 100644 --- a/tests/integrations/raftstore/test_snap.rs +++ b/tests/integrations/raftstore/test_snap.rs @@ -227,8 +227,6 @@ fn test_server_snap_gc() { #[test_case(test_raftstore::new_node_cluster)] #[test_case(test_raftstore::new_server_cluster)] -#[test_case(test_raftstore_v2::new_node_cluster)] -#[test_case(test_raftstore_v2::new_server_cluster)] fn test_concurrent_snap() { let mut cluster = new_cluster(0, 3); // Test that the handling of snapshots is correct when multiple // snapshots with overlapped region ranges arrive at the same // raftstore. @@ -279,6 +277,59 @@ fn test_concurrent_snap() { must_get_equal(&cluster.get_engine(3), b"k4", b"v4"); } +#[test_case(test_raftstore_v2::new_node_cluster)] +#[test_case(test_raftstore_v2::new_server_cluster)] +fn test_concurrent_snap_v2() { + let mut cluster = new_cluster(0, 3); + // Test that the handling of snapshots is correct when multiple + // snapshots with overlapped region ranges arrive at the same + // raftstore. + // TODO: v2 doesn't support titan.
+ // cluster.cfg.rocksdb.titan.enabled = true; + // Disable raft log gc in this test case. + cluster.cfg.raft_store.raft_log_gc_tick_interval = ReadableDuration::secs(60); + // For raftstore v2, after split, follower delays first messages (see + // is_first_message() for details), so leader does not send snapshot to + // follower and CollectSnapshotFilter holds parent region snapshot forever. + // We need to set a short wait duration so that leader can send snapshot + // in time and thus CollectSnapshotFilter can send parent region snapshot. + cluster.cfg.raft_store.snap_wait_split_duration = ReadableDuration::millis(100); + + let pd_client = Arc::clone(&cluster.pd_client); + // Disable default max peer count check. + pd_client.disable_default_operator(); + + let r1 = cluster.run_conf_change(); + cluster.must_put(b"k1", b"v1"); + pd_client.must_add_peer(r1, new_peer(2, 2)); + // Force peer 2 to be a follower all the way. + cluster.add_send_filter(CloneFilterFactory( + RegionPacketFilter::new(r1, 2) + .msg_type(MessageType::MsgRequestVote) + .direction(Direction::Send), + )); + cluster.must_transfer_leader(r1, new_peer(1, 1)); + cluster.must_put(b"k3", b"v3"); + // Pile up snapshots of overlapped region ranges and deliver them all at once. + let (tx, rx) = mpsc::channel(); + cluster.add_recv_filter_on_node(3, Box::new(CollectSnapshotFilter::new(tx))); + pd_client.must_add_peer(r1, new_peer(3, 3)); + let region = cluster.get_region(b"k1"); + // Ensure the snapshot of range ("", "") is sent and piled in filter. + if let Err(e) = rx.recv_timeout(Duration::from_secs(1)) { + panic!("the snapshot is not sent before split, e: {:?}", e); + } + // Split the region range and then there should be another snapshot for the + // split ranges. + cluster.must_split(&region, b"k2"); + must_get_equal(&cluster.get_engine(3), b"k3", b"v3"); + // Ensure the regions work after split.
+ cluster.must_put(b"k11", b"v11"); + must_get_equal(&cluster.get_engine(3), b"k11", b"v11"); + cluster.must_put(b"k4", b"v4"); + must_get_equal(&cluster.get_engine(3), b"k4", b"v4"); +} + #[test_case(test_raftstore::new_node_cluster)] #[test_case(test_raftstore::new_server_cluster)] #[test_case(test_raftstore_v2::new_node_cluster)] diff --git a/tests/integrations/raftstore/test_snap_recovery.rs b/tests/integrations/raftstore/test_snap_recovery.rs index 70f9ae8d97c..f3fcec4d8a9 100644 --- a/tests/integrations/raftstore/test_snap_recovery.rs +++ b/tests/integrations/raftstore/test_snap_recovery.rs @@ -2,11 +2,15 @@ use std::time::Duration; -use futures::StreamExt; +use futures::{executor::block_on, StreamExt}; use raft::eraftpb::MessageType; -use raftstore::store::{PeerMsg, SignificantMsg, SnapshotRecoveryWaitApplySyncer}; +use raftstore::store::{ + snapshot_backup::{SnapshotBrWaitApplyRequest, SyncReport}, + PeerMsg, SignificantMsg, SnapshotBrWaitApplySyncer, +}; use test_raftstore::*; -use tikv_util::HandyRwLock; +use tikv_util::{future::block_on_timeout, HandyRwLock}; +use tokio::sync::oneshot; #[test] fn test_check_pending_admin() { @@ -41,7 +45,7 @@ fn test_check_pending_admin() { let (tx, mut rx) = futures::channel::mpsc::unbounded(); router.broadcast_normal(|| { - PeerMsg::SignificantMsg(SignificantMsg::CheckPendingAdmin(tx.clone())) + PeerMsg::SignificantMsg(Box::new(SignificantMsg::CheckPendingAdmin(tx.clone()))) }); futures::executor::block_on(async { let r = rx.next().await; @@ -57,7 +61,7 @@ fn test_check_pending_admin() { let (tx, mut rx) = futures::channel::mpsc::unbounded(); router.broadcast_normal(|| { - PeerMsg::SignificantMsg(SignificantMsg::CheckPendingAdmin(tx.clone())) + PeerMsg::SignificantMsg(Box::new(SignificantMsg::CheckPendingAdmin(tx.clone()))) }); futures::executor::block_on(async { let r = rx.next().await; @@ -94,17 +98,17 @@ fn test_snap_wait_apply() { let router = cluster.sim.wl().get_router(1).unwrap(); - let (tx, rx) = std::sync::mpsc::sync_channel(1); - + let (tx, rx) = oneshot::channel(); + let syncer = SnapshotBrWaitApplySyncer::new(1, tx); router.broadcast_normal(|| { - PeerMsg::SignificantMsg(SignificantMsg::SnapshotRecoveryWaitApply( - SnapshotRecoveryWaitApplySyncer::new(1, tx.clone()), - )) + PeerMsg::SignificantMsg(Box::new(SignificantMsg::SnapshotBrWaitApply( + SnapshotBrWaitApplyRequest::relaxed(syncer.clone()), + ))) }); // we expect a recv timeout because the leader peer on store 1 cannot finish the // apply, so the wait apply will time out. - rx.recv_timeout(Duration::from_secs(1)).unwrap_err(); + block_on_timeout(rx, Duration::from_secs(1)).unwrap_err(); // clear the filter so the wait apply can finish. cluster.clear_send_filters(); @@ -112,13 +116,21 @@ fn test_snap_wait_apply() { // after clearing the filter, the leader peer on store 1 can finish the wait // apply. - let (tx, rx) = std::sync::mpsc::sync_channel(1); + let (tx, rx) = oneshot::channel(); + let syncer = SnapshotBrWaitApplySyncer::new(1, tx); router.broadcast_normal(|| { - PeerMsg::SignificantMsg(SignificantMsg::SnapshotRecoveryWaitApply( - SnapshotRecoveryWaitApplySyncer::new(1, tx.clone()), - )) + PeerMsg::SignificantMsg(Box::new(SignificantMsg::SnapshotBrWaitApply( + SnapshotBrWaitApplyRequest::relaxed(syncer.clone()), + ))) }); + drop(syncer); // we expect to recv the report from rx.
- assert_eq!(rx.recv(), Ok(1)); + assert_eq!( + block_on(rx), + Ok(SyncReport { + report_id: 1, + aborted: None + }) + ); } diff --git a/tests/integrations/raftstore/test_split_region.rs b/tests/integrations/raftstore/test_split_region.rs index c0f75487998..8e957190f7b 100644 --- a/tests/integrations/raftstore/test_split_region.rs +++ b/tests/integrations/raftstore/test_split_region.rs @@ -609,7 +609,7 @@ fn test_node_split_region_after_reboot_with_config_change() { sleep_ms(200); assert_eq!(pd_client.get_split_count(), 0); - // change the config to make the region splitable + // change the config to make the region splittable cluster.cfg.coprocessor.region_max_size = Some(ReadableSize(region_max_size / 3)); cluster.cfg.coprocessor.region_split_size = Some(ReadableSize(region_split_size / 3)); cluster.cfg.coprocessor.region_bucket_size = ReadableSize(region_split_size / 3); @@ -976,14 +976,13 @@ fn test_refresh_region_bucket_keys() { cluster.run(); let pd_client = Arc::clone(&cluster.pd_client); + // case: init bucket info cluster.must_put(b"k11", b"v1"); let mut region = pd_client.get_region(b"k11").unwrap(); - let bucket = Bucket { keys: vec![b"k11".to_vec()], size: 1024 * 1024 * 200, }; - let mut expected_buckets = metapb::Buckets::default(); expected_buckets.set_keys(bucket.clone().keys.into()); expected_buckets @@ -997,6 +996,8 @@ Option::None, Some(expected_buckets.clone()), ); + + // case: bucket range should refresh if epoch changed let conf_ver = region.get_region_epoch().get_conf_ver() + 1; region.mut_region_epoch().set_conf_ver(conf_ver); @@ -1018,6 +1019,7 @@ ); assert_eq!(bucket_version2, bucket_version + 1); + // case: stale epoch will not refresh buckets info let conf_ver = 0; region.mut_region_epoch().set_conf_ver(conf_ver); let bucket_version3 = cluster.refresh_region_bucket_keys( &region, @@ -1028,6 +1030,7 @@ ); assert_eq!(bucket_version3, bucket_version2); + // case: bucket split // now the buckets are ["", "k12", ""].
further split ["", k12], [k12, ""] // buckets into more buckets let region = pd_client.get_region(b"k11").unwrap(); @@ -1066,6 +1069,7 @@ ); assert_eq!(bucket_version4, bucket_version3 + 1); + // case: merge buckets // remove k11~k12, k12~k121, k122~[] bucket let buckets = vec![ Bucket { @@ -1107,7 +1111,7 @@ assert_eq!(bucket_version5, bucket_version4 + 1); - // split the region + // case: split the region pd_client.must_split_region(region, pdpb::CheckPolicy::Usekey, vec![b"k11".to_vec()]); let mut buckets = vec![Bucket { keys: vec![b"k10".to_vec()], @@ -1132,7 +1136,7 @@ cluster.refresh_region_bucket_keys(&region, buckets, None, Some(expected_buckets.clone())); assert_eq!(bucket_version6, bucket_version5 + 1); - // merge the region + // case: merge the region pd_client.must_merge(left_id, right.get_id()); let region = pd_client.get_region(b"k10").unwrap(); let buckets = vec![Bucket { @@ -1145,6 +1149,7 @@ cluster.refresh_region_bucket_keys(&region, buckets, None, Some(expected_buckets.clone())); assert_eq!(bucket_version7, bucket_version6 + 1); + // case: nothing changed let bucket_version8 = cluster.refresh_region_bucket_keys( &region, vec![], @@ -1157,26 +1162,24 @@ #[test] fn test_gen_split_check_bucket_ranges() { - let count = 5; - let mut cluster = new_server_cluster(0, count); - cluster.cfg.coprocessor.region_bucket_size = ReadableSize(5); + let mut cluster = new_server_cluster(0, 1); + let region_bucket_size = ReadableSize::kb(1); + cluster.cfg.coprocessor.region_bucket_size = region_bucket_size; cluster.cfg.coprocessor.enable_region_bucket = Some(true); // disable report buckets, as it will reset the user traffic stats to randomize // the test result - cluster.cfg.raft_store.check_leader_lease_interval = ReadableDuration::secs(5); - // Make merge check resume quickly. - cluster.cfg.raft_store.merge_check_tick_interval = ReadableDuration::millis(100); cluster.run(); let pd_client = Arc::clone(&cluster.pd_client); - cluster.must_put(b"k11", b"v1"); - let region = pd_client.get_region(b"k11").unwrap(); + let mut range = 1..; + let mid_key = put_till_size(&mut cluster, region_bucket_size.0, &mut range); + let second_key = put_till_size(&mut cluster, region_bucket_size.0, &mut range); + let region = pd_client.get_region(&second_key).unwrap(); let bucket = Bucket { - keys: vec![b"k11".to_vec()], - size: 1024 * 1024 * 200, + keys: vec![mid_key.clone()], + size: region_bucket_size.0 * 2, }; - let mut expected_buckets = metapb::Buckets::default(); expected_buckets.set_keys(bucket.clone().keys.into()); expected_buckets @@ -1184,7 +1187,6 @@ .insert(0, region.get_start_key().to_vec()); expected_buckets.keys.push(region.get_end_key().to_vec()); let buckets = vec![bucket]; - // initialize fsm.peer.bucket_regions cluster.refresh_region_bucket_keys( &region, @@ -1192,32 +1194,28 @@ Option::None, Some(expected_buckets.clone()), ); - cluster.must_put(b"k10", b"v1"); - cluster.must_put(b"k12", b"v1"); - let expected_bucket_ranges = vec![ - BucketRange(vec![], b"k11".to_vec()), - BucketRange(b"k11".to_vec(), vec![]), - ]; + // put some data into the right buckets, so the bucket range will be checked by + // split check.
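+ // (Only the bucket that received new writes should appear in the expected ranges below.)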
+ let latest_key = put_till_size(&mut cluster, region_bucket_size.0 + 100, &mut range); + let expected_bucket_ranges = vec![BucketRange(mid_key.clone(), vec![])]; cluster.send_half_split_region_message(&region, Some(expected_bucket_ranges)); - // set fsm.peer.last_bucket_regions + // reset bucket stats. cluster.refresh_region_bucket_keys( &region, buckets, Option::None, Some(expected_buckets.clone()), ); - // because the diff between last_bucket_regions and bucket_regions is zero, - // bucket range for split check should be empty. - let expected_bucket_ranges = vec![]; - cluster.send_half_split_region_message(&region, Some(expected_bucket_ranges)); - // split the region - pd_client.must_split_region(region, pdpb::CheckPolicy::Usekey, vec![b"k11".to_vec()]); + thread::sleep(Duration::from_millis(100)); + cluster.send_half_split_region_message(&region, Some(vec![])); - let left = pd_client.get_region(b"k10").unwrap(); - let right = pd_client.get_region(b"k12").unwrap(); + // split the region + pd_client.must_split_region(region, pdpb::CheckPolicy::Usekey, vec![second_key]); + let left = pd_client.get_region(&mid_key).unwrap(); + let right = pd_client.get_region(&latest_key).unwrap(); if right.get_id() == 1 { // the bucket_ranges should be None to refresh the bucket cluster.send_half_split_region_message(&right, None); @@ -1225,11 +1223,10 @@ // the bucket_ranges should be None to refresh the bucket cluster.send_half_split_region_message(&left, None); } - + thread::sleep(Duration::from_millis(300)); // merge the region pd_client.must_merge(left.get_id(), right.get_id()); - let region = pd_client.get_region(b"k10").unwrap(); - // the bucket_ranges should be None to refresh the bucket + let region = pd_client.get_region(&mid_key).unwrap(); cluster.send_half_split_region_message(&region, None); } diff --git a/tests/integrations/raftstore/test_stale_peer.rs b/tests/integrations/raftstore/test_stale_peer.rs index e12584d6c60..f76373756f9 100644 --- a/tests/integrations/raftstore/test_stale_peer.rs +++ b/tests/integrations/raftstore/test_stale_peer.rs @@ -6,8 +6,10 @@ use std::{sync::Arc, thread, time::*}; use engine_traits::{Peekable, CF_RAFT}; use kvproto::raft_serverpb::{PeerState, RegionLocalState}; +use pd_client::PdClient; use raft::eraftpb::MessageType; use test_raftstore::*; +use test_raftstore_macro::test_case; use tikv_util::{config::ReadableDuration, HandyRwLock}; /// A helper function for testing the behaviour of the gc of stale peer @@ -310,3 +312,48 @@ fn test_stale_learner_with_read_index() { let state: RegionLocalState = engine3.get_msg_cf(CF_RAFT, &state_key).unwrap().unwrap(); assert_eq!(state.get_state(), PeerState::Tombstone); } + +/// Test if an uninitialized stale peer will be removed after restart.
+#[test_case(test_raftstore::new_node_cluster)] +// #[test_case(test_raftstore_v2::new_node_cluster)] +fn test_node_restart_gc_uninitialized_peer_after_merge() { + let mut cluster = new_cluster(0, 4); + configure_for_merge(&mut cluster.cfg); + ignore_merge_target_integrity(&mut cluster.cfg, &cluster.pd_client); + cluster.cfg.raft_store.raft_election_timeout_ticks = 5; + cluster.cfg.raft_store.raft_store_max_leader_lease = ReadableDuration::millis(40); + cluster.cfg.raft_store.max_leader_missing_duration = ReadableDuration::millis(150); + cluster.cfg.raft_store.abnormal_leader_missing_duration = ReadableDuration::millis(100); + cluster.cfg.raft_store.peer_stale_state_check_interval = ReadableDuration::millis(100); + + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + + cluster.run_conf_change(); + + cluster.must_put(b"k1", b"v1"); + + // test if an uninitialized stale peer before conf removal is destroyed + // automatically + let region = pd_client.get_region(b"k1").unwrap(); + pd_client.must_add_peer(region.get_id(), new_peer(2, 2)); + pd_client.must_add_peer(region.get_id(), new_peer(3, 3)); + + // Block snapshot messages, so that new peers will never be initialized. + cluster.add_send_filter(CloneFilterFactory( + RegionPacketFilter::new(region.get_id(), 4) + .msg_type(MessageType::MsgSnapshot) + .direction(Direction::Recv), + )); + // Add peer (4,4), remove peer (4,4) and then merge regions. + // Peer (4,4) will be an uninitialized stale peer. + pd_client.must_add_peer(region.get_id(), new_peer(4, 4)); + cluster.must_region_exist(region.get_id(), 4); + cluster.add_send_filter(IsolationFilterFactory::new(4)); + pd_client.must_remove_peer(region.get_id(), new_peer(4, 4)); + + // An uninitialized stale peer is removed automatically after restart.
+ cluster.stop_node(4); + cluster.run_node(4).unwrap(); + cluster.must_region_not_exist(region.get_id(), 4); +} diff --git a/tests/integrations/raftstore/test_stale_read.rs b/tests/integrations/raftstore/test_stale_read.rs index 24e13003f7e..5de9bda1f64 100644 --- a/tests/integrations/raftstore/test_stale_read.rs +++ b/tests/integrations/raftstore/test_stale_read.rs @@ -8,7 +8,7 @@ use kvproto::{ metapb::{Peer, Region}, tikvpb_grpc::TikvClient, }; -use test_raftstore::{must_get_equal, new_mutation, new_peer, PeerClient}; +use test_raftstore::{must_get_equal, new_mutation, new_peer}; use test_raftstore_macro::test_case; use tikv_util::{config::ReadableDuration, time::Instant}; diff --git a/tests/integrations/raftstore/test_stats.rs b/tests/integrations/raftstore/test_stats.rs index 67e5e261dab..60f10936f2d 100644 --- a/tests/integrations/raftstore/test_stats.rs +++ b/tests/integrations/raftstore/test_stats.rs @@ -12,6 +12,7 @@ use futures::{executor::block_on, SinkExt, StreamExt}; use grpcio::*; use kvproto::{kvrpcpb::*, pdpb::QueryKind, tikvpb::*, tikvpb_grpc::TikvClient}; use pd_client::PdClient; +use test_coprocessor::{DagSelect, ProductTable}; use test_raftstore::*; use tikv_util::{config::*, store::QueryStats}; use txn_types::Key; @@ -262,19 +263,10 @@ fn test_raw_query_stats_tmpl() { req.set_raw_get(get_req); req }); - batch_commands(&ctx, &client, get_command, &start_key); - assert!(check_split_key( - cluster, - F::encode_raw_key_owned(start_key.clone(), None).into_encoded(), - None - )); - if check_query_num_read( - cluster, - store_id, - region_id, - QueryKind::Get, - (i + 1) * 1000, - ) { + if i == 0 { + batch_commands(&ctx, &client, get_command, &start_key); + } + if check_query_num_read(cluster, store_id, region_id, QueryKind::Get, 1000) { flag = true; break; } @@ -284,14 +276,16 @@ fn test_raw_query_stats_tmpl() { fail::cfg("mock_hotspot_threshold", "return(0)").unwrap(); fail::cfg("mock_tick_interval", "return(0)").unwrap(); fail::cfg("mock_collect_tick_interval", "return(0)").unwrap(); - test_query_num::<F>(raw_get, true); - test_query_num::<F>(raw_batch_get, true); - test_query_num::<F>(raw_scan, true); - test_query_num::<F>(raw_batch_scan, true); + test_query_num::<F>(raw_get, true, true); + test_query_num::<F>(raw_batch_get, true, true); + test_query_num::<F>(raw_scan, true, true); + test_query_num::<F>(raw_batch_scan, true, true); if F::IS_TTL_ENABLED { - test_query_num::<F>(raw_get_key_ttl, true); + test_query_num::<F>(raw_get_key_ttl, true, true); } - test_query_num::<F>(raw_batch_get_command, true); + // Requests may fail with `EpochNotMatch` after a split when auto split is + // enabled, so disable it.
+ test_query_num::<F>(raw_batch_get_command, true, false); test_raw_delete_query::<F>(); fail::remove("mock_tick_interval"); fail::remove("mock_hotspot_threshold"); @@ -385,19 +379,34 @@ fn test_txn_query_stats_tmpl() { req.set_get(get_req); req }); - batch_commands(&ctx, &client, get_command, &start_key); - assert!(check_split_key( - cluster, - Key::from_raw(&start_key).as_encoded().to_vec(), - None - )); - if check_query_num_read( - cluster, - store_id, - region_id, - QueryKind::Get, - (i + 1) * 1000, - ) { + if i == 0 { + batch_commands(&ctx, &client, get_command, &start_key); + } + if check_query_num_read(cluster, store_id, region_id, QueryKind::Get, 1000) { + flag = true; + break; + } + } + assert!(flag); + }); + let batch_coprocessor: Box = + Box::new(|ctx, cluster, client, store_id, region_id, start_key| { + let mut flag = false; + for i in 0..3 { + let coprocessor: Box = Box::new(|ctx, _start_key| { + let mut req = BatchCommandsRequestRequest::new(); + let table = ProductTable::new(); + let mut cop_req = DagSelect::from(&table).build(); + cop_req.set_context(ctx.clone()); + req.set_coprocessor(cop_req); + req + }); + if i == 0 { + batch_commands(&ctx, &client, coprocessor, &start_key); + } + // No data can be read here, so the expected count is 0 (this may need a fix); + // the check mainly verifies that the request source is reported as expected. + if check_query_num_read(cluster, store_id, region_id, QueryKind::Coprocessor, 0) { + flag = true; + break; + } @@ -407,17 +416,22 @@ fn test_txn_query_stats_tmpl() { fail::cfg("mock_hotspot_threshold", "return(0)").unwrap(); fail::cfg("mock_tick_interval", "return(0)").unwrap(); fail::cfg("mock_collect_tick_interval", "return(0)").unwrap(); - test_query_num::<F>(get, false); - test_query_num::<F>(batch_get, false); - test_query_num::<F>(scan, false); - test_query_num::<F>(scan_lock, false); - test_query_num::<F>(batch_get_command, false); + fail::cfg("only_check_source_task_name", "return(test_stats)").unwrap(); + test_query_num::<F>(get, false, true); + test_query_num::<F>(batch_get, false, true); + test_query_num::<F>(scan, false, true); + test_query_num::<F>(scan_lock, false, true); + // Requests may fail with `EpochNotMatch` after a split when auto split is + // enabled, so disable it.
+ test_query_num::<F>(batch_get_command, false, false); + test_query_num::<F>(batch_coprocessor, false, false); test_txn_delete_query::<F>(); test_pessimistic_lock(); test_rollback(); fail::remove("mock_tick_interval"); fail::remove("mock_hotspot_threshold"); fail::remove("mock_collect_tick_interval"); + fail::remove("only_check_source_task_name"); } fn raw_put( @@ -501,10 +515,11 @@ fn put( } fn test_pessimistic_lock() { - let (cluster, client, ctx) = must_new_and_configure_cluster_and_kv_client(|cluster| { + let (cluster, client, mut ctx) = must_new_and_configure_cluster_and_kv_client(|cluster| { cluster.cfg.raft_store.pd_store_heartbeat_tick_interval = ReadableDuration::millis(50); }); + ctx.set_request_source("test_stats".to_owned()); let key = b"key2".to_vec(); let store_id = 1; put(&cluster, &client, &ctx, store_id, key.clone()); @@ -541,9 +556,10 @@ } pub fn test_rollback() { - let (cluster, client, ctx) = must_new_and_configure_cluster_and_kv_client(|cluster| { + let (cluster, client, mut ctx) = must_new_and_configure_cluster_and_kv_client(|cluster| { cluster.cfg.raft_store.pd_store_heartbeat_tick_interval = ReadableDuration::millis(50); }); + ctx.set_request_source("test_stats".to_owned()); let key = b"key2".to_vec(); let store_id = 1; put(&cluster, &client, &ctx, store_id, key.clone()); @@ -572,17 +588,23 @@ )); } -fn test_query_num<F: KvFormat>(query: Box, is_raw_kv: bool) { +fn test_query_num<F: KvFormat>(query: Box, is_raw_kv: bool, auto_split: bool) { let (mut cluster, client, mut ctx) = must_new_and_configure_cluster_and_kv_client(|cluster| { cluster.cfg.raft_store.pd_store_heartbeat_tick_interval = ReadableDuration::millis(50); - cluster.cfg.split.qps_threshold = 0; + if auto_split { + cluster.cfg.split.qps_threshold = Some(0); + } else { + cluster.cfg.split.qps_threshold = Some(1000000); + } cluster.cfg.split.split_balance_score = 2.0; cluster.cfg.split.split_contained_score = 2.0; cluster.cfg.split.detect_times = 1; cluster.cfg.split.sample_threshold = 0; cluster.cfg.storage.set_api_version(F::TAG); + cluster.cfg.server.enable_request_batch = false; }); ctx.set_api_version(F::CLIENT_TAG); + ctx.set_request_source("test_stats".to_owned()); let mut k = b"key".to_vec(); // When a peer becomes leader, it can't read before committing to current term.
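Note on the new `auto_split` flag: with `qps_threshold` set to `Some(0)` the load-based splitter fires almost immediately, the split bumps the region epoch, and any request still carrying the old epoch is rejected with `EpochNotMatch`; that is why the batch-command variants above pass `false`. For context, a minimal sketch of the retry shape a caller needs when the epoch can move underneath it; `refresh_ctx` is a hypothetical helper standing in for a fresh PD region lookup and is not part of this patch:

use kvproto::{
    kvrpcpb::{Context, GetRequest, GetResponse},
    tikvpb_grpc::TikvClient,
};

// Retry a get until the region epoch carried in `ctx` matches the server's
// view; a split or merge between attempts surfaces as `EpochNotMatch`.
fn get_with_epoch_retry(
    client: &TikvClient,
    mut ctx: Context,
    key: &[u8],
    refresh_ctx: impl Fn() -> Context,
) -> GetResponse {
    loop {
        let mut req = GetRequest::default();
        req.set_context(ctx.clone());
        req.set_key(key.to_vec());
        let resp = client.kv_get(&req).unwrap();
        if resp.get_region_error().has_epoch_not_match() {
            // The epoch moved under us; rebuild the context and try again.
            ctx = refresh_ctx();
            continue;
        }
        return resp;
    }
}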
@@ -610,6 +632,7 @@ fn test_raw_delete_query() { cluster.cfg.storage.set_api_version(F::TAG); }); ctx.set_api_version(F::CLIENT_TAG); + ctx.set_request_source("test_stats".to_owned()); raw_put::<F>(&cluster, &client, &ctx, store_id, k.clone()); // Raw Delete @@ -635,10 +658,10 @@ fn test_txn_delete_query() { let store_id = 1; { - let (cluster, client, ctx) = must_new_and_configure_cluster_and_kv_client(|cluster| { cluster.cfg.raft_store.pd_store_heartbeat_tick_interval = ReadableDuration::millis(50); }); - + let (cluster, client, mut ctx) = must_new_and_configure_cluster_and_kv_client(|cluster| { cluster.cfg.raft_store.pd_store_heartbeat_tick_interval = ReadableDuration::millis(50); }); + ctx.set_request_source("test_stats".to_owned()); put(&cluster, &client, &ctx, store_id, k.clone()); // DeleteRange let mut delete_req = DeleteRangeRequest::default(); @@ -762,4 +785,13 @@ fn batch_commands( } }); rx.recv_timeout(Duration::from_secs(10)).unwrap(); + sleep_ms(100); + // trigger metrics flush + for _ in 0..10 { + let mut req = ScanRequest::default(); + req.set_context(ctx.to_owned()); + req.start_key = start_key.to_owned(); + req.end_key = vec![]; + client.kv_scan(&req).unwrap(); + } } diff --git a/tests/integrations/raftstore/test_status_command.rs b/tests/integrations/raftstore/test_status_command.rs index 8565d936d9f..22caef23dc7 100644 --- a/tests/integrations/raftstore/test_status_command.rs +++ b/tests/integrations/raftstore/test_status_command.rs @@ -4,7 +4,7 @@ use raftstore::store::{msg::StoreMsg as StoreMsgV1, util::LatencyInspector}; use raftstore_v2::router::StoreMsg as StoreMsgV2; use test_raftstore::Simulator as S1; use test_raftstore_v2::Simulator as S2; -use tikv_util::{time::Instant, HandyRwLock}; +use tikv_util::{config::ReadableDuration, time::Instant, HandyRwLock, InspectFactor}; #[test] fn test_region_detail() { @@ -32,6 +32,7 @@ fn test_latency_inspect() { let mut cluster_v1 = test_raftstore::new_node_cluster(0, 1); cluster_v1.cfg.raft_store.store_io_pool_size = 2; + cluster_v1.cfg.raft_store.inspect_kvdb_interval = ReadableDuration::millis(500); cluster_v1.run(); let mut cluster_v2 = test_raftstore_v2::new_node_cluster(0, 1); cluster_v2.run(); @@ -42,19 +43,24 @@ { // Test send LatencyInspect to V1. let (tx, rx) = std::sync::mpsc::sync_channel(10); - let inspector = LatencyInspector::new( - 1, - Box::new(move |_, duration| { - let dur = duration.sum(); - tx.send(dur).unwrap(); - }), - ); - let msg = StoreMsgV1::LatencyInspect { - send_time: Instant::now(), - inspector, - }; - router_v1.send_control(msg).unwrap(); - rx.recv_timeout(std::time::Duration::from_secs(2)).unwrap(); + // Inspect different factors. + for factor in [InspectFactor::RaftDisk, InspectFactor::KvDisk].iter() { + let cloned_tx = tx.clone(); + let inspector = LatencyInspector::new( + 1, + Box::new(move |_, duration| { + let dur = duration.sum(); + cloned_tx.send(dur).unwrap(); + }), + ); + let msg = StoreMsgV1::LatencyInspect { + factor: *factor, + send_time: Instant::now(), + inspector, + }; + router_v1.send_control(msg).unwrap(); + rx.recv_timeout(std::time::Duration::from_secs(2)).unwrap(); + } } { // Test send LatencyInspect to V2.
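The latency-inspection loop above fans a single bounded channel out across `InspectFactor::RaftDisk` and `InspectFactor::KvDisk`, pairing every `send_control` with a `recv_timeout`, so a lost callback for either factor fails the test instead of hanging it. The same shape reduced to std only; `collect_replies` is an illustrative helper, not part of the patch:

use std::{sync::mpsc, time::Duration};

// Run one job per item, hand each job its own clone of the sender, and
// demand exactly one reply per job; recv_timeout turns a dropped callback
// into a visible failure rather than a deadlock.
fn collect_replies<I, T: Send + 'static>(
    items: &[I],
    run: impl Fn(&I, mpsc::SyncSender<T>),
) -> Vec<T> {
    let (tx, rx) = mpsc::sync_channel(10);
    items
        .iter()
        .map(|item| {
            run(item, tx.clone());
            rx.recv_timeout(Duration::from_secs(2)).unwrap()
        })
        .collect()
}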
@@ -82,17 +88,22 @@ fn test_sync_latency_inspect() { cluster.run(); let router = cluster.sim.wl().get_router(1).unwrap(); let (tx, rx) = std::sync::mpsc::sync_channel(10); - let inspector = LatencyInspector::new( - 1, - Box::new(move |_, duration| { - let dur = duration.sum(); - tx.send(dur).unwrap(); - }), - ); - let msg = StoreMsgV1::LatencyInspect { - send_time: Instant::now(), - inspector, - }; - router.send_control(msg).unwrap(); - rx.recv_timeout(std::time::Duration::from_secs(2)).unwrap(); + // Inspect different factors. + for factor in [InspectFactor::RaftDisk, InspectFactor::KvDisk].iter() { + let cloned_tx = tx.clone(); + let inspector = LatencyInspector::new( + 1, + Box::new(move |_, duration| { + let dur = duration.sum(); + cloned_tx.send(dur).unwrap(); + }), + ); + let msg = StoreMsgV1::LatencyInspect { + factor: *factor, + send_time: Instant::now(), + inspector, + }; + router.send_control(msg).unwrap(); + rx.recv_timeout(std::time::Duration::from_secs(2)).unwrap(); + } } diff --git a/tests/integrations/resource_metering/test_cpu.rs b/tests/integrations/resource_metering/test_cpu.rs index c15bf445ed3..12d6fa4fbe0 100644 --- a/tests/integrations/resource_metering/test_cpu.rs +++ b/tests/integrations/resource_metering/test_cpu.rs @@ -12,6 +12,7 @@ use std::{ use concurrency_manager::ConcurrencyManager; use futures::{executor::block_on, StreamExt}; use kvproto::kvrpcpb::Context; +use resource_control::ResourceGroupManager; use test_coprocessor::{DagSelect, Insert, ProductTable, Store}; use tidb_query_datatype::codec::Datum; use tikv::{ @@ -95,7 +96,10 @@ pub fn test_reschedule_coprocessor() { let mut req = DagSelect::from(&table).build(); let mut ctx = Context::default(); ctx.set_resource_group_tag(tag.as_bytes().to_vec()); + ctx.set_request_source("test".to_owned()); req.set_context(ctx); + fail::cfg("only_check_source_task_name", "return(test)").unwrap(); + defer!(fail::remove("only_check_source_task_name")); assert!( !block_on(endpoint.parse_and_handle_unary_request(req, None)) .consume() @@ -229,7 +233,7 @@ fn setup_test_suite() -> (TestSuite, Store, Endpoint) cm, test_suite.get_tag_factory(), Arc::new(QuotaLimiter::default()), - None, + Some(Arc::new(ResourceGroupManager::default())), ); (test_suite, store, endpoint) } diff --git a/tests/integrations/server/gc_worker.rs b/tests/integrations/server/gc_worker.rs index cfadde84405..238102df6b6 100644 --- a/tests/integrations/server/gc_worker.rs +++ b/tests/integrations/server/gc_worker.rs @@ -7,15 +7,17 @@ use grpcio::{ChannelBuilder, Environment}; use keys::data_key; use kvproto::{kvrpcpb::*, tikvpb::TikvClient}; use test_raftstore::*; +use test_raftstore_macro::test_case; use tikv::server::gc_worker::sync_gc; use tikv_util::HandyRwLock; use txn_types::Key; // Since v5.0 GC bypasses Raft, which means GC scans/deletes records with // `keys::DATA_PREFIX`. This case ensures it's performed correctly. 
-#[test] +#[test_case(test_raftstore::must_new_cluster_mul)] +#[test_case(test_raftstore_v2::must_new_cluster_mul)] fn test_gc_bypass_raft() { - let (cluster, leader, ctx) = must_new_cluster_mul(2); + let (cluster, leader, ctx) = new_cluster(2); cluster.pd_client.disable_default_operator(); let env = Arc::new(Environment::new(1)); @@ -25,7 +27,7 @@ fn test_gc_bypass_raft() { let pk = b"k1".to_vec(); let value = vec![b'x'; 300]; - let engine = cluster.engines.get(&leader_store).unwrap(); + let engine = cluster.get_engine(leader_store); for &start_ts in &[10, 20, 30, 40] { let commit_ts = start_ts + 5; @@ -37,11 +39,11 @@ fn test_gc_bypass_raft() { let key = Key::from_raw(b"k1").append_ts(start_ts.into()); let key = data_key(key.as_encoded()); - assert!(engine.kv.get_value(&key).unwrap().is_some()); + assert!(engine.get_value(&key).unwrap().is_some()); let key = Key::from_raw(b"k1").append_ts(commit_ts.into()); let key = data_key(key.as_encoded()); - assert!(engine.kv.get_value_cf(CF_WRITE, &key).unwrap().is_some()); + assert!(engine.get_value_cf(CF_WRITE, &key).unwrap().is_some()); } let node_ids = cluster.get_node_ids(); @@ -53,16 +55,16 @@ fn test_gc_bypass_raft() { region.set_end_key(b"k2".to_vec()); sync_gc(&gc_sched, region, 200.into()).unwrap(); - let engine = cluster.engines.get(&store_id).unwrap(); + let engine = cluster.get_engine(store_id); for &start_ts in &[10, 20, 30] { let commit_ts = start_ts + 5; let key = Key::from_raw(b"k1").append_ts(start_ts.into()); let key = data_key(key.as_encoded()); - assert!(engine.kv.get_value(&key).unwrap().is_none()); + assert!(engine.get_value(&key).unwrap().is_none()); let key = Key::from_raw(b"k1").append_ts(commit_ts.into()); let key = data_key(key.as_encoded()); - assert!(engine.kv.get_value_cf(CF_WRITE, &key).unwrap().is_none()); + assert!(engine.get_value_cf(CF_WRITE, &key).unwrap().is_none()); } } } diff --git a/tests/integrations/server/kv_service.rs b/tests/integrations/server/kv_service.rs index 24b6a87bfa5..164fafdf964 100644 --- a/tests/integrations/server/kv_service.rs +++ b/tests/integrations/server/kv_service.rs @@ -29,7 +29,7 @@ use pd_client::PdClient; use raft::eraftpb; use raftstore::{ coprocessor::CoprocessorHost, - store::{fsm::store::StoreMeta, AutoSplitController, SnapManager}, + store::{fsm::store::StoreMeta, AutoSplitController, DiskCheckRunner, SnapManager}, }; use resource_metering::CollectorRegHandle; use service::service_manager::GrpcServiceManager; @@ -1410,6 +1410,7 @@ fn test_double_run_node() { ConcurrencyManager::new(1.into()), CollectorRegHandle::new_for_test(), None, + DiskCheckRunner::dummy(), GrpcServiceManager::dummy(), Arc::new(AtomicU64::new(0)), ) diff --git a/tests/integrations/server/raft_client.rs b/tests/integrations/server/raft_client.rs index aad9ab7ceb1..2b51bb1f21b 100644 --- a/tests/integrations/server/raft_client.rs +++ b/tests/integrations/server/raft_client.rs @@ -21,8 +21,8 @@ use kvproto::{ use raft::eraftpb::Entry; use raftstore::errors::DiscardReason; use tikv::server::{ - self, load_statistics::ThreadLoadPool, raftkv::RaftRouterWrap, resolve, resolve::Callback, - Config, ConnectionBuilder, RaftClient, StoreAddrResolver, TestRaftStoreRouter, + load_statistics::ThreadLoadPool, raftkv::RaftRouterWrap, resolve, Config, ConnectionBuilder, + RaftClient, StoreAddrResolver, TestRaftStoreRouter, }; use tikv_kv::{FakeExtension, RaftExtension}; use tikv_util::{ @@ -32,24 +32,6 @@ use tikv_util::{ use super::*; -#[derive(Clone)] -pub struct StaticResolver { - port: u16, -} - -impl StaticResolver { 
- fn new(port: u16) -> StaticResolver { - StaticResolver { port } - } -} - -impl StoreAddrResolver for StaticResolver { - fn resolve(&self, _store_id: u64, cb: Callback) -> server::Result<()> { - cb(Ok(format!("localhost:{}", self.port))); - Ok(()) - } -} - fn get_raft_client(router: R, resolver: T) -> RaftClient where R: RaftExtension + Unpin + 'static, @@ -75,8 +57,16 @@ where RaftClient::new(0, builder) } -fn get_raft_client_by_port(port: u16) -> RaftClient { - get_raft_client(FakeExtension, StaticResolver::new(port)) +fn get_raft_client_by_port(port: u16) -> RaftClient { + get_raft_client( + FakeExtension, + resolve::MockStoreAddrResolver { + resolve_fn: Arc::new(move |_, cb| { + cb(Ok(format!("localhost:{}", port))); + Ok(()) + }), + }, + ) } #[derive(Clone)] @@ -177,7 +167,15 @@ fn test_raft_client_reconnect() { let (significant_msg_sender, _significant_msg_receiver) = mpsc::channel(); let router = TestRaftStoreRouter::new(tx, significant_msg_sender); let wrap = RaftRouterWrap::new(router); - let mut raft_client = get_raft_client(wrap, StaticResolver::new(port)); + let mut raft_client = get_raft_client( + wrap, + resolve::MockStoreAddrResolver { + resolve_fn: Arc::new(move |_, cb| { + cb(Ok(format!("localhost:{}", port))); + Ok(()) + }), + }, + ); (0..50).for_each(|_| raft_client.send(RaftMessage::default()).unwrap()); raft_client.flush(); diff --git a/tests/integrations/storage/test_titan.rs b/tests/integrations/storage/test_titan.rs index 9c3eeec0c83..752c6aaee1a 100644 --- a/tests/integrations/storage/test_titan.rs +++ b/tests/integrations/storage/test_titan.rs @@ -159,7 +159,9 @@ fn test_delete_files_in_range_for_titan() { cfg.rocksdb.defaultcf.titan.min_gc_batch_size = ReadableSize(0); cfg.rocksdb.defaultcf.titan.discardable_ratio = 0.4; cfg.rocksdb.defaultcf.titan.min_blob_size = ReadableSize(0); - let resource = cfg.rocksdb.build_resources(Default::default()); + let resource = cfg + .rocksdb + .build_resources(Default::default(), cfg.storage.engine); let kv_db_opts = cfg.rocksdb.build_opt(&resource, cfg.storage.engine); let kv_cfs_opts = cfg.rocksdb.build_cf_opts( &cfg.rocksdb.build_cf_resources(cache), @@ -374,6 +376,7 @@ fn test_delete_files_in_range_for_titan() { b"{", u64::MAX, &limiter, + None, ) .unwrap(); let mut cf_file_write = CfFile::new( @@ -390,6 +393,7 @@ fn test_delete_files_in_range_for_titan() { b"{", u64::MAX, &limiter, + None, ) .unwrap();
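A note on the raft_client change: replacing the hand-rolled `StaticResolver` with `resolve::MockStoreAddrResolver`'s closure field also makes failure injection cheap. A sketch under the assumption that `resolve_fn` keeps the signature used above; `flaky_resolver` and its error text are illustrative, not part of the patch:

use std::sync::Arc;

use tikv::server::resolve;
use tikv_util::box_err;

// Hypothetical helper: resolve every store to localhost except one, which
// fails, so reconnect paths can be exercised without a bespoke resolver type.
fn flaky_resolver(port: u16, bad_store: u64) -> resolve::MockStoreAddrResolver {
    resolve::MockStoreAddrResolver {
        resolve_fn: Arc::new(move |store_id, cb| {
            if store_id == bad_store {
                cb(Err(box_err!("mock: store {} unresolvable", store_id)));
            } else {
                cb(Ok(format!("localhost:{}", port)));
            }
            Ok(())
        }),
    }
}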